In [3]:
from siml.sk_utils import *
from siml.signal_analysis_utils import *
import numpy as np
import math
import pylab as pl
import scipy.signal as signal
import pywt
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split    

## 加载数据文件

In [4]:
typeDescription = {
    0: 'normal',
    1: 'pothole',
    
}


def readFile(filename):
    return np.loadtxt(filename)

In [5]:
folderPath = 'dataset1/'
dataFile = ['dataX.txt', 'dataY.txt', 'dataZ.txt']
labelFile = ['dataLabel.txt']

signals = []
for file in dataFile:
    dataPath = folderPath+file
    signals.append(np.loadtxt(dataPath))
signals = np.transpose(np.array(signals), (1, 2, 0))
print('数据集：', signals[:,:,0].shape)

labelFilePath = folderPath+labelFile[0]
dataLabel = np.loadtxt(labelFilePath)
anomalyType = list(dataLabel[:, 0])
print(anomalyType)
dic = {}
temp = Counter(anomalyType)
for key in temp.keys():
    dic[typeDescription[key]] = temp[key]
print(dic)

数据集： (1248, 50)
[1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

## 时域特征提取

In [21]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    # print(counter_values)
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    # print(probabilities)
    entropy = scipy.stats.entropy(probabilities)
    entropy_list = []
    i = 0
    while i < 9:
        i = i + 1
        entropy_list.append(entropy)
        
    return entropy_list

def calculate_fourierCoefficient(x):
    N = len(x)
    real = [0]*10
    imag = [0]*10
    result = [0]*10
    n =0
    i = 0
    k = 0
    while k < 10:
        while n < N:
            real[k] = real[k] + x[n] * np.cos(2*np.pi*(k+40)*n/N) 
            imag[k] = imag[k] - x[n] * np.sin(2*np.pi*(k+40)*n/N) 
            n = n + 1
        n = 0
        k = k + 1
    while i < 10:
        result[i] = np.sqrt(real[i]*real[i]+imag[i]*imag[i])
        i = i + 1
    return real

def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    min = np.min(list_values)
    max = np.max(list_values)
    return [n5, n25, n75, n95, median, mean, std, var, rms, min, max]


def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(
        np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings+no_mean_crossings]


def get_features(list_values):
    entropy_list = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    fourierCoefficient = calculate_fourierCoefficient(list_values)
    return statistics + fourierCoefficient


def extract_features(dataset):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, 3):
            signal = dataset[signal_no, :, signal_comp]
            features += get_features(signal)
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [22]:
features = extract_features(signals)

print(features.shape)

(1248, 63)


In [23]:
labels = np.array(anomalyType)
print(labels)

[1. 1. 0. ... 1. 1. 1.]


In [24]:
StandardScaler().fit_transform(features)
Normalizer().fit_transform(features)

array([[-0.04168651, -0.0106496 ,  0.00762513, ...,  0.20147906,
         0.03975483,  0.03045766],
       [-0.04671516, -0.01143882,  0.00878131, ..., -0.07826772,
         0.05390477,  0.00931587],
       [-0.03249816, -0.01057153,  0.0114071 , ..., -0.04529363,
        -0.06726094, -0.02124565],
       ...,
       [-0.02737561, -0.00513955,  0.00744782, ..., -0.18308674,
         0.11055013,  0.03821252],
       [-0.02480815, -0.00921395,  0.0100039 , ...,  0.19214241,
         0.15870465,  0.06861833],
       [-0.02332248, -0.01483977,  0.01337723, ..., -0.24813904,
         0.01066694, -0.01021894]])

## 信号分类

In [25]:
def randomize(features, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_features = features[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_features, shuffled_labels

iteration = 100
warnings.filterwarnings('ignore')

### Logistic Regression

In [11]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Logistic Regression")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Logistic Regression
Accuracy on training set is : 0.8687285223367688
Accuracy on test set is : 0.8554133333333334
              precision    recall  f1-score   support

         0.0      0.862     0.900     0.881     22286
         1.0      0.844     0.790     0.816     15214

    accuracy                          0.855     37500
   macro avg      0.853     0.845     0.848     37500
weighted avg      0.855     0.855     0.855     37500



In [43]:
# SVM Classifier using cross validation

def svm_cross_validation(train_x, train_y):

    from sklearn.model_selection import GridSearchCV

    from sklearn.svm import SVC

    model = SVC(kernel='rbf', probability=True)

    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}

    grid_search = GridSearchCV(model, param_grid, n_jobs = 8, verbose=1)

    grid_search.fit(train_x, train_y)

    best_parameters = grid_search.best_estimator_.get_params()

    for para, val in list(best_parameters.items()):

        print(para, val)

    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)

    model.fit(train_x, train_y)

    return model


### Support Vector Machine

In [29]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=5)
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Support Vector Machine")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Support Vector Machine
Accuracy on training set is : 0.88139747995418
Accuracy on test set is : 0.7393866666666665
              precision    recall  f1-score   support

         0.0      0.732     0.881     0.800     22160
         1.0      0.757     0.535     0.627     15340

    accuracy                          0.739     37500
   macro avg      0.745     0.708     0.713     37500
weighted avg      0.742     0.739     0.729     37500



### Random Forest

In [18]:

yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of RandomForest")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of RandomForest
Accuracy on training set is : 1.0
Accuracy on test set is : 0.81104
              precision    recall  f1-score   support

         0.0      0.814     0.884     0.848     22328
         1.0      0.805     0.703     0.751     15172

    accuracy                          0.811     37500
   macro avg      0.810     0.794     0.799     37500
weighted avg      0.811     0.811     0.809     37500



## 跨数据集

### LR

In [13]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Logistic Reg")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Logistic Reg
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.868     0.868     0.868     10831
         1.0      0.810     0.811     0.810      7534

    accuracy                          0.844     18365
   macro avg      0.839     0.839     0.839     18365
weighted avg      0.844     0.844     0.844     18365

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.823     0.949     0.882     11340
         1.0      0.905     0.703     0.791      7795

    accuracy                          0.849     19135
   macro avg      0.864     0.826     0.837     19135
weighted avg      0.856     0.849     0.845     19135



### SVM

In [42]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore=0
testingScore_poor, testingScore_bad = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC(C=5)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

   
   

print("Results of Support Vector Machine")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Support Vector Machine
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.838     0.884     0.860     10956
         1.0      0.816     0.751     0.782      7520

    accuracy                          0.830     18476
   macro avg      0.827     0.818     0.821     18476
weighted avg      0.829     0.830     0.829     18476

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.786     0.922     0.848     11305
         1.0      0.847     0.631     0.724      7719

    accuracy                          0.804     19024
   macro avg      0.816     0.777     0.786     19024
weighted avg      0.811     0.804     0.798     19024



### RF

In [15]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf =RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of RandomForest")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of RandomForest
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.853     0.842     0.848     10693
         1.0      0.781     0.795     0.788      7556

    accuracy                          0.823     18249
   macro avg      0.817     0.819     0.818     18249
weighted avg      0.823     0.823     0.823     18249

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.809     0.930     0.866     11388
         1.0      0.871     0.682     0.765      7863

    accuracy                          0.829     19251
   macro avg      0.840     0.806     0.815     19251
weighted avg      0.835     0.829     0.825     19251

