In [56]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pywt
import scipy
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

## Dataset

In [57]:
def read_signal(filename):
    """Parse a whitespace-separated numeric text file into a list of rows.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of floats found on that line (an empty list for a blank line).

    Raises:
        ValueError: If a token on any line cannot be converted to float.
    """
    with open(filename, 'r') as handle:
        rows = handle.read().splitlines()
    # str.split() without arguments already ignores runs of whitespace;
    # strip() is kept to mirror the explicit trimming of each line.
    return [[float(token) for token in row.strip().split()] for row in rows]

def read_label(filename):
    """Read a text file holding one integer label per line.

    Args:
        filename: Path of the label file.

    Returns:
        A list of ints, in file order.

    Raises:
        ValueError: If a line is not a valid integer (blank lines included).
    """
    with open(filename, 'r') as handle:
        return [int(entry) for entry in handle.read().splitlines()]

def read_data():
    """Load the train/test signal files and their label files.

    Every non-hidden file in 'train_data/' and 'test_data/' (taken in sorted
    name order) is parsed with read_signal. The per-file arrays are stacked
    and transposed so that the file axis comes last, i.e. the result is
    indexed as [row, column, file]. Labels are read from 'y_train.txt' and
    'y_test.txt' with read_label.

    Returns:
        (train_signals, train_labels, test_signals, test_labels) where the
        signals are numpy arrays and the labels are lists of ints.
    """
    def load_folder(folder):
        # Stack as (file, row, column), then move the file axis to the end.
        stacked = np.array([
            read_signal(folder + name)
            for name in sorted(os.listdir(folder))
            if not name.startswith('.')    # skip hidden files
        ])
        return np.transpose(stacked, (1, 2, 0))

    train_signals = load_folder('train_data/')
    test_signals = load_folder('test_data/')

    train_labels = read_label('y_train.txt')
    test_labels = read_label('y_test.txt')

    # Sizes noted by the original author:
    # (7352*128*n), (7352*1); (2947*128*n), (2947*1)
    return train_signals, train_labels, test_signals, test_labels

## Features

In [58]:
def features(signal):
    """Summarise a 1-D signal as nine scalar statistics, ignoring NaN samples.

    All nine features are NaN-tolerant: the derivative feature is averaged
    with nanmean, and NaN samples are removed before crossings are counted
    (a NaN compares as "not above" any threshold and would otherwise show up
    as two extra crossings). For input without NaN the result is the same as
    computing every statistic on the full signal.

    Args:
        signal: 1-D array-like of numbers.

    Returns:
        A list [mean, std, max, min, index of max, index of min,
        mean of np.gradient(signal), number of zero crossings,
        number of mean crossings]. The two indices refer to positions in the
        original signal (NaN samples included).

    Raises:
        ValueError: If the signal is entirely NaN (from nanargmax/nanargmin)
            or too short for np.gradient.
    """
    values = np.asarray(signal)

    mean = np.nanmean(values)
    std = np.nanstd(values)
    maxval = np.nanmax(values)
    minval = np.nanmin(values)
    maxidx = np.nanargmax(values)
    minidx = np.nanargmin(values)

    # nanmean rather than mean: gradient entries next to a NaN sample are NaN
    # and must not turn the whole feature into NaN.
    m_deri = np.nanmean(np.gradient(values))

    # Count threshold crossings on the valid samples only.
    valid = values[~np.isnan(values)]
    no_zero_crossings = len(np.nonzero(np.diff(valid > 0))[0])
    no_mean_crossings = len(np.nonzero(np.diff(valid > mean))[0])

    return [mean, std, maxval, minval, maxidx, minidx, m_deri,
            no_zero_crossings, no_mean_crossings]

## Data processing

In [59]:
def extract_feature(dataset, waveletname):
    """Build a feature matrix from windowed multi-channel signals via DWT.

    For every window (first axis) and every channel/file (third axis), the
    segment along the second axis is decomposed with pywt.wavedec using the
    given wavelet, and features() is applied to each returned coefficient
    array. All values for one window are concatenated into one row.

    Args:
        dataset: 3-D numpy array indexed as [window, sample, channel].
        waveletname: Wavelet name passed to pywt.wavedec (e.g. 'db4').

    Returns:
        numpy array with one row per window.
    """
    rows = []
    for window in dataset:                      # one window at a time
        row = []
        for channel in range(window.shape[1]):  # one source file/channel at a time
            segment = window[:, channel]        # raw samples of this window
            # Baseline: use every coefficient array returned by wavedec.
            # (Slicing the result with [0:2] would keep only the deepest level.)
            for coeffs in pywt.wavedec(segment, waveletname):
                row.extend(features(coeffs))
        rows.append(row)
    return np.array(rows)

In [60]:
# Keep the raw windows under their own names: extract_feature needs the 3-D
# raw arrays, so overwriting them with the 2-D feature matrix would make this
# cell fail when it is run a second time.
train_raw, train_label, test_raw, test_label = read_data()

# Column vector, so the labels can be appended as the last column of the
# feature matrix when exporting.
train_label = np.reshape(np.array(train_label), (-1, 1))
print(train_raw.shape[0], train_raw.shape[1], train_raw.shape[2])
print(test_raw.shape[0], test_raw.shape[1], test_raw.shape[2])

WAVELET = 'db4'  # alternative that was tried: 'coif5'
train_data = extract_feature(train_raw, WAVELET)
test_data = extract_feature(test_raw, WAVELET)
print(train_data.shape[0], train_data.shape[1])
print(test_data.shape[0], test_data.shape[1])

7352 128 9
2947 128 9
7352 1
7352 162
2947 162


In [46]:
# Export the training features with the label as the last column
# (comma-separated) so the same data can be evaluated in Weka.
np.savetxt(
    fname='data.txt',
    X=np.concatenate((train_data, train_label), axis=1),
    delimiter=',',
)

## Machine learning

### SVM

In [6]:
# scikit-learn classifiers expect y with shape (n_samples,). train_label is a
# column vector (needed for the Weka export), so flatten the labels here to
# avoid the DataConversionWarning raised by fit().
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

model = svm.SVC()
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 0.8665669205658324
Accuracy on test set: 0.7838479809976246


### Gradient Boosting

In [7]:
# scikit-learn classifiers expect y with shape (n_samples,). train_label is a
# column vector (needed for the Weka export), so flatten the labels here to
# avoid the DataConversionWarning raised by fit().
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

# Fixed seed so the reported accuracies are reproducible across runs.
model = GradientBoostingClassifier(n_estimators=1000, random_state=42)
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 1.0
Accuracy on test set: 0.832371903630811


### Random Forest

In [8]:
# scikit-learn classifiers expect y with shape (n_samples,). train_label is a
# column vector (needed for the Weka export), so flatten the labels here to
# avoid the DataConversionWarning raised by fit().
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

# Fixed seed so the bootstrap samples and feature subsets, and therefore the
# reported accuracies, are reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 1.0
Accuracy on test set: 0.8215134034611469


### Naive Bayes

In [9]:
# scikit-learn classifiers expect y with shape (n_samples,). train_label is a
# column vector (needed for the Weka export), so flatten the labels here to
# avoid the DataConversionWarning raised by fit().
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

model = GaussianNB()
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 0.8071273122959739
Accuracy on test set: 0.7387173396674585


### Logistic Regression

In [10]:
# scikit-learn classifiers expect y with shape (n_samples,). train_label is a
# column vector (needed for the Weka export), so flatten the labels here to
# avoid the DataConversionWarning raised by fit().
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

# Generous iteration cap for the solver.
model = LogisticRegression(max_iter=10000)
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 0.8803046789989118
Accuracy on test set: 0.8137088564642009


### Decision Tree

In [11]:
# Use 1-D label arrays, the shape scikit-learn documents for single-output
# classification; train_label is a column vector (needed for the Weka export).
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

# Fixed seed so the fitted tree, and therefore the reported accuracies, are
# reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 1.0
Accuracy on test set: 0.7522904648795385


### KNN

In [12]:
# scikit-learn classifiers expect y with shape (n_samples,). train_label is a
# column vector (needed for the Weka export), so flatten the labels here to
# avoid the DataConversionWarning raised by fit().
y_train = np.ravel(train_label)
y_test = np.ravel(test_label)

model = KNeighborsClassifier()
model.fit(train_data, y_train)
train_score = model.score(train_data, y_train)
test_score = model.score(test_data, y_test)
print("Accuracy on training set: {}".format(train_score))
print("Accuracy on test set: {}".format(test_score))

Accuracy on training set: 0.7969260065288357
Accuracy on test set: 0.6359009161859518
