In [2]:
import imblearn
print(imblearn.__version__)

0.4.3




In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# load dataset
from numpy import genfromtxt
import os
from pandas import DataFrame
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from sklearn import svm
from collections import Counter
from imblearn.over_sampling import SMOTE

# load a list of files
def load_group(filenames, prefix=''):
    loaded = list()
    for name in filenames:
        data = genfromtxt(prefix + name, delimiter=',')
        if 'x' in name:
            loaded.append(data)
        if 'y' in name:
            xData = genfromtxt(prefix + name[:-5] + 'x.csv', delimiter=',')
            interpolated_data = nn_interpolate(data, (xData.shape[0], xData.shape[1]))
            loaded.append(interpolated_data)
    return loaded


def load_dataset(group, prefix=''):
    filenamesX = list()
    filenamesY = list()
    # body acceleration
    for file in os.listdir(prefix):
        if file.startswith(group) and file.endswith('__x.csv'):
            filenamesX += [file]
    # load input data
    X = load_group(filenamesX, prefix)
    # load class output
    for file in os.listdir(prefix):
        if file.startswith(group) and file.endswith('__y.csv'):
            filenamesY += [file]

    y = load_group(filenamesY, prefix)
    return X, y


def nn_interpolate(A, new_size):
    """Vectorized Nearest Neighbor Interpolation"""

    old_size = A.shape
    row_ratio, col_ratio = np.array(new_size) / np.array(old_size)

    # row wise interpolation
    row_idx = (np.ceil(range(1, 1 + int(old_size[0] * row_ratio)) / row_ratio) - 1).astype(int)

    final_matrix = A[row_idx]

    return final_matrix


# # summarize the balance of classes in an output variable column
# def class_breakdown(data):
#     # convert the numpy array into a dataframe
#     df = DataFrame(data)
#     # group data by the class value and calculate the number of rows
#     counts = df.groupby(0).size()
#     # retrieve raw rows
#     counts = counts.values
#     # summarize
#     for i in range(len(counts)):
#         percent = counts[i] / len(df) * 100
#         print('Class=%d, total=%d, percentage=%.3f' % (i + 1, counts[i], percent))

def pre_processing(X):
    len_sequences = []
    for one_file in X:
        for one_seq in one_file:
            len_sequences.append(len(one_seq))

    to_pad = 70172
    new_seq = []
    for one_file in X:
        for one_seq in one_file:
            len_one_seq = len(one_seq)
            last_val = one_seq[-1]
            n = to_pad - len_one_seq

            to_concat = np.repeat(one_seq[-1], n).reshape(6, n).transpose()
            new_one_seq = np.concatenate([one_seq, to_concat])
            new_seq.append(new_one_seq)

        final_seq = np.stack(new_seq)
    seq_len = 60000
    final_seq = sequence.pad_sequences(final_seq, maxlen=seq_len, padding='post', dtype='float',
                                       truncating='post')

def data_finalize(trainX, trainy, valX, valY):
    # train, train_target, validation, val_target = [], [], [], []
    trainSubject, trainTarget, validationSubject, validationTarget = [], [], [], []
    for i in range(0, len(trainX)):
        trainSubject.append(np.vstack(trainX[i]))

    train = np.vstack(trainSubject)

    for i in range(0, len(trainy)):
        trainTarget.append(np.concatenate(trainy[i]))

    train_target = np.concatenate(trainTarget)

    for i in range(0, len(valX)):
        validationSubject.append(np.vstack(valX[i]))

    validation = np.vstack(validationSubject)

    for i in range(0, len(valY)):
        validationTarget.append(np.concatenate(valY[i]))

    val_target = np.concatenate(validationTarget)

    return train, train_target, validation, val_target

def create_MLmodel(trainX, trainy, valX, valY):
    clf = svm.SVC()
    clf.fit(trainX, trainy)

# load all train
trainX = list()
trainy = list()
validationX = list()
validationY = list()
for i in range(1, 9):
    trX, trY = load_dataset('subject_00' + str(i), '/content/gdrive/MyDrive/data/')
    trainX.append(trX)
    trainy.append(trY)
for i in range(6, 9):
    valX, valY = load_dataset('subject_00' + str(i), '/content/gdrive/MyDrive/data/')
    validationX.append(valX)
    validationY.append(valY)
train, train_target, val, val_target = data_finalize(trainX, trainy, validationX, validationY)
train = train[:-2, :]




In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

counter = Counter(train_target)
print(counter)

oversample = SMOTE()
Xsmote, Ysmote = oversample.fit_resample(train, train_target)

counter = Counter(Ysmote)
print(counter)

# trainX, testX, trainy, testy = train_test_split(train, train_target, test_size=0.5, stratify=y)
model = LogisticRegression(solver='liblinear')
# print(trainX.shape)
# print(train_target.shap)
model.fit(Xsmote, Ysmote)
yhat = model.predict(val)

Counter({0.0: 1006926, 3.0: 206434, 2.0: 73068, 1.0: 55216})




Counter({0.0: 1006926, 1.0: 1006926, 2.0: 1006926, 3.0: 1006926})


In [7]:
print('Accuracy: %.3f' % accuracy_score(val_target, yhat))
print('Precision: %.3f' % precision_score(val_target, yhat, average='micro'))
print('Recall: %.3f' % recall_score(val_target, yhat, average='micro'))
print('F-measure: %.3f' % f1_score(val_target, yhat, average='micro'))

Accuracy: 0.357
Precision: 0.357
Recall: 0.357
F-measure: 0.357


In [4]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# SVC_model = svm.SVC()
KNN_model = KNeighborsClassifier(n_neighbors=5)
# SVC_model.fit(train, train_target)
KNN_model.fit(Xsmote, Ysmote)
# print(accuracy_score(SVC_prediction, y_test))


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [5]:
KNN_prediction = KNN_model.predict(val)
print(accuracy_score(KNN_prediction, val_target))
# But Confusion Matrix and Classification Report give more details about performance
# print(confusion_matrix(SVC_prediction, y_test))
print(classification_report(KNN_prediction, val_target))

0.7195762299589661
              precision    recall  f1-score   support

         0.0       0.64      0.99      0.78    183351
         1.0       0.99      0.32      0.48     33945
         2.0       0.99      0.40      0.57     43167
         3.0       0.96      0.53      0.68    117761

    accuracy                           0.72    378224
   macro avg       0.90      0.56      0.63    378224
weighted avg       0.81      0.72      0.70    378224



In [15]:
KNN_model = KNeighborsClassifier(n_neighbors=4)
# SVC_model.fit(train, train_target)
KNN_model.fit(Xsmote, Ysmote)
KNN_prediction = KNN_model.predict(val)
print(accuracy_score(KNN_prediction, val_target))
# But Confusion Matrix and Classification Report give more details about performance
# print(confusion_matrix(SVC_prediction, y_test))
print(classification_report(KNN_prediction, val_target))

0.8028919370531749
              precision    recall  f1-score   support

         0.0       0.74      1.00      0.85    212425
         1.0       0.99      0.41      0.58     26195
         2.0       0.99      0.49      0.66     34736
         3.0       0.99      0.61      0.76    104868

    accuracy                           0.80    378224
   macro avg       0.93      0.63      0.71    378224
weighted avg       0.85      0.80      0.79    378224



In [9]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(Xsmote, Ysmote)
y_pred = nb.predict(val)
print(accuracy_score(y_pred, val_target))
print(classification_report(y_pred, val_target))

0.2287083844494268
              precision    recall  f1-score   support

         0.0       0.21      0.78      0.33     76876
         1.0       0.17      0.04      0.06     50742
         2.0       0.62      0.06      0.10    188383
         3.0       0.21      0.22      0.21     62223

    accuracy                           0.23    378224
   macro avg       0.30      0.27      0.18    378224
weighted avg       0.41      0.23      0.16    378224



In [16]:
from sklearn.ensemble import RandomForestClassifier

rfm = RandomForestClassifier(n_estimators=70, oob_score=True, n_jobs=-1, random_state=101, max_features=None, min_samples_leaf=30)
rfm.fit(Xsmote, Ysmote)
rfm_pred = rfm.predict(val)
print(accuracy_score(y_pred, val_target))
print(classification_report(y_pred, val_target))

0.2287083844494268
              precision    recall  f1-score   support

         0.0       0.21      0.78      0.33     76876
         1.0       0.17      0.04      0.06     50742
         2.0       0.62      0.06      0.10    188383
         3.0       0.21      0.22      0.21     62223

    accuracy                           0.23    378224
   macro avg       0.30      0.27      0.18    378224
weighted avg       0.41      0.23      0.16    378224



In [12]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='modified_huber', shuffle=True, random_state=101)
sgd.fit(Xsmote, Ysmote)
y_pred_sgd = sgd.predict(val)
print(accuracy_score(y_pred_sgd, val_target))
print(classification_report(y_pred_sgd, val_target))

0.0648213756927112
              precision    recall  f1-score   support

         0.0       0.01      0.83      0.02      3560
         1.0       0.01      0.10      0.03      1643
         2.0       0.91      0.05      0.09    347702
         3.0       0.09      0.22      0.13     25319

    accuracy                           0.06    378224
   macro avg       0.26      0.30      0.06    378224
weighted avg       0.85      0.06      0.09    378224

