# Import Data

In [8]:
SMALLEST_DS_SIZE = 100
ROLLING_STEPS = 100

# Get data file names
START_T_COUNT = 0
START_F_COUNT = 500

SPLIT_T_RATIO = 0.7

NUM_NO_FEAT_COLUMN = 3

In [9]:
import pandas as pd
import numpy as np

import glob
import os
import random

from tensorflow import keras
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import make_pipeline

import tensorflow as tf
import sklearn

tf.random.set_seed(20) # to fix the randomization
np.random.seed(20) # to fix the randomizationin sklearn
sklearn.random.seed(1)
random.seed(10)

In [10]:
path = r'D:\_1_Technicals\DataScience_Bootcamp\Final_project\Flutter_ArduinoConnect\00_DataSets'
filenames = glob.glob(path + "/**/*.csv")

# shuffeling the files as its always ordered alphabetic

shuffled_filenames = random.sample(filenames, len(filenames))

dfs = []
num_t = START_T_COUNT
num_f = START_F_COUNT
for filename in shuffled_filenames:
   
    df = pd.read_csv(filename, index_col=False)
    if df.shape[0] > SMALLEST_DS_SIZE:
        df.columns= ["time","rotR_x","rotR_y","rotR_z","acc_x","acc_y","acc_z","or_x","or_y","or_z","grav_x","grav_y","grav_z"]
              
        _, tail = os.path.split(filename)
        df["action"] = str(tail).split(".")[0]
        

        if df.action.str.startswith("t_").sum():
            df["label"] = 1
            df["action_num"] = num_t # 0 - 499
            num_t += 1
        else:
            df["label"] = 0
            df["action_num"] = num_f # 500 - 1000
            num_f += 1
        
        df = df.drop(columns="time")
        dfs.append(df)
        
df_all = pd.concat(dfs,axis=0)

# Split Data (Train, Test)

In [11]:

num_t_ds = len(df_all.query("action_num < @START_F_COUNT").action.unique())
num_f_ds = len(df_all.query("action_num >= @START_F_COUNT").action.unique())

num_train_t = round(num_t_ds * SPLIT_T_RATIO)
num_train_f = round(num_f_ds * SPLIT_T_RATIO)

train_df = df_all[(df_all.action_num.between(START_T_COUNT,START_T_COUNT+num_train_t,inclusive="both"))|
                (df_all.action_num.between(START_F_COUNT,START_F_COUNT+num_train_f,inclusive="both"))]

test_df = df_all[(df_all.action_num.between(START_T_COUNT+num_train_t,START_F_COUNT,inclusive="neither"))|
                (df_all.action_num > START_F_COUNT+num_train_f)]

# Split Data (X, y)

In [12]:
X_train = train_df.iloc[:,:(-1*NUM_NO_FEAT_COLUMN)]
y_train = train_df.label

X_test = test_df.iloc[:,:(-1*NUM_NO_FEAT_COLUMN)]
y_test = test_df.label

X_train, val_X, y_train, val_y = train_test_split(X_train, y_train, shuffle=True,test_size=0.2 )

## Pipeline

### Create preprocess Transformer

In [13]:
#takes df of size roll_steps+1, divide data, 

class PreprocessDataTransformer(BaseEstimator, TransformerMixin,auto_wrap_output_keys=None):
    def __init__(self, *, roll_steps= 100, state="train"):
        self.roll_steps = roll_steps
        self.state = state
        #super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        try:
            X_.columns= ["time","rotR_x","rotR_y","rotR_z","acc_x","acc_y","acc_z","or_x","or_y","or_z","grav_x","grav_y","grav_z"]
            X_.acc_x = X_.acc_x/4096.0
            X_.acc_y = X_.acc_y/4096.0
            X_.acc_z = X_.acc_z/4096.0
            
            X_.rotR_x = X_.rotR_x/(16.4*100)
            X_.rotR_y = X_.rotR_y/(16.4*100)
            X_.rotR_z = X_.rotR_z/(16.4*100)
            
            X_.grav_x = X_.grav_x/4096.0
            X_.grav_y = X_.grav_y/4096.0
            X_.grav_z = X_.grav_z/4096.0
            
            X_.or_x = X_.or_x/100
            X_.or_y = X_.or_y/100
            X_.or_z = X_.or_z/100
            X_ = X_.drop(columns="time")
        except Exception as e:
            print(" Normalization failed: ", e)
        
        try:
            X_.insert(0, 'accel_x', X_.acc_x + X_.grav_x)
            X_.insert(1, 'accel_y', X_.acc_y + X_.grav_y)
            X_.insert(2, 'accel_z', X_.acc_z + X_.grav_z)
            X_.insert(3, 'accel_norm', np.sqrt(X_.accel_x**2 + X_.accel_y**2 + X_.accel_z**2))
            X_ = X_.drop(['accel_x', 'accel_y', 'accel_z'], axis=1)
        except Exception as e:
            print(" Basic Features failed: ", e)
    
        try:
            j = 1
            for i in X_.columns:
                X_.insert(j, f'{i}_rmean', X_[i].rolling(self.roll_steps).mean())
                X_.insert(j+1, f'{i}_rstd', X_[i].rolling(self.roll_steps).std())
                X_.insert(j+2, f'{i}_rmed', X_[i].rolling(self.roll_steps).median())
                j += 4 
                
            if self.state == "train":
                # Dropping all rows where the lag overlapped two different subjects/trials.
                for i in range(self.roll_steps):
                    X_ = X_.drop([i])
            else:
                X_ = X_.iloc[-1]  # If deployment, take last part
                X_ = np.asarray(X_).reshape(1,-1)
        except Exception as e:
            print(" Rolling failed: ", e)
            
        if type(X_) != np.ndarray :
            X_ = np.asarray(X_)
        return X_

In [14]:
my_prepocessor = PreprocessDataTransformer(roll_steps= 100, state="train")

### Create MLP Model

In [21]:
X_train.shape[1]

12

In [15]:
input_dim = X_train.shape[1]
cnn = Sequential()
cnn.add(Dense(100, input_dim=input_dim, activation='relu'))
cnn.add(Dropout(rate =0.2)) # drop some of the neurals in the back prop. analysis to avoid overfitting
cnn.add(BatchNormalization())
cnn.add(Dense(40, activation='relu'))
cnn.add(Dropout(rate =0.5))
cnn.add(BatchNormalization())
cnn.add(Dense(20, activation='relu'))
cnn.add(Dropout(rate =0.4))
cnn.add(BatchNormalization())
cnn.add(Dense(1, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.005)
cnn.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [16]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=0, patience=10)
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# cnn.fit(X_train, y_train, epochs=100, batch_size=250,callbacks=[es,mc]
#                   , validation_data=(val_X,val_y)
#                   )

### Build Pipeline

In [17]:

pipe = make_pipeline(my_prepocessor,cnn)

In [20]:
pipe.fit(X_train, y_train, sequential__epochs=100, sequential__batch_size=250,sequential__callbacks=[es,mc], sequential__validation_data=(val_X,val_y))

 Normalization failed:  Length mismatch: Expected axis has 12 elements, new values have 13 elements


ValueError: Data cardinality is ambiguous:
  x sizes: 25221
  y sizes: 33201
Make sure all arrays contain the same number of samples.

### Evaluation

In [None]:
best_saved_model = load_model('best_model.h5')
_, train_acc = best_saved_model.evaluate(X_train, y_train, verbose=0)
_, test_acc = best_saved_model.evaluate(X_test, y_test, verbose=0)
print('Train: %.2f, Test: %.2f' % (train_acc*100, test_acc*100))

In [None]:

best_model_predictions = best_saved_model.predict(X_test)
mae = round(mean_absolute_error(y_test, best_model_predictions),3)
print("mean absolute error is",mae)

In [None]:
# convert predicted label rom analog to binary value
bin_best_model_predictions=[]
for val in best_model_predictions:
    if val >=0.5:
        bin_best_model_predictions.append(1)
    else:
        bin_best_model_predictions.append(0)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from matplotlib import pyplot as plt

conf_mat = confusion_matrix(bin_best_model_predictions, y_test_roll)
conf_mat_disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
conf_mat_disp.plot(cmap='Greens')
plt.show()