In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Apply seaborn's "darkgrid" look to the plots drawn below.
sns.set_style(style="darkgrid")

import common_functions as cf

## Import Data

In [None]:
# Load the pickled object (presumably the merged DataFrame) from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load
# "merged_df.pkl" when it comes from a trusted source.
df_all = pd.read_pickle("merged_df.pkl")

# Feature engineering (basic)

In [None]:
# Run the project's basic feature-engineering helper on the loaded data.
# NOTE(review): which columns it adds is defined in common_functions, not here.
df_fe = cf.fe_basic_features(df_all)

## Data Splitting

In [None]:
# Partition the engineered data into a training part and a test part
# (the split rule lives in common_functions.data_split_TrainTest).
train_df, test_df = cf.data_split_TrainTest(df_fe)

In [None]:
# Separate each partition into model inputs (X_*) and targets (y_*).
X_train, y_train = cf.data_split_FeatLabel(train_df)
X_test, y_test = cf.data_split_FeatLabel(test_df)

## Modeling

### DecisionTreeClassifier (baseline)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Shallow decision tree (depth <= 4) used as the baseline classifier.
# random_state is pinned so that Restart & Run All reproduces the same tree;
# without it scikit-learn's random feature permutation can change the result.
dtc = DecisionTreeClassifier(max_depth=4, random_state=42)
dtc.fit(X_train, y_train)

In [None]:
# Decision-tree accuracy on the training data, in percent with 3 decimals.
print(
    " Train data : "
    f"{round(accuracy_score(y_train, dtc.predict(X_train)) * 100, 3)}"
    " %"
)

In [None]:
# Decision-tree accuracy on the test data, in percent with 3 decimals.
print(
    " Test data : "
    f"{round(accuracy_score(y_test, dtc.predict(X_test)) * 100, 3)}"
    " %"
)

#### Analyze results

In [None]:
# Keep the identifying columns and attach the tree's predictions for the
# training rows. .assign() returns a new DataFrame, so nothing is written into
# a slice of train_df (the original column assignment on the slice triggers
# pandas' SettingWithCopyWarning).
train_df_res = train_df[["action", "action_num", "label"]].assign(
    predict=dtc.predict(X_train)
)

In [None]:
# For every distinct value of `action`, list the distinct predictions the tree
# produced for the rows carrying that value.
for action_value in train_df_res["action"].unique():
    rows_for_action = train_df_res["action"] == action_value
    print(action_value, train_df_res.loc[rows_for_action, "predict"].unique())

### RandomForestClassifier (baseline)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 100 trees, each limited to depth 4.
# random_state is pinned because the forest bootstraps rows and samples
# features at random; without a seed every run reports different scores.
rfc = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Random-forest accuracy on the training data, in percent with 3 decimals.
print(
    " Train data : "
    f"{round(accuracy_score(y_train, rfc.predict(X_train)) * 100, 3)}"
    " %"
)

In [None]:
# Random-forest accuracy on the test data, in percent with 3 decimals.
print(
    " Test data : "
    f"{round(accuracy_score(y_test, rfc.predict(X_test)) * 100, 3)}"
    " %"
)

### Neural Network

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential,load_model

from tensorflow.keras.layers import Dense
#from tensorflow.keras.utils import np_utils
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Fully connected binary classifier: 80 -> 40 -> 20 ReLU units, one sigmoid output.
# Note: despite the variable name, `cnn` contains only Dense layers.
input_dim = X_train.shape[1]
cnn = Sequential(
    [
        Dense(80, input_dim=input_dim, activation="relu"),
        Dense(40, activation="relu"),
        Dense(20, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

opt = keras.optimizers.Adam(learning_rate=0.01)
cnn.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

# Stop training once val_loss has not improved for 10 epochs.
es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=10)
# Write the model to disk whenever val_accuracy reaches a new maximum.
mc = ModelCheckpoint(
    "best_model.h5",
    monitor="val_accuracy",
    mode="max",
    verbose=1,
    save_best_only=True,
)

In [None]:
#X_train, val_X, y_train, val_y = train_test_split(X_train, y_train, shuffle=True)

In [None]:
# Convert the feature/label containers to NumPy arrays for Keras.
X_train_arr, y_train_arr = np.asarray(X_train), np.asarray(y_train)
X_test_arr, y_test_arr = np.asarray(X_test), np.asarray(y_test)

In [None]:
# Train for up to 100 epochs in batches of 200 rows; 20 % of the training
# rows are set aside as validation data for the two callbacks.
history = cnn.fit(
    X_train_arr,
    y_train_arr,
    epochs=100,
    batch_size=200,
    callbacks=[es, mc],
    validation_split=0.2,
)

In [None]:
# Plot training loss against validation loss, one point per epoch.
history_df = pd.DataFrame(history.history)
history_df[["loss", "val_loss"]].plot()
plt.show()

#### Evaluation

- Evaluate the in-memory model as it stands when training stops (not necessarily the best one)

In [None]:
from sklearn.metrics import mean_absolute_error
 
# Sigmoid outputs of the in-memory model (its state when training stopped),
# scored by their mean absolute distance to the test labels.
predictions = cnn.predict(X_test_arr)
mean_absolute_error(y_true=y_test, y_pred=predictions)

- Evaluate best model which was monitored and saved

In [None]:
# Reload the checkpoint written by ModelCheckpoint and report its accuracy.
# evaluate() returns the loss followed by the compiled metric (accuracy);
# the loss is discarded.
best_saved_model = load_model("best_model.h5")
_, train_acc = best_saved_model.evaluate(
    X_train_arr, y_train_arr, verbose=0
)
_, test_acc = best_saved_model.evaluate(
    X_test_arr, y_test_arr, verbose=0
)
print('Train Data: %.2f, Test data: %.2f' % (100 * train_acc, 100 * test_acc))

# Feature engineering (Lagging)
Create information from consecutive points in time within a single row.
How to create: copy one feature into a new column and shift it down by one or more rows. Depending on the number n of lag features, the dataframe gets n additional columns, one for each of the n shifts in time.

In [None]:
# Add lag features for the acceleration norm and the three rotR_* columns;
# 15 is the lag setting passed to the project helper.
lag_df = cf.fe_lag_features(
    df_fe,
    15,
    ["accel_norm", "rotR_x", "rotR_y", "rotR_z"],
)

In [None]:
# Number of columns after adding the lag features.
lag_df.shape[1]

## Data Splitting

In [None]:
# Partition the lag-feature data into a training part and a test part.
train_df_lag, test_df_lag = cf.data_split_TrainTest(lag_df)

In [None]:
# Separate each lag-feature partition into model inputs (X_*) and targets (y_*).
X_train_lag, y_train_lag = cf.data_split_FeatLabel(train_df_lag)
X_test_lag, y_test_lag = cf.data_split_FeatLabel(test_df_lag)

## Modeling

### RandomForestClassifier (baseline)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the lag features: 100 trees, each limited to depth 4.
# random_state is pinned because the forest bootstraps rows and samples
# features at random; without a seed every run reports different scores.
lag_rfc = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42)
lag_rfc.fit(X_train_lag, y_train_lag)

In [None]:
# Lag-feature random-forest accuracy on the training data, in percent.
# The printed text previously misspelled "accuracy".
print(
    " The accuracy for Train data is: "
    f"{round(accuracy_score(y_train_lag, lag_rfc.predict(X_train_lag)) * 100, 3)} %"
)

In [None]:
# Lag-feature random-forest accuracy on the TEST data, in percent.
# The label previously said "Train data" although the score is computed from
# y_test_lag / X_test_lag; it also misspelled "accuracy".
print(
    " The accuracy for Test data is: "
    f"{round(accuracy_score(y_test_lag, lag_rfc.predict(X_test_lag)) * 100, 3)} %"
)


### Neural Network

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential,load_model

from tensorflow.keras.layers import Dense
#from tensorflow.keras.utils import np_utils
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Fully connected binary classifier for the lag features:
# 80 -> 40 -> 20 ReLU units, one sigmoid output.
# Note: despite the variable name, `cnn` contains only Dense layers.
input_dim = X_train_lag.shape[1]
cnn = Sequential(
    [
        Dense(80, input_dim=input_dim, activation="relu"),
        Dense(40, activation="relu"),
        Dense(20, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Adam with learning rate 0.01, binary cross-entropy loss, accuracy as metric.
opt = keras.optimizers.Adam(learning_rate=0.01)
cnn.compile(
    loss="binary_crossentropy",
    optimizer=opt,
    metrics=["accuracy"],
)

In [None]:
# Convert the lag-feature containers to NumPy arrays for Keras.
# These names are the same ones the basic-feature network used, so from here
# on they refer to the lag-feature data.
X_train_arr, y_train_arr = np.asarray(X_train_lag), np.asarray(y_train_lag)
X_test_arr, y_test_arr = np.asarray(X_test_lag), np.asarray(y_test_lag)

In [None]:
# Stop training once val_loss has not improved for 20 epochs.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    verbose=1,
    patience=20,
)
# Write the model to disk whenever val_accuracy reaches a new maximum.
mc = ModelCheckpoint(
    "best_model.h5",
    monitor="val_accuracy",
    mode="max",
    verbose=1,
    save_best_only=True,
)


In [None]:
# Train for up to 100 epochs in batches of 200 rows; 20 % of the training
# rows are set aside as validation data for the two callbacks.
history = cnn.fit(
    X_train_arr,
    y_train_arr,
    epochs=100,
    batch_size=200,
    callbacks=[es, mc],
    validation_split=0.2,
)


In [None]:
# Plot training loss against validation loss, one point per epoch.
history_df = pd.DataFrame(history.history)
history_df[["loss", "val_loss"]].plot()
plt.show()

#### Evaluation

- Evaluate the in-memory model as it stands when training stops (not necessarily the best one)

In [None]:
from sklearn.metrics import mean_absolute_error
 
# Sigmoid outputs of the in-memory model (its state when training stopped),
# scored by their mean absolute distance to the test labels.
predictions = cnn.predict(X_test_arr)
mean_absolute_error(y_true=y_test_lag, y_pred=predictions)

- Evaluate best model which was monitored and saved

In [None]:
# Reload the checkpoint written by ModelCheckpoint and report its accuracy.
# NOTE(review): "best_model.h5" is also the checkpoint path of the
# basic-feature network, so the file holds whichever run wrote it last.
# evaluate() returns the loss followed by the compiled metric (accuracy);
# the loss is discarded.
best_saved_model = load_model("best_model.h5")
_, train_acc = best_saved_model.evaluate(
    X_train_arr, y_train_arr, verbose=0
)
_, test_acc = best_saved_model.evaluate(
    X_test_arr, y_test_arr, verbose=0
)
print('Train: %.2f, Test: %.2f' % (100 * train_acc, 100 * test_acc))

In [None]:
# Predict the test set with the reloaded checkpoint and score the raw sigmoid
# outputs by their mean absolute distance to the labels.
best_model_predictions = best_saved_model.predict(X_test_arr)
mean_absolute_error(y_true=y_test_lag, y_pred=best_model_predictions)

In [None]:
# Turn each predicted probability into a hard class label: 1 at or above 0.5,
# otherwise 0.
bin_best_model_predictions = [
    1 if val >= 0.5 else 0 for val in best_model_predictions
]

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from matplotlib import pyplot as plt

# Confusion matrix of the reloaded checkpoint on the lag-feature test set.
# scikit-learn expects (y_true, y_pred): rows are the true labels and columns
# the predicted ones, which is also how ConfusionMatrixDisplay labels its axes.
# Passing the predictions first (as before) transposes the matrix and swaps
# false positives with false negatives in the plot.
conf_mat = confusion_matrix(y_test_lag, bin_best_model_predictions)
conf_mat_disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
conf_mat_disp.plot(cmap="Greens")
plt.show()