In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Use seaborn's "darkgrid" theme for the figures drawn in this notebook.
sns.set_style("darkgrid")

import common_functions as cf

## Import Data

In [None]:
# Load the dataset from `merged_w_df.pkl` (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
df_all = pd.read_pickle("merged_w_df.pkl")

In [None]:
# Sanity check: dimensions of the loaded data.
df_all.shape

In [None]:
# Preview the first two rows of the loaded data.
df_all.head(2)

## Feature Engineering (Basic)

In [None]:
# Build the feature table via the shared helper module.
# NOTE(review): presumably returns a new frame and leaves df_all untouched — confirm in common_functions.
df_fe = cf.fe_basic_features(df_all)

In [None]:
# Preview the first two rows after feature engineering.
df_fe.head(2)

In [None]:
# Dimensions after feature engineering (compare with df_all.shape above).
df_fe.shape

## Data Splitting (Train/Test)

In [None]:
# Split the engineered frame into train and test parts using the shared helper.
# NOTE(review): the split strategy (random vs. grouped, ratio) lives in common_functions — confirm there.
train_df,test_df = cf.data_split_TrainTest(df_fe)

In [None]:
# Dimensions of the train and test splits.
train_df.shape,test_df.shape

## Preprocess (Normalizing)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns are every column from the first one up to and including this
# label (``.loc`` slices are inclusive on both ends).
# NOTE(review): the -4 offset assumes the last three columns of df_fe are
# non-feature columns — confirm against cf.fe_basic_features.
last_feat_column = df_fe.columns[-4]

# Standardise the features separately for each action, learning the mean and
# standard deviation from the TRAINING rows only. The fitted scalers are kept so
# the test rows can be transformed with exactly the same statistics; fitting a
# second scaler on the test rows would put train and test on different scales
# and leak test-set statistics into preprocessing.
scalers_by_action = {}
for act_num in train_df.action_num.unique():
    train_rows = train_df.action_num == act_num
    std = StandardScaler().set_output(transform='pandas')

    train_feats = train_df.loc[train_rows, :last_feat_column].copy()
    std.fit(train_feats)
    train_df.loc[train_rows, :last_feat_column] = std.transform(train_feats)
    scalers_by_action[act_num] = std

# Apply the train-fitted scalers to the matching test rows.
for act_num in test_df.action_num.unique():
    if act_num not in scalers_by_action:
        # No training statistics exist for this action, so it cannot be scaled
        # consistently; surface the split problem instead of guessing.
        raise ValueError(
            f"action_num {act_num!r} is present in test_df but not in train_df; "
            "cannot scale it with training statistics"
        )
    test_rows = test_df.action_num == act_num
    test_feats = test_df.loc[test_rows, :last_feat_column].copy()
    test_df.loc[test_rows, :last_feat_column] = scalers_by_action[act_num].transform(test_feats)

In [None]:
# Scaling only overwrites values in place, so both shapes should match the ones shown before it.
train_df.shape,test_df.shape

## Modeling

### Split Data (Feature/Label)

In [None]:
def _to_sequences(features, labels, seq_len):
    """Group consecutive rows into fixed-length sequences with one label each.

    Parameters
    ----------
    features : array-like of shape (n_rows, n_features)
        Row-wise feature table; every ``seq_len`` consecutive rows form one sequence.
    labels : array-like with ``n_rows`` elements
        Per-row binary labels (0 or 1) aligned with ``features``.
    seq_len : int
        Number of rows per sequence.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Features reshaped to ``(n_rows // seq_len, seq_len, n_features)`` and an
        integer label per sequence (0 or 1).

    Raises
    ------
    ValueError
        If ``n_rows`` is not a multiple of ``seq_len``, or if any sequence mixes
        0 and 1 labels (which would otherwise leave features and labels misaligned).
    """
    features = np.asarray(features)
    labels = np.asarray(labels)

    n_rows = features.shape[0]
    if n_rows % seq_len != 0:
        raise ValueError(
            f"number of rows ({n_rows}) is not a multiple of the sequence size ({seq_len})"
        )
    n_sequences = n_rows // seq_len

    X_seq = features.reshape(n_sequences, seq_len, features.shape[1])

    # A sequence is negative when all of its row labels are 0 and positive when
    # all of them are 1; the row-label sum distinguishes the two cases.
    label_sums = labels.reshape(n_sequences, seq_len).sum(axis=1)
    is_positive = label_sums == seq_len
    is_mixed = ~((label_sums == 0) | is_positive)
    if is_mixed.any():
        bad = np.flatnonzero(is_mixed)
        raise ValueError(
            f"{bad.size} sequence(s) contain mixed labels (first at sequence index {bad[0]})"
        )

    return X_seq, is_positive.astype(int)


def load_dataset():
    """Turn the row-wise train/test frames into sequence arrays for the model.

    Reads the notebook-level ``train_df`` and ``test_df``, splits each into
    features and labels with ``cf.data_split_FeatLabel``, and groups every
    ``cf.SEQUENCE_SIZE`` consecutive rows into one sequence with a single label.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where each ``X`` has shape
        ``(n_sequences, cf.SEQUENCE_SIZE, n_features)`` and each ``y`` has shape
        ``(n_sequences,)``.

    Raises
    ------
    ValueError
        If a split cannot be divided into whole sequences or a sequence has
        inconsistent row labels.
    """
    seq_len = cf.SEQUENCE_SIZE

    train_feats, train_labels = cf.data_split_FeatLabel(train_df)
    test_feats, test_labels = cf.data_split_FeatLabel(test_df)

    X_train, y_train_compact = _to_sequences(train_feats, train_labels, seq_len)
    X_test, y_test_compact = _to_sequences(test_feats, test_labels, seq_len)

    return X_train, y_train_compact, X_test, y_test_compact

In [None]:
# Build the sequence arrays: features are (n_sequences, SEQUENCE_SIZE, n_features),
# labels are one value per sequence.
trainX, trainy, testX, testy = load_dataset()

In [None]:
# Shapes of the sequence arrays; the first dimension of each X must match its y.
trainX.shape,trainy.shape, testX.shape, testy.shape

In [None]:
# Each sequence is cut into `n_steps` consecutive sub-sequences of `n_length`
# rows, giving 4-D input of shape (n_sequences, n_steps, n_length, n_features).
n_steps = 2
n_length = int(cf.SEQUENCE_SIZE/n_steps)
n_features = trainX.shape[2]

trainX_resh, testX_resh = (
    seq_array.reshape((seq_array.shape[0], n_steps, n_length, n_features))
    for seq_array in (trainX, testX)
)

In [None]:
# Label arrays are unaffected by the feature reshape above.
trainy.shape, testy.shape

In [None]:
# Expect (n_sequences, n_steps, n_length, n_features) for both arrays.
trainX_resh.shape,testX_resh.shape

### Neural Network

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Flatten,Dropout,LSTM,TimeDistributed,MaxPooling1D,Conv1D

from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import random

import tensorflow as tf
import sklearn

# Seed every random number generator the notebook relies on so that
# Restart & Run All gives repeatable results.
tf.random.set_seed(20)  # TensorFlow / Keras (weight initialisation, dropout)
np.random.seed(20)      # NumPy global RNG; scikit-learn draws from it when no random_state is passed
random.seed(1)          # Python's built-in RNG (scikit-learn has no `sklearn.random` module to seed)

In [None]:
# CNN-LSTM classifier.
# The TimeDistributed wrappers apply the same convolutional stack to each
# sub-sequence of shape (n_length, n_features); the LSTM then reads the
# resulting per-sub-sequence feature vectors in order, and the dense head
# outputs a single sigmoid probability for the binary label.
model = Sequential([
    TimeDistributed(
        Conv1D(filters=10, kernel_size=3, activation='relu'),
        input_shape=(None, n_length, n_features),
    ),
    TimeDistributed(Conv1D(filters=10, kernel_size=3, activation='relu')),
    TimeDistributed(Dropout(0.2)),
    TimeDistributed(MaxPooling1D(pool_size=2)),
    TimeDistributed(Flatten()),
    LSTM(50),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line).
model.summary()


In [None]:
# Binary classification set-up: Adam optimiser, cross-entropy loss, accuracy metric.
opt = keras.optimizers.Adam(learning_rate=0.02)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Hold out 20% of the training sequences for validation. A fixed random_state
# makes the split identical every time this cell is run, not only on the first
# run after seeding the global RNG.
X_train, val_X, y_train, val_y = train_test_split(
    trainX_resh,
    trainy,
    test_size=0.2,
    shuffle=True,
    random_state=20,
)

In [None]:
# Shapes of the training and validation parts produced by the split above.
X_train.shape, val_X.shape, y_train.shape, val_y.shape

In [None]:
# Stop training once validation loss has not improved for 20 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
    verbose=1,
)

# Write the model to disk whenever validation accuracy reaches a new maximum.
mc = ModelCheckpoint(
    'best_model_LSTM.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Train silently (verbose=0); the callbacks above still report their own events.
history = model.fit(
    X_train,
    y_train,
    validation_data=(val_X, val_y),
    epochs=50,
    batch_size=250,
    callbacks=[es, mc],
    verbose=0,
)

In [None]:
# Per-epoch metrics recorded by fit(), one column per metric.
history_df = pd.DataFrame(history.history)

# Plot training loss against validation loss across epochs.
loss_curves = history_df[['loss', 'val_loss']]
loss_curves.plot()
plt.show()

#### Evaluate

In [None]:
# Reload the checkpoint written by ModelCheckpoint during training.
best_saved_model = load_model('best_model_LSTM.h5')

# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy']; only the accuracy is reported here.
# Note: trainX_resh is the full training set, including the rows held out for validation.
train_metrics = best_saved_model.evaluate(trainX_resh, trainy, verbose=0)
test_metrics = best_saved_model.evaluate(testX_resh, testy, verbose=0)
train_acc = train_metrics[1]
test_acc = test_metrics[1]

print('Train: %.2f, Test: %.2f' % (train_acc*100, test_acc*100))