<a href="https://colab.research.google.com/github/LeechXDD/9417_Pro_Project/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install tensorflow_addons
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import tensorflow_addons as tfa
from sklearn.svm import SVC
import joblib as job




TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



##Data preprocessing

In [3]:
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

In [4]:
from google.colab import drive
drive.mount('/content/drive')

dataset_df = pd.read_csv('/content/drive/MyDrive/9417project/train.csv', dtype=dtypes)
labels = pd.read_csv('/content/drive/MyDrive/9417project/train_labels.csv')

Mounted at /content/drive


In [5]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

In [6]:
CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y',
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

In [7]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df):
    dfs = []
    for c in CATEGORICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [8]:
dataset_df = feature_engineer(dataset_df)

In [9]:
def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)

In [10]:
# Fetch the unique list of user sessions in the validation dataset. We assigned
# `session_id` as the index of our feature engineered dataset. Hence fetching
# the unique values in the index column will give us a list of users in the
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is:
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s.
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
SVM_prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
# Create an empty dictionary to store the models created for each question.
models = {}

f1_scores = []

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict ={}

In [11]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('label')  # Assume 'label' is your target column
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [12]:
def prepare_data(df, sequence_length):
    """
    Prepare data for LSTM, based on sequence length.
    """
    # Get labels
    labels = df.groupby(df.index)["correct"].last().values
    # Drop the 'correct' and 'level_group' columns from the features
    features = df.drop(columns=["correct", "level_group"])

    data = []
    label = []

    # Create sequences
    for user, group in features.groupby(features.index):
        user_features = group.values
        if len(user_features) < sequence_length:
            padding = np.zeros((sequence_length - len(user_features), user_features.shape[1]))
            user_features = np.concatenate((user_features, padding))
        data.append(user_features)
        label.append(labels[len(data)-1])

    # Convert lists to numpy arrays
    data = np.array(data)
    label = np.array(label)

    return data, label


In [13]:
# model define
input_shape = (100, 32)

In [None]:
sequence_length = 100
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):
    # Select level group for the question based on the q_no.
    if q_no<=4: grp = '0-4'
    elif q_no<=12: grp = '5-12'
    elif q_no<=18: grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected level group.
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    print(len(valid_users))

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

    # Prepare data for LSTM
    train_data, train_labels = prepare_data(train_df, sequence_length)
    valid_data, valid_labels = prepare_data(valid_df, sequence_length)




    print(len(valid_data))
    print(len(valid_labels))

    # Define the model
    model = Sequential()
    model.add(LSTM(64, input_shape=(sequence_length, train_data.shape[-1]), return_sequences=True))
    model.add(LSTM(32, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    model.fit(train_data, train_labels, epochs=3, validation_data=(valid_data, valid_labels))
    model.save(f'/content/drive/MyDrive/9417project/models/LSTM_models/model_{grp}_{q_no}.hdf5')
    # Store the model
    models[f'{grp}_{q_no}'] = model

    # Evaluate the model
    evaluation = model.evaluate(valid_data, valid_labels)
    evaluation_dict[q_no] = evaluation[1]
    print(f"Validation accuracy: {evaluation[1]}")

    # Make predictions
    predictions = model.predict(valid_data)
    prediction_df.loc[valid_users, q_no-1] = predictions.flatten()


    binary_predictions = [1 if p > 0.62 else 0 for p in predictions]
    # Get the true labels
    true_labels = valid_labels[:len(valid_data)]

    # Calculate the F1 score and add it to the list
    f1 = f1_score(true_labels, binary_predictions)
    f1_scores.append(f1)



In [None]:
# for name, value in evaluation_dict.items():
#   print(f"question {name}: accuracy {value:.4f}")

# print("\nAverage accuracy", sum(evaluation_dict.values())/18)

In [28]:
from tensorflow.keras.models import load_model
for q_no in range(1,19):
  if q_no<=4: grp = '0-4'
  elif q_no<=12: grp = '5-12'
  elif q_no<=18: grp = '13-22'
  model_path = f'/content/drive/MyDrive/9417project/models/LSTM_models/model_{grp}_{q_no}.hdf5'
  model = load_model(model_path)


  predictions = model.predict(valid_data)
  prediction_df.loc[valid_users, q_no-1] = predictions.flatten()




In [25]:
# Ensemble model result
LSTM_file_path ='/content/drive/MyDrive/9417project/predictions/LSTM_pred.npy'
SVM_file_path ='/content/drive/MyDrive/9417project/predictions/SVM_pred.npy'
np.save(LSTM_file_path,prediction_df)
LSTM_pred = np.load(LSTM_file_path,allow_pickle=True)
SVM_pred = np.load(SVM_file_path,allow_pickle=True)


ensemble = 0.7 * SVM_pred + 0.3 * LSTM_pred



In [None]:
# # Print the F1 score for each question
# for i, score in enumerate(f1_scores, start=1):
#     print(f"Question {i} : F1 {score:.4f}")

# # Calculate and print the average F1 score
# average_f1 = sum(f1_scores) / len(f1_scores)
# print(f"\nAverage F1 score: {average_f1:.4f}")

In [27]:
import tensorflow as tf

max_score = 0; best_threshold = 0

# Create a dataframe of required size:
# (no: of users in validation set x no: of questions) initialized to zero values
# to store true values of the label `correct`.
true_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)
for i in range(18):
    # Get the true labels.
    tmp = labels.loc[labels.q == i+1].set_index('session').loc[VALID_USER_LIST]
    true_df[i] = tmp.correct.values

# Loop through threshold values from 0.4 to 0.8 and select the threshold with
# the highest `F1 score`.
for threshold in np.arange(0.4,0.8,0.01):
    metric = tfa.metrics.F1Score(num_classes=2,average="macro",threshold=threshold)
    y_true = tf.one_hot(true_df.values.reshape((-1)), depth=2)
    y_pred = tf.one_hot((ensemble.reshape((-1))>threshold).astype('int'), depth=2)
    metric.update_state(y_true, y_pred)
    f1_score = metric.result().numpy()
    if f1_score > max_score:
        max_score = f1_score
        best_threshold = threshold


print("Best threshold ", best_threshold, "\tF1 score ", max_score)

Best threshold  0.5100000000000001 	F1 score  0.61750203
