# Thesis model

## Notes:
### 1. The commented-out lines in the 'Read csv files' cell switch the model to a different dataset, and the commented-out lines in the 'Train model' cell enable validation and/or testing.

### 2. The global variables in the 'Set Global variables' cell define the embedding size, whether Word2Vec or FastText embeddings are used, whether training uses the 70% split, and the other parameters used to train the thesis model.

## How to run the model:

### If you want to run the embeddings of 300 dimensions you need to set:
### 1. 'emb_size=300'
### 2. 'w2v_emb_size=300'
### 3. 'filters=300' in Conv1D layer. 
### Also uncomment the corresponding 300-dimensional embedding file (`emb_file`) in the 'Read csv files' cell.

## The example below trains the model on the 'assistment_2009_corrected_3lines' dataset with Word2Vec embeddings of 100 dimensions. After training, execute the 'Evaluate model' cell to evaluate the model on the test set.

## If you need to measure the performance on a validation set, you need to:
### 1. set 'train_all=False'
### 2. change the fold number in 'Read csv files' cell to read the corresponding split.
### 3. Uncomment the lines 37-39 and 49-77 in 'Train model' cell.
### 4. Execute the cells 'Calculate mean validation AUC', 'Calculate mean validation accuracy' and 'Calculate mean validation loss' to measure its performance in AUC, accuracy and loss, respectively.

## Import libraries

In [None]:
import os
print('The current directory is: ', os.getcwd())
import numpy as np
print('The numpy version is: ', np.version.version)
import pandas as pd
print('The pandas version is: ', pd.__version__)
!pip install tensorflow==2.1.0
!pip install tensorflow-gpu==2.1.0
# %tensorflow_version 2.1.0
import tensorflow as tf
print('The tensorflow version is: ', tf.__version__)
import matplotlib.pyplot as plt
from scipy.linalg import toeplitz


from tensorflow.keras import backend as K
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Embedding, Concatenate, Activation, Dense, \
                                    Flatten, LSTM, SpatialDropout1D, Dropout, GRU, Bidirectional, \
                                    Lambda, Multiply, Permute, RepeatVector, Masking, TimeDistributed, \
                                    Attention, AdditiveAttention, Conv1D, MaxPool1D, AveragePooling1D, GlobalAveragePooling1D, \
                                    BatchNormalization, Activation, LocallyConnected1D
                                                           
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import AUC, binary_crossentropy
from tensorflow.keras.initializers import Constant, RandomUniform
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler

# GPU information
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.test.gpu_device_name())

## Set Global variables

In [None]:
# Embedding source switch read by thesis_model(): True initialises the skill
# embedding layer from the matrix loaded in read_data(); False initialises it
# from a random uniform distribution instead.
# USE_W2V = False
USE_W2V = True

# Data split switch read by read_data(): True selects the "..._train.csv"
# training file, False selects the per-fold "..._train1.csv" training file.
# train_all = False
train_all = True

emb_size = 100  # embedding width used by thesis_model() when USE_W2V is False
w2v_emb_size = 100  # embedding width used when USE_W2V is True; should match the column count of the embedding CSV
L = 50  # history length (window size); alternative values left by the author: 10, 20
max_epochs = 20  # number of training epochs passed to model.fit
beta = 1e-3  # initial learning rate (Adam and scheduler())
batch_size = 32  # mini-batch size for model.fit and model.evaluate

## Create progress bar to monitor the reading of files

In [None]:
def printProgressBar(iteration, total, prefix = '', suffix = '',
                     decimals = 1, length = 100, fill = '\u2588', printEnd = ""):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str); defaults to the
                                  full-block character U+2588, written as an
                                  escape so it survives re-encoding of the file
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)

    A ``total`` of 0 is rendered as a finished (100%) bar instead of raising
    ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        # Integer floor division keeps the bar length free of float rounding.
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: show the bar as complete.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The leading '\r' redraws the bar in place on the current line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd, flush=True)
    # Print New Line on Complete
    if iteration == total:
        print()

## Read 3 lines format of csv files

In [None]:
def read_file_3lines(file, start_user):
    """Parse a "3 lines per student" CSV file into a flat interaction table.

    Every student occupies three consecutive lines:
      1. the number of responses of that student,
      2. the comma-separated skill ids, one per response,
      3. the comma-separated correctness values, one per response.

    Students are numbered consecutively, the first one getting ``start_user``.
    Blank lines at the end of the file and empty tokens (e.g. caused by a
    trailing comma) are ignored.

    Args:
        file: Path of the file to read.
        start_user: Integer id assigned to the first student in the file.

    Returns:
        tuple: ``(data, num_unique_users)``. ``data`` is an integer array of
        shape ``(n, 3)`` with columns ``user_id, skill_id, correct``; only
        rows whose correctness value is exactly 0 or 1 are kept.
        ``num_unique_users`` is the number of students with at least one
        declared response, counted before that filtering.

    Raises:
        ValueError: If the file is empty, a response count or token is not an
            integer, or a student's skill/correctness lines do not contain
            exactly the declared number of values.
    """
    with open(file, "r") as f:
        lines = [line.strip() for line in f]

    # A trailing newline (or several) must not be parsed as a student header.
    while lines and not lines[-1]:
        lines.pop()
    if not lines:
        raise ValueError("{}: file is empty".format(file))

    user_ids = []
    skill_ids = []
    correct = []
    for record, offset in enumerate(range(0, len(lines), 3)):
        # Pad so that a truncated last record is reported by the count check
        # below rather than by an unpacking error.
        header, skills_line, correct_line = (lines[offset:offset + 3] + ["", ""])[:3]
        try:
            num_responses = int(header)
        except ValueError as err:
            raise ValueError(
                "{}: line {}: expected a response count, got {!r}".format(
                    file, offset + 1, header)) from err

        skills = [int(tok) for tok in skills_line.split(",") if tok.strip()]
        answers = [int(tok) for tok in correct_line.split(",") if tok.strip()]
        if len(skills) != num_responses or len(answers) != num_responses:
            raise ValueError(
                "{}: student starting at line {} declares {} responses but has "
                "{} skill ids and {} correctness values".format(
                    file, offset + 1, num_responses, len(skills), len(answers)))

        user_ids.extend([start_user + record] * num_responses)
        skill_ids.extend(skills)
        correct.extend(answers)

    user_ids = np.array(user_ids, dtype=int)
    skill_ids = np.array(skill_ids, dtype=int)
    correct = np.array(correct, dtype=int)
    num_unique_users = np.unique(user_ids).shape[0]

    # Keep only binary outcomes; any other correctness code is dropped.
    keep = np.isin(correct, (0, 1))
    data = np.column_stack((user_ids[keep], skill_ids[keep], correct[keep]))
    return data, num_unique_users

## Read csv files

In [None]:
def read_data():
    """Load the train/validation/test splits, skill names and skill embeddings.

    The dataset is chosen by editing this function: exactly one group of
    ``train_file``/``valid_file``/``test_file`` assignments, one
    ``sknames_file`` and one ``emb_file`` should be left uncommented. The
    module-level flag ``train_all`` picks which branch of file names is used.

    Returns:
        tuple: ``(data_train, data_test, data_valid, embeddings, skill_names)``
        -- note that the test split comes *before* the validation split.
        Each ``data_*`` is the array returned by ``read_file_3lines``;
        ``embeddings`` is the embedding CSV content with an all-zero row
        prepended; ``skill_names`` is the ``.values`` array of the
        skill-name CSV.
    """
    # use_all_train
    # In the active lines the two branches differ only in train_file; the
    # validation and test files are the same in both.
    if train_all:
        print('I use all the training: ')
        train_file = "./data/assistment_2009_corrected_3lines/assistment_2009_corrected_train.csv"
        valid_file = "./data/assistment_2009_corrected_3lines/assistment_2009_corrected_valid1.csv"
        test_file = "./data/assistment_2009_corrected_3lines/assistment_2009_corrected_test.csv"
        # train_file = "./data/assist2009_updated/assist2009_updated_train.csv"
        # valid_file = "./data/assist2009_updated/assist2009_updated_valid0.csv"
        # test_file = "./data/assist2009_updated/assist2009_updated_test.csv"
        # NOTE(review): the directory below is spelled "fsaid1tof3" while the
        # skill-name/embedding paths use "fsaif1tof3" -- confirm which exists.
        # train_file = "./data/fsaid1tof3/fsaif1tof3_train.csv"
        # valid_file = "./data/fsaid1tof3/fsaif1tof3_valid1.csv"
        # test_file = "./data/fsaid1tof3/fsaif1tof3_test.csv"
        # train_file = "./data/assistment2012_2013/assistment2012_13_train.csv"
        # valid_file = "./data/assistment2012_2013/assistment2012_13_valid1.csv"
        # test_file = "./data/assistment2012_2013/assistment2012_13_test.csv"
    else:
        # Per-fold training: change the fold number in the file names below to
        # read a different train/validation split.
        train_file = "./data/assistment_2009_corrected_3lines/assistment_2009_corrected_train1.csv"
        valid_file = "./data/assistment_2009_corrected_3lines/assistment_2009_corrected_valid1.csv"
        test_file = "./data/assistment_2009_corrected_3lines/assistment_2009_corrected_test.csv"
        # train_file = "./data/assist2009_updated/assist2009_updated_train4.csv"
        # valid_file = "./data/assist2009_updated/assist2009_updated_valid4.csv"
        # test_file = "./data/assist2009_updated/assist2009_updated_test.csv"
        # train_file = "./data/fsaid1tof3/fsaif1tof3_train1.csv"
        # valid_file = "./data/fsaid1tof3/fsaif1tof3_valid1.csv"
        # test_file = "./data/fsaid1tof3/fsaif1tof3_test.csv"
        # train_file = "./data/assistment2012_2013/assistment2012_13_train1.csv"
        # valid_file = "./data/assistment2012_2013/assistment2012_13_valid1.csv"
        # test_file = "./data/assistment2012_2013/assistment2012_13_test.csv"

    # Read skill names
    sknames_file = "./data/assistment_2009_corrected_3lines/skill_names_corrected.csv"
    # sknames_file = "./data/assist2009_updated/skill_names_updated.csv"
    # sknames_file = "./data/fsaif1tof3/fsaif1tof3_skill_name_question_id.csv"
    # sknames_file = "./data/assistment2012_2013/skill_names_12_13.csv"

    skill_names = pd.read_csv(sknames_file, header=None).values
    
    # Read embedding data
    # The file chosen here should have as many columns as the embedding width
    # configured in the global settings (100 or 300).
    emb_file = './embeddings/assistment_2009_corrected_3lines/skill_name_embeddings_corrected_100d.csv'
    # emb_file = './embeddings/assistment_2009_corrected_3lines/skill_name_embeddings_corrected_300d.csv'
    # emb_file = './embeddings/assistment_2009_corrected_3lines/Assistment2009_corrected_skname_embeddings_FastText.csv'

    # emb_file = './embeddings/assist2009_updated/skill_name_embeddings_updated100d.csv'
    # emb_file = './embeddings/assist2009_updated/skill_name_embeddings_updated300d.csv'
    # emb_file = './embeddings/assist2009_updated/Assist2009_updated_skname_embeddings_FastText.csv'
    
    # emb_file = './embeddings/fsaif1tof3/fsaif1tof3_embeddings_100d.csv'
    # emb_file = './embeddings/fsaif1tof3/fsaif1tof3_embeddings_300d.csv'
    # emb_file = './embeddings/fsaif1tof3/fsaif1tof3_skname_embeddings_FastText.csv'
    
    # emb_file = "./embeddings/assistment2012_2013/skill_name_embeddings_12_13_100d.csv"
    # emb_file = "./embeddings/assistment2012_2013/skill_name_embeddings_12_13_300d.csv"
    # emb_file = "./embeddings/assistment2012_2013/Assistment2012_13_skname_embeddings_FastText.csv"
    
    embeddings = pd.read_csv(emb_file, header=None)
    
    # Add a zero row at the beginning
    # NOTE: this emb_size is a local variable; it shadows the module-level
    # emb_size inside this function and does not modify it.
    emb_size = embeddings.shape[1]
    print('emb size: ', emb_size)
    # Prepending one all-zero row shifts every embedding down by one, so row 0
    # is the zero vector (presumably the padding id -- confirm skill ids start at 1).
    embeddings = np.vstack((np.zeros([1,emb_size]), embeddings)) # if i put 1 i add 0 in the first line
    
    """
    Read Train, Validation, Test File
    """
    # User ids continue across the three files: each split starts numbering
    # where the previous one stopped, so ids never collide between splits.
    start_user = 1
    data_train, N_train = read_file_3lines(train_file, start_user)
    start_user += N_train
    data_valid, N_valid = read_file_3lines(valid_file, start_user)
    start_user += N_valid
    data_test, N_test = read_file_3lines(test_file, start_user)  # N_test is not used afterwards
    return data_train, data_test, data_valid, embeddings, skill_names

In [None]:
# Load all splits, the embedding matrix and the skill names, then echo them.
data_train, data_test, data_valid, embeddings, skill_names = read_data()
print('train:',data_train, '\ntest:',data_test, '\nvalid:', data_valid)
print('train:', data_train.shape, 'test:', data_test.shape, 'valid:', data_valid.shape)
print('\nembeddings:', embeddings)
print('shape after adding 0 in the first line: ', embeddings.shape)

# Column 1 of every split holds the skill id; pool the three splits and
# deduplicate to obtain the set of skills present anywhere in the data.
skill_ids = np.unique(
    np.concatenate([split[:, 1] for split in (data_train, data_valid, data_test)])
)
print('The unique skill ids from all train, test, valid are: ', skill_ids) 
num_skills = skill_ids.shape[0]

# Column 0 holds the user id; the unique ids of a split are its students.
train_user_ids = np.unique(data_train[:, 0])
valid_user_ids = np.unique(data_valid[:, 0])
test_user_ids = np.unique(data_test[:, 0])
N_train = train_user_ids.shape[0]
N_valid = valid_user_ids.shape[0]
N_test = test_user_ids.shape[0]
num_students = N_train + N_test + N_valid

print('Number of skills: {}'.format(num_skills))
print('Number of train students: {}'.format(N_train))
print('Number of validation students: {}'.format(N_valid))
print('Number of test students: {}'.format(N_test))
print('(total students: {})'.format(num_students))

## Convert data to desired format

In [None]:
def gen_inputs_targets(data, user_ids, N, prefix):
    """Build sliding-window model inputs and targets for a set of students.

    For every interaction of a student except the first one, one sample is
    produced from a window of length ``L`` (module-level history length):

    * ``x`` columns ``0 .. L-1``: the current skill id followed by the
      previous ``L-1`` skill ids, most recent first, zero-padded.
    * ``x`` columns ``L .. 2L-2``: the previous ``L-1`` responses, oldest
      first, zero-padded.
    * ``t``: the response to the current interaction.

    Args:
        data: Array with columns ``user_id, skill_id, correct``.
        user_ids: Iterable of the user ids to process, in output order.
        N: Number of users, used only for the progress bar.
        prefix: Label shown in front of the progress bar.

    Returns:
        tuple: ``(x, t)`` with shapes ``(n_samples, 2*L-1)`` and
        ``(n_samples, 1)``. Both are empty arrays (zero rows) when
        ``user_ids`` is empty.
    """
    # Skip the progress bar for an empty split so that N == 0 cannot trigger
    # a division by zero inside the bar's percentage computation.
    show_progress = N > 0
    if show_progress:
        printProgressBar(0, N, prefix = prefix, suffix = 'Complete', length = 50)

    # First row of each Toeplitz matrix: zeros, i.e. "no history yet".
    # A 1-D vector is passed because toeplitz() flattens its arguments anyway.
    padding = np.zeros(L)

    # Per-student chunks are collected and stacked once at the end; stacking
    # inside the loop would re-copy all previous samples on every student.
    inputs = []
    targets = []
    for i, student_id in enumerate(user_ids):
        # Make an array with all the data for this student
        student_data = data[data[:,0]==student_id]
        # Row k of each matrix is [value_k, value_{k-1}, ..., value_{k-L+1}].
        skill_hist = toeplitz(student_data[:,1], padding)
        responses_hist = toeplitz(student_data[:,2], padding)
        # Flipping the responses puts the current response in the last
        # column, where it is split off as the target.
        windows = np.hstack((skill_hist, np.fliplr(responses_hist)))
        # Row 0 is dropped: the first interaction has no history to learn from.
        inputs.append(windows[1:,0:2*L-1])
        targets.append(windows[1:,2*L-1].reshape([-1,1]))
        if show_progress:
            printProgressBar(i+1, N, prefix = prefix, suffix = 'Complete', length = 50)

    if not inputs:
        return np.empty((0, 2*L-1)), np.empty((0, 1))
    return np.vstack(inputs), np.vstack(targets)

# Turn each split into (window inputs, next-response targets) and report shapes.
x_train, t_train = gen_inputs_targets(
    data_train, train_user_ids, N_train, prefix='Train set:')
print('x_train:', x_train.shape, 'train responses:', t_train.shape)

x_valid, t_valid = gen_inputs_targets(
    data_valid, valid_user_ids, N_valid, prefix='Validation set:')
print('x_valid:', x_valid.shape, 'valid responses:', t_valid.shape)

x_test, t_test = gen_inputs_targets(
    data_test, test_user_ids, N_test, prefix='Test set:')
print('x_test:', x_test.shape, 'test responses:', t_test.shape)

## Create thesis model

In [None]:
def thesis_model():
    """Build the (uncompiled) thesis network.

    Architecture, in order: two integer inputs (``L`` skill ids and ``L-1``
    past responses) -> one embedding per input with spatial dropout -> a
    Conv1D over the skill embeddings -> concatenation with the response
    embeddings along the time axis -> bidirectional GRU -> dropout -> two
    dense ReLU layers -> one sigmoid output unit.

    Reads the module-level settings ``L``, ``USE_W2V``, ``emb_size``,
    ``w2v_emb_size`` and the ``embeddings`` matrix.

    Returns:
        tf.keras.Model: Model taking ``[q_ids, hist_ids]`` and producing one
        probability per sample.

    Raises:
        ValueError: If ``USE_W2V`` is set and the embedding matrix does not
            have ``w2v_emb_size`` columns.
    """
    # One embedding width drives both embeddings and the convolution.
    emb_dim = w2v_emb_size if USE_W2V else emb_size
    if USE_W2V and embeddings.shape[1] != emb_dim:
        raise ValueError(
            "embedding matrix has {} columns but w2v_emb_size is {}".format(
                embeddings.shape[1], emb_dim))
    # Initial weights are scaled down by (embedding width * history length).
    init_scale = emb_dim*L

    # Inputs
    q_ids = Input(shape=[L], dtype=tf.int32)
    hist_ids = Input(shape=[L-1], dtype=tf.int32)

    if USE_W2V:
        print('I use embeddings:')
    else:
        print('I do not use embeddings:')

    # Response history: two possible ids (0 and 1), always randomly initialised.
    initial_h_emb = RandomUniform(minval=-1/init_scale, maxval=1/init_scale)
    hist = Embedding(2, emb_dim,
                embeddings_initializer=initial_h_emb, trainable=True)(hist_ids)
    hist = SpatialDropout1D(0.2)(hist)

    # Skills: start from the loaded embedding matrix or from random values;
    # the layer is trainable in both cases.
    if USE_W2V:
        initial_q_emb = Constant(embeddings/init_scale)
        print(embeddings)
        print(embeddings/init_scale)
    else:
        initial_q_emb = RandomUniform(minval=-1/init_scale, maxval=1/init_scale)
    q = Embedding(embeddings.shape[0], emb_dim,
                embeddings_initializer=initial_q_emb, trainable=True)(q_ids)
    q = SpatialDropout1D(0.2)(q)

    # The number of filters must equal the embedding width, otherwise the
    # concatenation along axis 1 below fails on the feature axis. It is
    # derived here instead of being hard-coded (100), so switching to 300-d
    # embeddings needs no manual change to this layer.
    q_conv = Conv1D(filters=emb_dim, kernel_size=3, strides=1, activation="relu")(q)

    # Stack convolved skills and response embeddings along the time axis.
    merged = Concatenate(axis=1)([q_conv, hist])

    x_lstm = Bidirectional(GRU(units=64, return_sequences=False))(merged)

    x_dropout = Dropout(0.2)(x_lstm)

    d1 = Dense(50, activation='relu')(x_dropout)

    d2 = Dense(25, activation='relu')(d1)

    x = Dense(1, activation='sigmoid')(d2)

    model = Model(inputs=[q_ids, hist_ids], outputs=x)

    # visualize the model
    # Drawing needs the optional pydot/graphviz packages; the diagram is a
    # convenience, so a missing dependency must not abort model creation.
    try:
        tf.keras.utils.plot_model(
            model,
            to_file="./thesis_model.png",
            show_shapes=True,
            show_layer_names=True,
            rankdir="TB",
            expand_nested=False,
            dpi=96,
        )
    except ImportError as err:
        print('Skipping model plot:', err)
    return model

## Train model

In [None]:
# Learning-rate schedule: hold `beta` during the first ten epochs (0-9), then
# shrink it exponentially, by a factor of exp(-0.1) per additional epoch.
def scheduler(epoch):
    """Return the learning rate, as a Python float, to use for `epoch`."""
    if epoch >= 10:
        return float(beta * tf.math.exp(0.1 * (10 - epoch)))
    return float(beta)

callback = LearningRateScheduler(scheduler)  # sets the learning rate from scheduler() each epoch
# Baseline: fraction of test targets equal to 1, i.e. the accuracy obtained by
# always predicting a correct response; printed for comparison only.
acc_test_base = np.sum(t_test==1)/t_test.shape[0]
print('Baseline test accuracy = {}'.format(acc_test_base))
print("==================================================")
print('L = {}, emb_size = {}'.format(L, emb_size))  # NOTE: prints emb_size even when USE_W2V selects w2v_emb_size
# Build the network and print its layer summary.
model = thesis_model()
model.summary()
# Compile with Adam at the initial rate beta; accuracy and AUC are tracked.
model.compile(optimizer=Adam(learning_rate=beta),
              loss= binary_crossentropy,
              metrics=['accuracy',
                       AUC()
                      ]
             )
# Checkpoint callback; it only takes effect if added to `callbacks` below.
model_checkpoint_callback = ModelCheckpoint(filepath="./best_model-{epoch:02d}-{loss:.4f}.h5",
                                            monitor="val_auc",  # only available when validation_data is passed to fit
                                            verbose=1,
                                            save_best_only=True,
                                            mode="max",
                                            save_freq="epoch"
                                            )
# Fit: the first L columns of x are skill ids, the remaining L-1 are responses.
history = model.fit([x_train[:,:L].astype(int), x_train[:,L:].astype(int)],
                    t_train,
                    # validation_data=(
                    #     [x_valid[:,:L].astype(int), x_valid[:,L:].astype(int)],
                    #     t_valid),
                    epochs = max_epochs,
                    batch_size=batch_size,
                    verbose=1,
                    callbacks=[callback,
                              #  ReduceLROnPlateau(), 
                              #  model_checkpoint_callback
                               ] 
                    )

# def get_key(keystart, list):
#     for k in list:
#         if k[:len(keystart)] == keystart:
#             return k
#     return None

# keys = history.history.keys()
# key_val_acc = get_key('val_acc', keys)
# key_val_auc = get_key('val_auc', keys)
# key_acc = get_key('acc', keys)
# key_auc = get_key('auc', keys)
# plt.figure(figsize=(9,6))
# ep = np.arange(1,max_epochs+1)
# plt.plot(ep, history.history[key_val_auc], 'r')
# plt.xticks(np.arange(0,max_epochs+1,5, dtype=np.int))
# plt.plot(ep, history.history[key_auc], 'b')
# plt.plot(ep, history.history[key_val_acc], 'r:')
# plt.plot(ep, history.history[key_acc], 'b:')
# plt.legend(['val.auc', 'auc', 'val.acc', 'acc'])
# plt.grid(b=True)
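# if USE_W2V:  # NOTE(review): num_hidden (used in both title strings below) is not defined in the visible cells; define it or drop the layers field before uncommenting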
#     title="L={}, embsize={}, w2v={}, layers={}".format(
#             L, w2v_emb_size, True, num_hidden)
# else:
#     title="L={}, embsize={}, w2v={}, layers={}".format(
#             L, emb_size, False, num_hidden)
# # plt.title(title)
# plt.show(block=False)
# # plt.savefig(cdir + "./image_1.png")

## Evaluate model

In [None]:
# Score the trained model on the test windows (skill ids first, responses second).
model.evaluate(
    x=[x_test[:, :L].astype(int), x_test[:, L:].astype(int)],
    y=t_test,
    batch_size=batch_size,
    verbose=1,
)

## Calculate mean validation AUC

In [None]:
# Mean of the per-epoch validation AUC recorded by model.fit.
keys = history.history.keys()
print(keys)
# Keras may append a counter to metric names (e.g. 'val_auc_1'), so the key is
# matched by prefix. The lookup is done inline so this cell does not depend on
# the get_key() helper, which only exists in commented-out code.
key_val_auc = next((k for k in keys if k.startswith('val_auc')), None)
print(key_val_auc)
if key_val_auc is None:
    raise KeyError("No validation AUC in the training history; "
                   "pass validation_data to model.fit first.")
X = history.history[key_val_auc]
print(X)
mean_val_auc = np.mean(X)
mean_val_auc

## Calculate mean validation accuracy

In [None]:
# Mean of the per-epoch validation accuracy recorded by model.fit.
keys = history.history.keys()
print(keys)
# Matched by prefix and looked up inline so this cell does not depend on the
# get_key() helper, which only exists in commented-out code.
key_val_acc = next((k for k in keys if k.startswith('val_accuracy')), None)
print(key_val_acc)
if key_val_acc is None:
    raise KeyError("No validation accuracy in the training history; "
                   "pass validation_data to model.fit first.")
X = history.history[key_val_acc]
print(X)
mean_val_acc = np.mean(X)
mean_val_acc

## Calculate mean validation loss

In [None]:
# Mean of the per-epoch validation loss recorded by model.fit.
keys = history.history.keys()
print(keys)
# Matched by prefix and looked up inline so this cell does not depend on the
# get_key() helper, which only exists in commented-out code.
key_val_loss = next((k for k in keys if k.startswith('val_loss')), None)
print(key_val_loss)
if key_val_loss is None:
    raise KeyError("No validation loss in the training history; "
                   "pass validation_data to model.fit first.")
X = history.history[key_val_loss]
print(X)
mean_val_loss = np.mean(X)
mean_val_loss