In [None]:
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import tensorflow_text as tf_text  # name is unused below; presumably imported for its side effects - keep
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LeakyReLU, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Local directories handed to hub.KerasLayer: one for the text-preprocessing
# model, one for the encoder (presumably RoBERTa base, going by the folder names).
preprocess_path, path = "D:/roberta/roberta_en_cased_preprocess_1", "D:/roberta/base"

# Wrap both models as Keras layers; the preprocessing layer is created first,
# then the encoder, and both are reused by the cells below.
preprocess_model, encoder = hub.KerasLayer(preprocess_path), hub.KerasLayer(path)

In [None]:
def get_embedding(questions, preprocess_model, encoder, batch_size = None):
    """Return the encoder's ``"sequence_output"`` for a collection of texts.

    A small functional Keras model (string input -> ``preprocess_model`` ->
    ``encoder``) is built on every call and applied eagerly to ``questions``.

    Parameters
    ----------
    questions : sequence of str
        Texts to embed (list, NumPy array, pandas Series, ...). Converted to a
        string tensor with ``tf.constant``.
    preprocess_model : Keras-compatible layer
        Called on the string input; its result is passed to ``encoder``.
    encoder : Keras-compatible layer
        Called on the preprocessed inputs; must return a mapping that has a
        ``"sequence_output"`` entry.
    batch_size : int, optional
        ``None`` (default) runs all texts through the model in one forward
        pass, exactly as before. A positive integer processes the texts in
        consecutive chunks of at most that many rows and concatenates the
        results along axis 0, which bounds peak memory on large inputs.

    Returns
    -------
    tf.Tensor
        The ``"sequence_output"`` tensor, one leading row per input text, in
        input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is given but is not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError("batch_size must be a positive integer or None")

    # string input -> preprocessing -> encoder, wired as a functional model
    text_input = tf.keras.layers.Input(shape = (), dtype = tf.string)
    encoder_inputs = preprocess_model(text_input)
    outputs = encoder(encoder_inputs)["sequence_output"]
    embedding_model = tf.keras.Model(text_input, outputs)

    texts = tf.constant(questions)

    # Default path: identical to the original single forward pass.
    if batch_size is None:
        return embedding_model(texts)

    total = int(texts.shape[0])
    if batch_size >= total:
        return embedding_model(texts)

    # Chunked path. NOTE(review): values should match the single-pass result up
    # to floating-point round-off - confirm if bit-exact equality matters.
    chunks = [
        embedding_model(texts[start:start + batch_size])
        for start in range(0, total, batch_size)
    ]
    return tf.concat(chunks, axis = 0)

In [None]:
# Read the question table from the Excel workbook; the bare name on the last
# line makes the notebook render the resulting DataFrame as the cell output.
questions = pd.read_excel(
    io = 'preprocessing_result/preprocessing_result-fasttext.xlsx',
)
questions

In [None]:
# Model input: the preprocessed question text.
X = questions['Preprocessed_Question']

# Cognitive-level label -> integer class id (six classes, 0..5).
cognitive_level = {"Knowledge": 0, "Comprehension": 1, "Application": 2, "Analysis": 3, "Synthesis": 4, "Evaluation": 5}

# Encode the labels on a new Series and assign it back afterwards. The previous
# `questions["BT LEVEL"].replace(..., inplace = True)` is a chained in-place
# call on a column selection: it warns in pandas 2.x and leaves `questions`
# unchanged under Copy-on-Write. Re-running this cell is safe because values
# that are already encoded match none of the string keys.
encoded_levels = questions["BT LEVEL"].replace(cognitive_level)

# Fail loudly on anything the mapping does not cover (typos, stray whitespace,
# missing values) instead of letting it surface later as a confusing error.
unknown_levels = encoded_levels[~encoded_levels.isin(list(cognitive_level.values()))].unique()
if len(unknown_levels) > 0:
    raise ValueError(f"Unmapped values in 'BT LEVEL': {list(unknown_levels)}")

# Explicit integer dtype so the labels are usable as class ids downstream.
questions["BT LEVEL"] = encoded_levels.astype("int64")
y = questions['BT LEVEL']

In [None]:
# Stratified 10-fold cross-validation of a small 1-D CNN classifier trained on
# embeddings produced by `get_embedding` with the `preprocess_model` / `encoder`
# layers loaded in the earlier cell (they are reused here, not loaded again).
cross_validated = StratifiedKFold(n_splits = 10, random_state = 0, shuffle = True)

seed = 0
num_classes = 6                    # width of the softmax layer and of the one-hot labels
log_path = 'log/training-cv.log'   # rewritten every fold (append = False): only the last fold's log survives

# Metrics at each fold's best epoch (best = highest validation accuracy).
train_accuracy = []
train_f1_score = []

test_accuracy = []
test_f1_score = []

# Make sure the log directory exists before CSVLogger opens the file.
os.makedirs(os.path.dirname(log_path), exist_ok = True)

# Loop-invariant; kept from the original. Hash randomisation of the running
# interpreter is fixed at start-up, so this only reaches child processes.
os.environ['PYTHONHASHSEED'] = str(seed)


for fold, (train_index, test_index) in enumerate(cross_validated.split(X, y), start = 1):

    # Start every fold from a clean Keras state and the same RNG seeds.
    tf.keras.backend.clear_session()
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    print("Fold :", fold)
    print("========================================")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]

    # Fix the one-hot width explicitly: without num_classes it is inferred from
    # the largest label present in the split and may not match the output layer.
    y_train = to_categorical(y.iloc[train_index], num_classes = num_classes)
    y_test = to_categorical(y.iloc[test_index], num_classes = num_classes)

    X_train_roberta_base = get_embedding(X_train, preprocess_model, encoder)
    X_test_roberta_base = get_embedding(X_test, preprocess_model, encoder)

    training_logger = CSVLogger(log_path, separator = ',', append = False)

    # defining the model
    model = Sequential([
        # input layer - assumes the encoder emits 128 tokens x 768 features per text; TODO confirm
        Input(shape = (128, 768), name = 'embedding'),
        # CNN-1
        Conv1D(128, 3, activation = LeakyReLU(alpha = 0.2)),
        # Pooling layer-1
        MaxPooling1D(),
        # CNN-2
        Conv1D(64, 3, activation = 'relu'),
        # Pooling layer-2
        GlobalMaxPooling1D(),
        # Dense layer
        Dense(32, activation = 'tanh'),
        # dropout layer
        Dropout(0.3),
        # output layer
        Dense(num_classes, activation = 'softmax'),
    ])

    optimizer = tf.keras.optimizers.RMSprop()
    model.compile(optimizer = optimizer, loss = 'categorical_crossentropy',
                  metrics = ['accuracy', tfa.metrics.F1Score(num_classes, 'weighted')])

    # fit the model; the held-out fold doubles as validation data
    model.fit(X_train_roberta_base, y_train, epochs = 100, batch_size = 8,
              validation_data = (X_test_roberta_base, y_test),
              callbacks = [training_logger], verbose = 0)

    print("Prediction: ")
    print("=======================================")

    log_data = pd.read_csv(log_path, sep = ',', engine = 'python')

    # Series.idxmax() gives a scalar row label. The previous
    # DataFrame[['val_accuracy']].idxmax() returned a one-element Series, so
    # every value printed or stored below was a Series instead of a number.
    # NOTE(review): the best epoch is picked on the same fold that is reported
    # as the test score, so the averages below are optimistic.
    best_epoch = log_data['val_accuracy'].idxmax()
    best_row = log_data.loc[best_epoch]

    print("Best Epoch: ", best_epoch + 1)
    print("Best Test Accuracy: ", best_row['val_accuracy'])
    print("Best Test F1-score: ", best_row['val_f1_score'])

    train_accuracy.append(best_row['accuracy'])
    train_f1_score.append(best_row['f1_score'])

    test_accuracy.append(best_row['val_accuracy'])
    test_f1_score.append(best_row['val_f1_score'])

    # Release this fold's model before the next one is built.
    tf.keras.backend.clear_session()
    del model

print()
print()
print("===========================================")
print("Training")
print("===========================================")
print("Average Accuracy: ", np.mean(train_accuracy))
print("Average F1 score: ", np.mean(train_f1_score))

print()
print()
print("===========================================")
print("Testing")
print("===========================================")
print("Average Accuracy: ", np.mean(test_accuracy))
print("Average F1 score: ", np.mean(test_f1_score))