In [1]:
import os
import random
import numpy as np
import tensorflow as tf
# Fix every random number generator used in this notebook (Python hashing,
# `random`, NumPy and TensorFlow) to the same seed.
seed = 1
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [2]:
import gensim
import pandas as pd
import gensim.downloader as api
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
import tensorflow_addons as tfa
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D

In [4]:
# Show the names of the pre-trained embedding models offered by gensim's downloader.
available_models = api.info()['models']
print(list(available_models))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [5]:
def convert_to_sequence(questions):
    """Split the questions into train/test sets and encode them for the network.

    The caller's DataFrame is left untouched: the label column is encoded on a
    new Series instead of being replaced in place.

    Parameters
    ----------
    questions : pandas.DataFrame
        Must provide a 'Preprocessed_Question' column with the text and a
        'BT LEVEL' column holding either one of the six level names listed in
        `cognitive_level` below or an already encoded integer code (0-5).

    Returns
    -------
    tuple
        ``(padded_X_train_seq, y_train, padded_X_test_seq, y_test, vocab, max_que_len)``
        where the ``padded_*`` arrays are post-padded integer sequences, the
        ``y_*`` arrays are one-hot encoded with six columns, ``vocab`` is the
        tokenizer's word -> index mapping (fitted on the training split only)
        and ``max_que_len`` is the longest training sequence length.

    Raises
    ------
    ValueError
        If 'BT LEVEL' contains a value that is neither a known level name nor
        a valid integer code.
    """
    cognitive_level = {"Knowledge": 0, "Comprehension": 1, "Application": 2,
                       "Analysis": 3, "Synthesis": 4, "Evaluation": 5}
    X = questions['Preprocessed_Question']

    # Encode on a fresh Series. The former chained `replace(..., inplace=True)`
    # mutated the caller's frame and silently does nothing under pandas
    # Copy-on-Write, which would leave the labels as strings.
    # Values that are not level names (e.g. already encoded ints) pass through.
    encoded_levels = questions['BT LEVEL'].map(lambda level: cognitive_level.get(level, level))
    unknown_levels = encoded_levels[~encoded_levels.isin(list(cognitive_level.values()))]
    if len(unknown_levels) > 0:
        raise ValueError(
            f"Unknown 'BT LEVEL' values: {sorted(set(map(str, unknown_levels)))}"
        )
    y = encoded_levels.astype(int).values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10,
                                                        stratify = y, random_state = 1)

    # Fix the one-hot width so both splits always have one column per level,
    # even if the highest level happens to be missing from one of them.
    num_classes = len(cognitive_level)
    y_train = to_categorical(y_train, num_classes = num_classes)
    y_test = to_categorical(y_test, num_classes = num_classes)

    # The tokenizer only ever sees the training text; unseen test words map to '<OOV>'.
    tokenizer = Tokenizer(oov_token ='<OOV>')
    tokenizer.fit_on_texts(X_train)
    vocab = tokenizer.word_index
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad (or truncate, for longer test questions) to the longest training question.
    max_que_len = max(len(x) for x in X_train_seq)
    padded_X_train_seq = pad_sequences(X_train_seq, maxlen = max_que_len, padding = 'post')
    padded_X_test_seq = pad_sequences(X_test_seq, maxlen = max_que_len, padding = 'post')

    return padded_X_train_seq, y_train, padded_X_test_seq, y_test, vocab, max_que_len

In [6]:
def get_embedding(vocab, model_name = 'word2vec-google-news-300', embedding_model = None):
    """Build the embedding matrix for `vocab` from pre-trained word vectors.

    Parameters
    ----------
    vocab : dict
        Mapping word -> integer index (indices are used as row numbers).
    model_name : str, optional
        Name of the gensim-downloader model to load when no `embedding_model`
        is supplied. Defaults to the model that was previously hard-coded.
    embedding_model : optional
        An already loaded vector model exposing `key_to_index`, `vector_size`
        and `model[word]` lookup. Passing one avoids re-loading the vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, vector_size)``. Row ``index`` holds
        the vector of the word with that index; rows of words missing from the
        model (and row 0) stay all-zero. Missing words are printed.
    """
    if embedding_model is None:
        embedding_model = api.load(model_name)

    # Take the width from the model instead of hard-coding 300, so that other
    # pre-trained models can be used without producing a shape mismatch.
    embeddings = np.zeros((len(vocab) + 1, embedding_model.vector_size))
    for word, index in vocab.items():
        if word in embedding_model.key_to_index:
            embeddings[index] = embedding_model[word]
        else:
            # Report out-of-vocabulary words; their rows remain zero vectors.
            print(word)
    return embeddings

In [7]:
def build_model(vocab_size, embedding_len, max_que_len, embeddings):
    """Assemble the (uncompiled) CNN question classifier.

    The network is: a trainable embedding layer initialised from `embeddings`,
    a 1-D convolution (32 filters, kernel size 5, ReLU), global max pooling and
    a 6-unit softmax output layer.
    """
    embedding_layer = Embedding(
        input_dim = vocab_size,
        output_dim = embedding_len,
        weights = [embeddings],
        input_length = max_que_len,
        trainable = True,
    )
    layers = [
        embedding_layer,
        Conv1D(32, 5, activation ='relu'),
        GlobalMaxPooling1D(),
        Dense(6, activation ='softmax'),
    ]
    return Sequential(layers)

In [21]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy, weighted F1 score and a per-class report for `model`.

    `y_test` is expected in one-hot form; both it and the model's predicted
    scores are reduced to class indices with an arg-max over axis 1.
    """
    class_scores = model.predict(X_test)
    y_true = np.argmax(y_test, axis = 1)
    y_pred = np.argmax(class_scores, axis = 1)

    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"F1 score: {f1_score(y_true, y_pred, average = 'weighted')}")
    print(classification_report(y_true, y_pred))

In [9]:
def _plot_metric(log_data, metric, title, ylabel):
    """Draw the `metric` and `val_<metric>` curves of `log_data` on a new figure."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(log_data[metric], label ='train')
    ax.plot(log_data[f'val_{metric}'], label ='test')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend()
    plt.show()


def plot_result(log_data):
    """Plot the training curves stored in a Keras training log.

    Parameters
    ----------
    log_data : pandas.DataFrame (or any mapping of column name -> sequence)
        Must contain the columns 'loss', 'accuracy', 'f1_score' and their
        'val_'-prefixed counterparts, one row per epoch.

    One figure is produced per metric. Every figure is shown explicitly; the
    loss figure used to be the only one without its own `plt.show()`.
    """
    curves = (
        ('loss', 'Model loss', 'Loss'),
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('f1_score', 'Model F1 Score', 'F1 Score'),
    )
    for metric, title, ylabel in curves:
        _plot_metric(log_data, metric, title, ylabel)

In [10]:
# Load the pre-processed question set from the Excel workbook and display it.
questions = pd.read_excel('preprocessing_result/preprocessing_result-w2v.xlsx')
questions

Unnamed: 0,Question,Preprocessed_Question,BT LEVEL
0,"Suppose prices of two goods are constant, expl...",suppose price good constant explain what happe...,Comprehension
1,Explain the concept of price leadership observ...,explain concept price leadership observe condi...,Comprehension
2,Define profit. Briefly explain how accounting ...,define profit briefly explain how account prof...,Comprehension
3,Describe the assumptions of monopolistic compe...,describe assumption monopolistic competitive m...,Comprehension
4,Explain the meaning of the law of diminishing ...,explain mean law diminish marginal return brie...,Comprehension
...,...,...,...
2517,PEST and SWOT are popular strategy tools. Disc...,pest swot popular strategy tool discuss how to...,Application
2518,List the advantages and disadvantages of Publi...,list advantage disadvantage public offer,Knowledge
2519,Show your calculations for all THREE (3) optio...,show calculation option discuss which option p...,Analysis
2520,Currently the product life cycle for Apple iPo...,currently product life cycle apple ipod growth...,Evaluation


In [11]:
# Split the questions 90/10 and convert them to padded sequences plus one-hot labels.
padded_X_train_seq, y_train, padded_X_test_seq, y_test, vocab, max_que_len = convert_to_sequence(questions)

In [12]:
# np.save does not create missing parent folders, so make sure the cache
# directory exists before the (expensive) embedding matrix is built.
os.makedirs('embedding/word2vec', exist_ok = True)
# Build the embedding matrix for the training vocabulary and cache it on disk.
embeddings_w2v = get_embedding(vocab)
np.save('embedding/word2vec/embeddings.npy', embeddings_w2v)

<OOV>
tqm
aov
bjt
nyquist
edman
sdn
bhd
infomediary
bnm
and
phosphoru
treynor
venn
mccg
glutamic
of
maslow
langmuir
blodgett
komugi
nlp
eoq
mlp
finfet
pcr
to
hofstede
mvc
mesophyll
steagall
a
berhad
utar
chemostat
schein
cambodia
radiobutton
mrna
ldh
fermi
markov
mooc
perak
adts
kaseem
airasia
perakian
cvn
junebank
rna
firedup
qlassic
gcb
irr
npv
mirr
aspartic
genbank
mrp
pondy
quasineutral
addie
ehp
alavi
leidner
cec
abap
openerp
sugarcrm
columbiana
spiration
cooperativity
electroosmosis
greimas
vlsi
hexapeptide
kmk
nextseq
schrodinger
chymotrypsin
passivator
dfa
arraylist
bubblesort
quicksort
matrik
trna
rct
putlog
cadbury
dirac
welliver
kampar
radley
blosum
vle
rle
jusco
kinta
ipoh
pseudocode
kahneman
tversky
dictogloss
hyperarid
nonaka
eqrnhl
loreal
spss
neolocal
gaucher
michaeli
faraday
sawaya
browne
laplace
sethour
refseq
solum
sendredirect
linebreeding
huffman
phenylpropanoid
pourbaix
howie
mesocarp
minisatellite
realaudio
oxyperoxidase
luedeking
piret
pullulan
cladogram
eurocur

In [13]:
# Reload the cached embedding matrix from disk.
embeddings_w2v = np.load('embedding/word2vec/embeddings.npy')

In [14]:
# callbacks
# Create the log folder up front so that a fresh checkout does not fail when
# the logger tries to open its file at the start of training.
os.makedirs('log/W2V', exist_ok = True)
# Write the per-epoch metrics to a CSV file, overwriting any previous log.
training_logger = CSVLogger('log/W2V/training.log', separator = ',', append = False)

In [15]:
# Build the CNN on top of the 300-dimensional word2vec embedding matrix.
model = build_model(
    vocab_size = len(vocab) + 1,
    embedding_len = 300,
    max_que_len = max_que_len,
    embeddings = embeddings_w2v,
)
# Compile with accuracy and a weighted F1 score over the six classes as metrics.
model.compile(
    optimizer = 'RMSprop',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy', tfa.metrics.F1Score(num_classes = 6, average = 'weighted')],
)
# Print the layer-by-layer summary of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 44, 300)           1109700   
                                                                 
 conv1d (Conv1D)             (None, 40, 32)            48032     
                                                                 
 global_max_pooling1d (Globa  (None, 32)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 6)                 198       
                                                                 
Total params: 1,157,930
Trainable params: 1,157,930
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train for 50 epochs, logging the metrics of every epoch through the CSV logger.
# NOTE(review): the test split is passed as validation data here, so the
# "val_*" columns of the log are test-set metrics.
model.fit(
    padded_X_train_seq,
    y_train,
    epochs = 50,
    batch_size = 8,
    validation_data = (padded_X_test_seq, y_test),
    callbacks = [training_logger],
    verbose = 1,
)
# Persist the trained model.
model.save('saved_models/W2V/EQCM.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
# Read the per-epoch training log back in and show (up to) the first 50 epochs.
log_data = pd.read_csv('log/W2V/training.log', sep = ',', engine = 'python')
log_data.head(50)

Unnamed: 0,epoch,accuracy,f1_score,loss,val_accuracy,val_f1_score,val_loss
0,0,0.561922,0.525717,1.255105,0.774704,0.770956,0.800958
1,1,0.854121,0.853032,0.5430162,0.790514,0.790615,0.593165
2,2,0.929484,0.929341,0.2573461,0.822134,0.819784,0.524282
3,3,0.976642,0.976646,0.1156323,0.802372,0.801287,0.525948
4,4,0.9881,0.988109,0.05082107,0.814229,0.813131,0.57295
5,5,0.99383,0.993833,0.02083978,0.814229,0.811919,0.603155
6,6,0.998237,0.998237,0.009461265,0.814229,0.811569,0.689331
7,7,0.999559,0.999559,0.003848141,0.826087,0.824204,0.692033
8,8,0.999559,0.999559,0.002033538,0.814229,0.812356,0.803235
9,9,1.0,1.0,0.0005662903,0.814229,0.811512,0.897034


In [18]:
# Summary statistics of the validation accuracy across all logged epochs.
log_data['val_accuracy'].describe()

count    50.000000
mean      0.803399
std       0.008813
min       0.774704
25%       0.798419
50%       0.802372
75%       0.806324
max       0.826087
Name: val_accuracy, dtype: float64

In [19]:
# Summary statistics of the validation F1 score across all logged epochs.
log_data['val_f1_score'].describe()

count    50.000000
mean      0.801115
std       0.008961
min       0.770956
25%       0.796335
50%       0.799942
75%       0.803495
max       0.825300
Name: val_f1_score, dtype: float64

In [22]:
# Restore the weights from the saved model file and report the test-set metrics.
model.load_weights("saved_models/W2V/EQCM.h5")
evaluate_model(model, padded_X_test_seq, y_test)

Accuracy: 0.8023715415019763
F1 score: 0.7994878277014907
              precision    recall  f1-score   support

           0       0.71      0.83      0.76        35
           1       0.83      0.88      0.85        96
           2       0.82      0.56      0.67        32
           3       0.84      0.90      0.87        30
           4       0.71      0.71      0.71        28
           5       0.86      0.78      0.82        32

    accuracy                           0.80       253
   macro avg       0.80      0.78      0.78       253
weighted avg       0.81      0.80      0.80       253



In [None]:
# Row label of the highest validation accuracy, shifted by one to give a
# 1-based epoch number (the log's row index starts at 0).
log_data[['val_accuracy']].idxmax() + 1

In [None]:
# Plot the loss, accuracy and F1 curves recorded in the training log.
plot_result(log_data)