In [16]:
import time
import xgboost
from keras.layers import *
from keras import layers, models, optimizers
from keras.preprocessing import text, sequence
from sklearn import svm, ensemble, preprocessing, naive_bayes, linear_model, metrics
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from google.colab import drive

In [17]:
# Load data
drive.mount('/content/drive', force_remount=True)
DATA_PATH = "/content/drive/My Drive/NLP"

x_data = pickle.load(open(DATA_PATH + '/Processed Data/x_data.pkl', 'rb'))
y_data = pickle.load(open(DATA_PATH + '/Processed Data/y_data.pkl', 'rb'))

x_test = pickle.load(open(DATA_PATH + '/Processed Data/x_test.pkl', 'rb'))
y_test = pickle.load(open(DATA_PATH + '/Processed Data/y_test.pkl', 'rb'))

Mounted at /content/drive


In [18]:
# Transform data
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(x_data)

x_data_count = count_vect.transform(x_data)
x_test_count = count_vect.transform(x_test)

In [19]:
# SVD

svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(x_data_count)

x_data_count_svd = svd.transform(x_data_count)
x_test_count_svd = svd.transform(x_test_count)

In [20]:
# Train model
def train_model(classifier, x_data, y_data, x_test, y_test, is_neuralnet=False, n_epochs=100):
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

    if is_neuralnet:
        classifier.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=n_epochs, batch_size=512)

        val_predictions = classifier.predict(x_val)
        test_predictions = classifier.predict(x_test)
        val_predictions = val_predictions.argmax(axis=-1)
        test_predictions = test_predictions.argmax(axis=-1)
    else:
        classifier.fit(x_train, y_train)

        train_predictions = classifier.predict(x_train)
        val_predictions = classifier.predict(x_val)
        test_predictions = classifier.predict(x_test)

    # Evaluation

    print("> Validation Accuracy: ", metrics.accuracy_score(val_predictions, y_val))

    print("> Test Accuracy: ", metrics.accuracy_score(test_predictions, y_test))

    print("> Validation Precision: ", metrics.precision_score(y_val, val_predictions, average='macro'))

    print("> Test Precision: ", metrics.precision_score(y_test, test_predictions, average='macro'))

    print("> Validation Recall: ", metrics.recall_score(y_val, val_predictions, average='macro'))

    print("> Test Recall: ", metrics.recall_score(y_test, test_predictions, average='macro'))

    print("> Validation F1 Score: ", metrics.f1_score(y_val, val_predictions, average='macro'))

    print("> Test F1 Score: ", metrics.f1_score(y_test, test_predictions, average='macro'))

In [None]:
# Naive Bayes
# Naive Bayes (Multinomial)
print("\n\n----- Naive Bayes (Multinomial) -----")
nbm_start = time.time()
train_model(naive_bayes.MultinomialNB(), x_data_count, y_data, x_test_count, y_test, is_neuralnet=False)
nbm_stop = time.time()
print(f"> Training time: {nbm_stop - nbm_start}s")



----- Naive Bayes (Multinomial) -----
> Validation Accuracy:  0.8435826012427684
> Test Accuracy:  0.8501744988231474
> Validation Precision:  0.8426191678362662
> Test Precision:  0.7769845062882264
> Validation Recall:  0.8407220781730392
> Test Recall:  0.8398420457325193
> Validation F1 Score:  0.839350686007866
> Test F1 Score:  0.7927757087245066
> Training time: 1.4147300720214844s


In [None]:
# Naive Bayes (Bernoulli)
print("\n\n----- Naive Bayes (Bernoulli) -----")
nbb_start = time.time()
train_model(naive_bayes.BernoulliNB(), x_data_count, y_data, x_test_count, y_test, is_neuralnet=False)
nbb_stop = time.time()
print(f"> Training time: {nbb_stop - nbb_start}s")



----- Naive Bayes (Bernoulli) -----
> Validation Accuracy:  0.7799442896935933
> Test Accuracy:  0.7942740037334632
> Validation Precision:  0.8027950402461533
> Test Precision:  0.7366936320933916
> Validation Recall:  0.7604933977169976
> Test Recall:  0.7746890576648047
> Validation F1 Score:  0.7719398819827019
> Test F1 Score:  0.7308976421241956
> Training time: 1.8563518524169922s


In [None]:
# Logistic Regression
print("\n\n----- Logistic Regression -----")
lr_start = time.time()
train_model(linear_model.LogisticRegression(), x_data_count, y_data, x_test_count, y_test, is_neuralnet=False)
lr_stop = time.time()
print(f"> Training time: {lr_stop - lr_start}s")



----- Logistic Regression -----


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


> Validation Accuracy:  0.8962931219198629
> Test Accuracy:  0.8902483564645727
> Validation Precision:  0.8942833389025264
> Test Precision:  0.8298347189153513
> Validation Recall:  0.8934242222000024
> Test Recall:  0.8740963583786707
> Validation F1 Score:  0.8935561450636768
> Test F1 Score:  0.846348576105192
> Training time: 78.07306122779846s


In [None]:
# Support Vector Machine
print("\n\n----- Support Vector Machine -----")
svm_start = time.time()
train_model(svm.SVC(), x_data_count_svd, y_data, x_test_count_svd, y_test, is_neuralnet=False)
svm_stop = time.time()
print(f"> Training time: {svm_stop - svm_start}s")



----- Support Vector Machine -----
> Validation Accuracy:  0.8510820655667453
> Test Accuracy:  0.8483686389091795
> Validation Precision:  0.8525507101209738
> Test Precision:  0.7694397794446248
> Validation Recall:  0.8430226360320245
> Test Recall:  0.8217213039662644
> Validation F1 Score:  0.8458508494616978
> Test F1 Score:  0.7852749209549328
> Training time: 1603.2711493968964s


In [None]:
# Random Forest Classifier
print("\n\n----- Random Forest Classifier -----")
rfc_start = time.time()
train_model(ensemble.RandomForestClassifier(), x_data_count_svd, y_data, x_test_count_svd, y_test, is_neuralnet=False)
rfc_stop = time.time()
print(f"> Training time: {rfc_stop - rfc_start}s")



----- Random Forest Classifier -----
> Validation Accuracy:  0.7525176773087636
> Test Accuracy:  0.7303181559938317
> Validation Precision:  0.7701735233316843
> Test Precision:  0.6587559818249654
> Validation Recall:  0.7300317977760865
> Test Recall:  0.7038574639563413
> Validation F1 Score:  0.7409222768850399
> Test F1 Score:  0.6510123493060065
> Training time: 107.136647939682s


In [None]:
# XGBoost
print("\n\n----- XGBoost -----")
xgb_start = time.time()
train_model(xgboost.XGBClassifier(), x_data_count_svd, y_data, x_test_count_svd, y_test, is_neuralnet=False)
xgb_stop = time.time()
print(f"> Training time: {xgb_stop - xgb_start}s")



----- XGBoost -----
> Validation Accuracy:  0.748660809942147
> Test Accuracy:  0.7294456618780943
> Validation Precision:  0.7490737887171025
> Test Precision:  0.6447386192654208
> Validation Recall:  0.7341696954119464
> Test Recall:  0.7131607715490713
> Validation F1 Score:  0.7392445146459614
> Test F1 Score:  0.6529114049901502
> Training time: 1095.384387254715s


In [None]:
# Deep Neural Network
dnn_start = time.time()
encoder = preprocessing.LabelEncoder()
y_data_n = encoder.fit_transform(y_data)
y_test_n = encoder.fit_transform(y_test)

def create_dnn_model():
    input_layer = Input(shape=(300,))
    layer = Dense(1024, activation='relu')(input_layer)
    layer = Dense(1024, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

classifier = create_dnn_model()
print("\n\n----- Deep Neural Network -----")
train_model(classifier=classifier, x_data=x_data_count_svd, y_data=y_data_n, x_test=x_test_count_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
dnn_stop = time.time()
print(f"> Training time: {dnn_stop - dnn_start}s")



----- Deep Neural Network -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epo

In [22]:
# Recurrent Neural Network - LSTM
def create_lstm_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = LSTM(128, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)

    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

lstm_start = time.time()
classifier = create_lstm_model()
print("\n\n----- Recurrent Neural Network - LSTM -----")
train_model(classifier=classifier, x_data=x_data_count_svd, y_data=y_data_n, x_test=x_test_count_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
lstm_stop = time.time()
print(f"> Training time: {lstm_stop - lstm_start}s")



----- Recurrent Neural Network - LSTM -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoc

In [23]:
# Recurrent Neural Network - GRU
def create_gru_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = GRU(128, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)

    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier


gru_start = time.time()
classifier = create_gru_model()
print("\n\n----- Recurrent Neural Network - GRU -----")
train_model(classifier=classifier, x_data=x_data_count_svd, y_data=y_data_n, x_test=x_test_count_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
gru_stop = time.time()
print(f"> Training time: {gru_stop - gru_start}s")



----- Recurrent Neural Network - GRU -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch

In [24]:
# Bidirectional RNN
def create_brnn_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = Bidirectional(GRU(128, activation='relu'))(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)

    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

brnn_start = time.time()
classifier = create_brnn_model()
print("\n\n----- Bidirectional RNN -----")
train_model(classifier=classifier, x_data=x_data_count_svd, y_data=y_data_n, x_test=x_test_count_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
brnn_stop = time.time()
print(f"> Training time: {brnn_stop - brnn_start}s")



----- Bidirectional RNN -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch

In [25]:
# Recurrent Convolutional Neural Network
def create_rcnn_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = Bidirectional(GRU(128, activation='relu', return_sequences=True))(layer)
    layer = Convolution1D(100, 3, activation="relu")(layer)
    layer = Flatten()(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    classifier.summary()
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

rcnn_start = time.time()
classifier = create_rcnn_model()
print("\n\n----- Recurrent Convolutional Neural Network -----")
train_model(classifier=classifier, x_data=x_data_count_svd, y_data=y_data_n, x_test=x_test_count_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
rcnn_stop = time.time()
print(f"> Training time: {rcnn_stop - rcnn_start}s")

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 300)               0         
_________________________________________________________________
reshape_4 (Reshape)          (None, 10, 30)            0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 256)           122112    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 8, 100)            76900     
_________________________________________________________________
flatten_1 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 512)               410112    
_________________________________________________________________
dense_18 (Dense)             (None, 512)               2626