In [1]:
import time
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
import xgboost
from keras.layers import *
from keras import layers, models, optimizers
from keras.preprocessing import text, sequence
from sklearn import preprocessing
from sklearn import naive_bayes
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive
import pickle

Using TensorFlow backend.


In [2]:
# Load data
drive.mount('/content/drive', force_remount=True)
DATA_PATH = "/content/drive/My Drive/NLP"

x_data = pickle.load(open(DATA_PATH + '/Processed Data/x_data.pkl', 'rb'))
y_data = pickle.load(open(DATA_PATH + '/Processed Data/y_data.pkl', 'rb'))

x_test = pickle.load(open(DATA_PATH + '/Processed Data/x_test.pkl', 'rb'))
y_test = pickle.load(open(DATA_PATH + '/Processed Data/y_test.pkl', 'rb'))

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Transform data
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(x_data)

x_data_tfidf = tfidf_vect.transform(x_data)
x_test_tfidf = tfidf_vect.transform(x_test)

In [4]:
# SVD
svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(x_data_tfidf)

x_data_tfidf_svd = svd.transform(x_data_tfidf)
x_test_tfidf_svd = svd.transform(x_test_tfidf)

In [5]:
# Train Model
def train_model(classifier, x_data, y_data, x_test, y_test, is_neuralnet=False, n_epochs=100):
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

    if is_neuralnet:
        classifier.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=n_epochs, batch_size=512)

        val_predictions = classifier.predict(x_val)
        test_predictions = classifier.predict(x_test)
        val_predictions = val_predictions.argmax(axis=-1)
        test_predictions = test_predictions.argmax(axis=-1)
    else:
        classifier.fit(x_train, y_train)

        train_predictions = classifier.predict(x_train)
        val_predictions = classifier.predict(x_val)
        test_predictions = classifier.predict(x_test)

    # Evaluation
    print("> Validation Accuracy: ", metrics.accuracy_score(val_predictions, y_val))

    print("> Test Accuracy: ", metrics.accuracy_score(test_predictions, y_test))

    print("> Validation Precision: ", metrics.precision_score(y_val, val_predictions, average='macro'))

    print("> Test Precision: ", metrics.precision_score(y_test, test_predictions, average='macro'))

    print("> Validation Recall: ", metrics.recall_score(y_val, val_predictions, average='macro'))

    print("> Test Recall: ", metrics.recall_score(y_test, test_predictions, average='macro'))

    print("> Validation F1 Score: ", metrics.f1_score(y_val, val_predictions, average='macro'))

    print("> Test F1 Score: ", metrics.f1_score(y_test, test_predictions, average='macro'))

In [None]:
# Naive Bayes
# Naive Bayes (Multinomial)
print("\n\n----- Naive Bayes (Multinomial) -----")
nbm_start = time.time()
train_model(naive_bayes.MultinomialNB(), x_data_tfidf, y_data, x_test_tfidf, y_test, is_neuralnet=False)
nbm_stop = time.time()
print(f"> Training time: {nbm_stop - nbm_start}s")



----- Naive Bayes (Multinomial) -----
> Validation Accuracy:  0.824298264409685
> Test Accuracy:  0.8219097475854232
> Validation Precision:  0.853469381870249
> Test Precision:  0.7815035680496839
> Validation Recall:  0.801754190499151
> Test Recall:  0.7875406940273717
> Validation F1 Score:  0.8135127960186558
> Test F1 Score:  0.7510522562228472
> Training time: 1.048003911972046s


In [None]:
# Naive Bayes (Bernoulli)
print("\n\n----- Naive Bayes (Bernoulli) -----")
nbb_start = time.time()
train_model(naive_bayes.BernoulliNB(), x_data_tfidf, y_data, x_test_tfidf, y_test, is_neuralnet=False)
nbb_stop = time.time()
print(f"> Training time: {nbb_stop - nbb_start}s")



----- Naive Bayes (Bernoulli) -----
> Validation Accuracy:  0.7833726162416971
> Test Accuracy:  0.806651245840435
> Validation Precision:  0.7845546622899705
> Test Precision:  0.7330729601163722
> Validation Recall:  0.7777055342328759
> Test Recall:  0.8005445269565773
> Validation F1 Score:  0.7767318983113088
> Test F1 Score:  0.7486904272729145
> Training time: 1.293684720993042s


In [None]:
# Logistic Regression
print("\n\n----- Logistic Regression -----")
lr_start = time.time()
train_model(linear_model.LogisticRegression(), x_data_tfidf, y_data, x_test_tfidf, y_test, is_neuralnet=False)
lr_stop = time.time()
print(f"> Training time: {lr_stop - lr_start}s")



----- Logistic Regression -----


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


> Validation Accuracy:  0.901649882151275
> Test Accuracy:  0.9015907799691584
> Validation Precision:  0.8989266092922605
> Test Precision:  0.8395186344405364
> Validation Recall:  0.8977008750813453
> Test Recall:  0.883421770643269
> Validation F1 Score:  0.8979496016996339
> Test F1 Score:  0.854683221309598
> Training time: 42.21703290939331s


In [6]:
# Logistic Regression
# print("\n\n----- Logistic Regression -----")
# lr_start = time.time()
model = linear_model.LogisticRegression()
train_model(model, x_data_tfidf, y_data, x_test_tfidf, y_test, is_neuralnet=False)

# pickle.dump(model, open(DATA_PATH + '/Processed Data/model.pkl', 'wb'))
# lr_stop = time.time()
# print(f"> Training time: {lr_stop - lr_start}s")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


> Validation Accuracy:  0.901649882151275
> Test Accuracy:  0.9015907799691584
> Validation Precision:  0.8989266092922605
> Test Precision:  0.8395186344405364
> Validation Recall:  0.8977008750813453
> Test Recall:  0.883421770643269
> Validation F1 Score:  0.8979496016996339
> Test F1 Score:  0.854683221309598


In [7]:
pickle.dump(tfidf_vect, open(DATA_PATH + '/Processed Data/tfidf_vect.pkl', 'wb'))
pickle.dump(svd, open(DATA_PATH + '/Processed Data/svd.pkl', 'wb'))
pickle.dump(model, open(DATA_PATH + '/Processed Data/model.pkl', 'wb'))

In [None]:
# Support Vector Machine
print("\n\n----- Support Vector Machine -----")
svm_start = time.time()
train_model(svm.SVC(), x_data_tfidf_svd, y_data,
            x_test_tfidf_svd, y_test, is_neuralnet=False)
svm_stop = time.time()
print(f"> Training time: {svm_stop - svm_start}s")



----- Support Vector Machine -----
> Validation Accuracy:  0.8965073923291194
> Test Accuracy:  0.8944079214349485
> Validation Precision:  0.8924821639555032
> Test Precision:  0.8281207940856814
> Validation Recall:  0.8915823075447205
> Test Recall:  0.8736836876412273
> Validation F1 Score:  0.8915710621942996
> Test F1 Score:  0.8435685083292975
> Training time: 954.0110738277435s


In [None]:
# Random Forest Classifier
print("\n\n----- Random Forest Classifier -----")
rfc_start = time.time()
train_model(ensemble.RandomForestClassifier(), x_data_tfidf_svd, y_data, x_test_tfidf_svd, y_test, is_neuralnet=False)
rfc_stop = time.time()
print(f"> Training time: {rfc_stop - rfc_start}s")



----- Random Forest Classifier -----
> Validation Accuracy:  0.8377973001928434
> Test Accuracy:  0.8412669426182939
> Validation Precision:  0.8380564573410122
> Test Precision:  0.758669509179911
> Validation Recall:  0.8282432188461175
> Test Recall:  0.8141361093448231
> Validation F1 Score:  0.8318738238862
> Test F1 Score:  0.7702289521203114
> Training time: 102.3763542175293s


In [None]:
# XGBoost
print("\n\n----- XGBoost -----")
xgb_start = time.time()
train_model(xgboost.XGBClassifier(), x_data_tfidf_svd, y_data, x_test_tfidf_svd, y_test, is_neuralnet=False)
xgb_stop = time.time()
print(f"> Training time: {xgb_stop - xgb_start}s")



----- XGBoost -----
> Validation Accuracy:  0.8253696164559674
> Test Accuracy:  0.8227010794578362
> Validation Precision:  0.8201066941114844
> Test Precision:  0.7343159356499722
> Validation Recall:  0.8180185938895043
> Test Recall:  0.8022543408940035
> Validation F1 Score:  0.8180574047434429
> Test F1 Score:  0.7506315766396823
> Training time: 897.8366808891296s


In [None]:
# Deep Neural Network
dnn_start = time.time()
encoder = preprocessing.LabelEncoder()
y_data_n = encoder.fit_transform(y_data)
y_test_n = encoder.fit_transform(y_test)

def create_dnn_model():
    input_layer = Input(shape=(300,))
    layer = Dense(1024, activation='relu')(input_layer)
    layer = Dense(1024, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

classifier = create_dnn_model()
print("\n\n----- Deep Neural Network -----")
train_model(classifier=classifier, x_data=x_data_tfidf_svd, y_data=y_data_n, x_test=x_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True)
dnn_stop = time.time()
print(f"> Training time: {dnn_stop - dnn_start}s")



----- Deep Neural Network -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epo

In [None]:
# Recurrent Neural Network - LSTM
def create_lstm_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = LSTM(128, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)

    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

lstm_start = time.time()
classifier = create_lstm_model()
print("\n\n----- Recurrent Neural Network - LSTM -----")
train_model(classifier=classifier, x_data=x_data_tfidf_svd, y_data=y_data_n, x_test=x_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True)
lstm_stop = time.time()
print(f"> Training time: {lstm_stop - lstm_start}s")



----- Recurrent Neural Network - LSTM -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoc

In [None]:
# Recurrent Neural Network - GRU
def create_gru_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = GRU(128, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)

    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

gru_start = time.time()
classifier = create_gru_model()
print("\n\n----- Recurrent Neural Network - GRU -----")
train_model(classifier=classifier, x_data=x_data_tfidf_svd, y_data=y_data_n, x_test=x_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
gru_stop = time.time()
print(f"> Training time: {gru_stop - gru_start}s")



----- Recurrent Neural Network - GRU -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch

In [None]:
# Bidirectional RNN
def create_brnn_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = Bidirectional(GRU(128, activation='relu'))(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)

    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

brnn_start = time.time()
classifier = create_brnn_model()
print("\n\n----- Bidirectional RNN -----")
train_model(classifier=classifier, x_data=x_data_tfidf_svd, y_data=y_data_n, x_test=x_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
brnn_stop = time.time()
print(f"> Training time: {brnn_stop - brnn_start}s")



----- Bidirectional RNN -----
Train on 42001 samples, validate on 4667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch

In [None]:
# Recurrent Convolutional Neural Network
def create_rcnn_model():
    input_layer = Input(shape=(300,))

    layer = Reshape((10, 30))(input_layer)
    layer = Bidirectional(
        GRU(128, activation='relu', return_sequences=True))(layer)
    layer = Convolution1D(100, 3, activation="relu")(layer)
    layer = Flatten()(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(512, activation='relu')(layer)
    layer = Dense(128, activation='relu')(layer)

    output_layer = Dense(13, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    classifier.summary()
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier

rcnn_start = time.time()
classifier = create_rcnn_model()
print("\n\n----- Recurrent Convolutional Neural Network -----")
train_model(classifier=classifier, x_data=x_data_tfidf_svd, y_data=y_data_n, x_test=x_test_tfidf_svd, y_test=y_test_n, is_neuralnet=True, n_epochs=100)
rcnn_stop = time.time()
print(f"> Training time: {rcnn_stop - rcnn_start}s")

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 300)               0         
_________________________________________________________________
reshape_4 (Reshape)          (None, 10, 30)            0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 256)           122112    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 8, 100)            76900     
_________________________________________________________________
flatten_1 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 512)               410112    
_________________________________________________________________
dense_14 (Dense)             (None, 512)               2626