In [1]:
!pip install gensim==3.8.3
!pip install keras --upgrade
!pip install pandas --upgrade
!pip install tensorflow --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 1.9 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 9.6 MB/s 
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.9.0
    Uninstalling keras-2.9.0:
      Successfully uninstalled keras-2.9.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that

In [2]:
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

In [3]:
# Number of rows of the embedding table: passed to the Embedding layer as its
# first argument (input dimension).
vocab_size = 290_419

# WORD2VEC
# Embedding width: passed to the Embedding layer as its second argument.
W2V_SIZE = 300
# The three settings below are not read by any visible cell; presumably they are
# the Word2Vec training settings behind embedding_matrix.npy - TODO confirm.
W2V_WINDOW = 7
W2V_MIN_COUNT = 10
W2V_EPOCH = 32

# Passed to the Embedding layer as `input_length`.
SEQUENCE_LENGTH = 300

In [4]:
# Mount Google Drive at /content/drive. The load and save paths used by the
# later cells (/content/drive/MyDrive/nns/...) live under this mount point, so
# this cell must run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#LOADING
# Root folder on the mounted Drive and the sub-folder with the saved split.
load_dir = '/content/drive/MyDrive/nns/'
train_test_dir = os.path.join(load_dir, 'saved_train_test/')

# Pre-computed embedding matrix; it is handed to the Embedding layer as its
# initial weights in the model cell.
embedding_matrix = np.load(os.path.join(load_dir, 'embedding_matrix.npy'))

# Read the four saved arrays in the order train-x, train-y, test-x, test-y.
x_train, y_train, x_test, y_test = (
    np.load(os.path.join(train_test_dir, array_name + '.npy'))
    for array_name in ('x_train', 'y_train', 'x_test', 'y_test')
)

### NN Model

In [13]:
#HYPER PARAMETERS
# Label for this experiment; the saving cell uses the same string as file stem.
model_name = "LSTM"

# Architecture
lstm_units = 128          # number of units of the LSTM layer
activation = "sigmoid"    # passed to the LSTM layer as its `activation`
sequence_length = 300

# Training loop: epochs for fit(); batch size for both fit() and evaluate().
num_epochs, batch_size = 10, 1024

# Optimizer settings
optimizer = 'adam'
learning_rate = 1e-5
momentum = 0.9

# NOTE(review): in the visible cells `optimizer`, `learning_rate`, `momentum`
# and `sequence_length` are never read - compile() is given the literal 'adam'
# and the Embedding layer uses SEQUENCE_LENGTH. Confirm whether that is intended.

In [14]:
#MODEL
# Embedding lookup initialised from the loaded matrix and frozen
# (trainable=False), so its weights are excluded from training.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

# Stack: frozen embeddings -> dropout -> single LSTM -> one sigmoid output unit.
model = Sequential([
    embedding_layer,
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(
        lstm_units,
        activation=activation,
        dropout=0.2,
        recurrent_dropout=0.2,
        recurrent_initializer='glorot_uniform',
    ),
    Dense(1, activation='sigmoid'),
])

# Print the layer table and parameter counts.
model.summary()



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 300, 300)          87125700  
                                                                 
 dropout_2 (Dropout)         (None, 300, 300)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               219648    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 87,345,477
Trainable params: 219,777
Non-trainable params: 87,125,700
_________________________________________________________________


In [15]:
#OPTIMIZATION
# Two training callbacks, both with a patience of 5 epochs:
#   - ReduceLROnPlateau watches the validation loss,
#   - EarlyStopping watches the validation accuracy (minimum change 1e-4).
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        cooldown=0,
        patience=5,
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        min_delta=1e-4,
    ),
]

# NOTE(review): the optimizer is selected by the string 'adam', so Keras builds
# it with its own default settings; the `learning_rate` hyper-parameter is not
# applied here. An explicit tf.keras.optimizers.Adam(learning_rate=learning_rate)
# would be needed to honour it - confirm which behaviour is wanted.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [16]:
#TRAINING
%%time
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 42min 22s, sys: 5min 34s, total: 47min 56s
Wall time: 30min 9s


In [17]:
#TESTING
# evaluate() returns [loss, accuracy] here: the model was compiled with one loss
# and the single metric 'accuracy'.
score = model.evaluate(x_test, y_test, batch_size=batch_size)

print("ACCURACY:", score[1])
print("LOSS:", score[0])

# Per-epoch curves recorded by fit(), for the training and validation portions.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

for label, series in (("acc:", acc),
                      ("val_acc:", val_acc),
                      ("loss:", loss),
                      ("val_loss:", val_loss)):
    print(label, series)


ACCURACY: 0.7214249968528748
LOSS: 0.5456164479255676
acc: [0.5798958539962769, 0.6274444460868835, 0.6585555672645569, 0.6753125190734863, 0.687340259552002, 0.6954097151756287, 0.7028124928474426, 0.7062777876853943, 0.710812509059906, 0.7120416760444641]
val_acc: [0.6446874737739563, 0.6654999852180481, 0.6838750243186951, 0.703249990940094, 0.7049375176429749, 0.7086250185966492, 0.706250011920929, 0.7124375104904175, 0.719124972820282, 0.7201250195503235]
loss: [0.6712179183959961, 0.641024112701416, 0.614949643611908, 0.5955849885940552, 0.5826506018638611, 0.5734885931015015, 0.5659439563751221, 0.5615335702896118, 0.5557126998901367, 0.5527966022491455]
val_loss: [0.6299454569816589, 0.6039350032806396, 0.5820632576942444, 0.563909649848938, 0.5593340992927551, 0.5543398857116699, 0.5563884377479553, 0.5512877106666565, 0.5447803139686584, 0.5435177087783813]


In [18]:
#SAVING
# Write the trained model as a single HDF5 file named <model_name>.h5 inside the
# Drive folder that holds the saved networks.
model_name = "LSTM"
save_dir = '/content/drive/MyDrive/nns/saved_nn_models/'
model.save(f"{save_dir}{model_name}.h5")


In [19]:
# List the save folder to confirm that the model file was written.
!ls /content/drive/MyDrive/nns/saved_nn_models/

FFNN1.h5  FFNN3.h5  FFNN5.h5  LSTM.h5  RNN.h5
