In [2]:
#Importing required libraries
import pandas as pd
import numpy as np
import nltk
import re
import string

#CNN Imports
import os
import sys
import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Input, Embedding, Reshape, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import MaxPool1D
from keras.models import Model
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from keras import regularizers
from keras import optimizers
from keras.models import Sequential
import pickle

In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [4]:
import sys
import os
prefix = "/content/gdrive/My Drive/NLP Assignments/"
sys.path.append(prefix)

In [5]:
#Reading Train, validation & Test dataset
train_filename = prefix+"Train.csv"
train_reviews =pd.read_csv(train_filename)

valid_filename = prefix+"Valid.csv"
valid_reviews =pd.read_csv(valid_filename)

test_filename = prefix+"Test.csv"
test_reviews =pd.read_csv(test_filename)

In [7]:
# Data preprocessing
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

#X_train, X_test, y_train, y_test = train_test_split(reviews['clean_text2'],
                                                    #reviews['label'], test_size=0.2, random_state = 1000)
train_reviews['clean_text2'] = train_reviews['text'].apply(lambda x: clean_text(x))
valid_reviews['clean_text2'] = valid_reviews['text'].apply(lambda x: clean_text(x))
test_reviews['clean_text2'] = test_reviews['text'].apply(lambda x: clean_text(x))

X_train, y_train= train_reviews['clean_text2'], train_reviews['label']
X_valid, y_valid= valid_reviews['clean_text2'], valid_reviews['label']
X_test, y_test = test_reviews['clean_text2'], test_reviews['label']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
#Feature Engineering to decide max words & padding sequence length
import re
nltk.download('punkt')
df = pd.concat([train_reviews, valid_reviews])
def count_words(text):
    return len(text.split())

def count_sent(text):
    return len(nltk.sent_tokenize(text)) 

df['word_count'] = df["text"].apply(lambda x:count_words(x))
df['sent_count'] = df["text"].apply(lambda x:count_sent(x))
df['avg_sentlength'] = df['word_count']/df['sent_count']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
#Setting hyperparameters
MAX_WORDS = 2500
MAX_SEQUENCE_LENGTH = 734

EMBEDDING_DIM = 100
embedding_dim = 100
random_state = 1000
drop = 0.5
batch_size = 32
epochs = 5

In [11]:
#Tokenizing
tokenizer  = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(X_train)

# saving tokenizer in pickle file which can be used later during testing
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
X_train_seq =  tokenizer.texts_to_sequences(X_train)
X_valid_seq =  tokenizer.texts_to_sequences(X_valid)
X_test_seq = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

X_train_data = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_valid_data = pad_sequences(X_valid_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_data = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

labels_train = to_categorical(np.asarray(y_train))
labels_valid = to_categorical(np.asarray(y_valid))
labels_test = to_categorical(np.asarray(y_test))

# Shapes of traing, valid and test tensors
print('Shape of training data tensor:', X_train_data.shape)
print('Shape of validating data tensor:', X_valid_data.shape)
print('Shape of testing data tensor:', X_test_data.shape)

print('Shape of training label tensor:', labels_train.shape)
print('Shape of validating label tensor:', labels_valid.shape)
print('Shape of testing label tensor:', labels_test.shape)

unique words : 124271
Shape of training data tensor: (40000, 734)
Shape of validating data tensor: (5000, 734)
Shape of testing data tensor: (5000, 734)
Shape of training label tensor: (40000, 2)
Shape of validating label tensor: (5000, 2)
Shape of testing label tensor: (5000, 2)


In [13]:
#Using Pre-Trained GLOVE Embeddings https://nlp.stanford.edu/projects/glove/

embeddings_index = {}
Glove_filename = prefix+"glove.6B.100d.txt"

f = open(os.path.join(Glove_filename), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [14]:
#Creating embedding matrix & embedding layer
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector # words not found in embedding index will be all-zeros.


embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [15]:
#CNN layers

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

l_cov1= Conv1D(128, 1, activation='relu' ,padding='valid', kernel_initializer='normal')(embedded_sequences) 
l_cov2 = Conv1D(128, 2, activation='relu' ,padding='valid', kernel_initializer='normal')(l_cov1) # (l_pool1)
g_pool = GlobalMaxPooling1D()(l_cov2)
dropout = Dropout(0.5)(g_pool)
preds = Dense(1, activation='sigmoid')(dropout)

# this creates a model
model = Model(inputs=sequence_input, outputs=preds)
checkpoint = ModelCheckpoint('weights_cnn_sentence.hdf5', monitor='val_accuracy', verbose=1, save_best_only= True, mode='auto')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 734)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 734, 100)          12427200  
_________________________________________________________________
conv1d (Conv1D)              (None, 734, 128)          12928     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 733, 128)          32896     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129   

In [16]:
#Traing CNN model
print("Training Model...")
model.fit(X_train_data, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_valid_data, y_valid))

Training Model...
Epoch 1/5

Epoch 00001: val_accuracy improved from -inf to 0.85980, saving model to weights_cnn_sentence.hdf5
Epoch 2/5

Epoch 00002: val_accuracy improved from 0.85980 to 0.87300, saving model to weights_cnn_sentence.hdf5
Epoch 3/5

Epoch 00003: val_accuracy did not improve from 0.87300
Epoch 4/5

Epoch 00004: val_accuracy improved from 0.87300 to 0.88120, saving model to weights_cnn_sentence.hdf5
Epoch 5/5

Epoch 00005: val_accuracy did not improve from 0.88120


<keras.callbacks.History at 0x7fc4afc47890>

In [21]:
#Saving model to json
!pip install simplejson
import simplejson
import json
model_json = model.to_json()
json_filename = prefix+"bestmodel.json"
with open(json_filename, "w") as json_file:
    json_file.write(simplejson.dumps(simplejson.loads(model_json), indent=4))
json_file.close()



In [23]:
#Saving weights in h5 file
model.save_weights("bestmodel.h5")
print("Saved model to disk")

Saved model to disk


In [25]:
json_path = prefix + "bestmodel.json"
json_file = open(json_path, "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

In [27]:
#Loading saved model for testing
h5_filepath = prefix + "bestmodel.h5"
loaded_model.load_weights(h5_filepath)
print("Loaded model from disk")

Loaded model from disk


In [28]:
#Calculated accuracy on loaded model
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(X_test_data, y_test, verbose=0)
print ("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

accuracy: 87.68%


In [29]:
#Calculating evalation parameters
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
history = model.fit(X_train_data, y_train, validation_split=0.3, epochs=1, verbose=0)

# evaluate the model
loss_train, accuracy_train, f1_score_train, precision_train, recall_train = model.evaluate(X_train_data, y_train, verbose=0)
loss_valid, accuracy_valid, f1_score_valid, precision_valid, recall_valid = model.evaluate(X_valid_data, y_valid, verbose=0)
loss_test, accuracy_test, f1_score_test, precision_test, recall_test = model.evaluate(X_test_data, y_test, verbose=0)

In [30]:
print('F1-score on training data',f1_score_train)
print('F1-score on validation data',f1_score_valid)
print('F1-score on test data',f1_score_test)

F1-score on training data 0.9272326231002808
F1-score on validation data 0.877922773361206
F1-score on test data 0.8698582053184509
