In [None]:
#  Run this to ensure TensorFlow 2.x is used
# Best-effort: any exception raised while running the magic is deliberately
# ignored (presumably so the cell is harmless outside Colab -- TODO confirm).
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense,LSTM,Bidirectional,Conv1D,MaxPooling1D,GlobalMaxPooling1D
from keras.models import Sequential
from keras import regularizers
import re
import gensim
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from keras.callbacks import ModelCheckpoint

In [None]:
# Read the tab-separated train and test files from the working directory.
training_data, test_data = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in ('train.tsv', 'test.tsv')
)


In [None]:
# Keep only the columns used below: the phrase text and, for the training
# set, its sentiment label.
train_columns = ['Phrase', 'Sentiment']
test_columns = ['Phrase']
training_data = training_data[train_columns]
test_data = test_data[test_columns]

In [None]:
def depure_data(data):
    """Clean one phrase: drop URLs and e-mail addresses, collapse whitespace,
    and remove single quotes.

    Parameters
    ----------
    data : str
        Raw text of a single phrase.

    Returns
    -------
    str
        The cleaned text. Steps are applied in the order URL -> e-mail ->
        whitespace -> quotes, so gaps left by removed URLs/e-mails are
        collapsed to a single space.
    """
    # All patterns are raw strings so that regex escapes such as \S and \s are
    # not interpreted (and warned about) as Python string escapes.

    # Remove URLs (http://..., https://..., www....)
    data = re.sub(r'https?://\S+|www\.\S+', '', data)

    # Remove e-mail-like tokens (anything containing '@') plus one trailing
    # whitespace character
    data = re.sub(r'\S*@\S*\s?', '', data)

    # Collapse every whitespace run (including new lines) into a single space
    data = re.sub(r'\s+', ' ', data)

    # Remove distracting single quotes
    data = data.replace("'", "")

    return data

In [None]:
# Pull the phrases out of the DataFrame as a plain list and clean each one.
data_to_list = training_data['Phrase'].values.tolist()
temp = [depure_data(phrase) for phrase in data_to_list]
# Show the first few cleaned phrases.
temp[:5]

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 'A series of escapades demonstrating the adage that what is good for the goose',
 'A series',
 'A',
 'series']

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's simple_preprocess.

    Every item is converted with str() first, then split into a list of
    tokens; one list is yielded per input sentence. deacc=True asks gensim to
    strip accent marks. The sample output in this notebook shows the tokens
    come back lower-cased, without punctuation, and with one-letter words
    such as 'A' dropped.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
        

# Materialise the generator: one token list per cleaned phrase.
data_words = [tokens for tokens in sent_to_words(temp)]

print(data_words[:10])

[['series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'story'], ['series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose'], ['series'], [], ['series'], ['of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose'], ['of'], ['escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose'], ['escapades'], ['demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose']]


In [None]:
# Sanity check: number of tokenised phrases (one entry per training row).
len(data_words)

156060

In [None]:
def detokenize(text):
    """Join a list of tokens back into one string using NLTK's
    TreebankWordDetokenizer and return that string."""
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(text)

In [None]:
# Re-assemble every token list into a single cleaned sentence string.
data = [detokenize(tokens) for tokens in data_words]
print(data[:5])

['series of escapades demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amounts to much of story', 'series of escapades demonstrating the adage that what is good for the goose', 'series', '', 'series']


In [None]:
# Convert the list of cleaned sentences to a NumPy array so it can be sliced
# into training/validation parts further down.
data_sent = np.array(data)

In [None]:
# Integer sentiment ids -> one-hot float32 rows with 5 classes.
sentiment_ids = np.array(training_data['Sentiment'].values.tolist())
data_labels = tf.keras.utils.to_categorical(sentiment_ids, 5, dtype="float32")

In [None]:
#play with these to see their effect
vocab_size = 5000        # Tokenizer num_words and Embedding input size
embedding_dim = 20       # intended embedding width
max_length = 200         # every sequence is padded/truncated to this length
trunc_type = 'post'      # truncate sequences at the end
padding_type = 'post'    # pad sequences at the end
oov_tok = '<OOV>'        # token used by the Tokenizer for out-of-vocabulary words
training_size = 100000   # first N rows are used for training, the rest for validation

In [None]:
def preprocessing(sent_list,labels,tokenizer = None):
  """Encode sentences as padded integer sequences and return them with labels.

  Fix: a fresh Tokenizer used to be fitted on every call, so two calls (for
  example training, then validation) produced incompatible word->index
  mappings. The tokenizer fitted by the first call is now remembered on
  `preprocessing.fitted_tokenizer` and reused by later calls, so every split
  is encoded with the same vocabulary.

  Args:
    sent_list: sequence of sentence strings to encode.
    labels: array-like of labels aligned with `sent_list`.
    tokenizer: optional, already fitted Tokenizer to encode with. When omitted,
      the remembered tokenizer is used; if there is none yet, a new one is
      fitted on `sent_list` (so call this on the training split first).

  Returns:
    Tuple `(sent_padded, labels)` of NumPy arrays; `sent_padded` rows have
    length `max_length`.

  Note:
    Re-running the cell that defines this function (or
    `del preprocessing.fitted_tokenizer`) discards the remembered tokenizer.
  """
  if tokenizer is None:
    tokenizer = getattr(preprocessing, 'fitted_tokenizer', None)
  if tokenizer is None:
    # First call: learn the vocabulary from these sentences and remember it.
    tokenizer = Tokenizer(num_words = vocab_size,oov_token = oov_tok)
    tokenizer.fit_on_texts(sent_list)
    preprocessing.fitted_tokenizer = tokenizer

  sent_sequences = tokenizer.texts_to_sequences(sent_list)
  sent_padded = pad_sequences(sent_sequences,maxlen = max_length, padding = padding_type, truncating = trunc_type)
  sent_padded = np.array(sent_padded)
  labels = np.array(labels)
  return sent_padded,labels

In [None]:
# Positional split: the first `training_size` rows train the model, the
# remaining rows are held out for validation.
training_sent, val_sent = data_sent[:training_size], data_sent[training_size:]
training_lab, val_lab = data_labels[:training_size], data_labels[training_size:]

In [None]:
# Encode and pad the training sentences; labels come back as a NumPy array.
training_padded,training_labels = preprocessing(training_sent,training_lab)

In [None]:
# Encode and pad the validation sentences; labels come back as a NumPy array.
val_padded,val_labels = preprocessing(val_sent,val_lab)

In [None]:
# Inspect the one-hot label of the first training example.
print(training_labels[0])

[0. 1. 0. 0. 0.]


In [None]:
# Model 1: embedding -> single LSTM -> 5-way softmax.
model1 = Sequential()
# Use the shared `embedding_dim` setting instead of a hard-coded width.
# mask_zero=True: sequences are post-padded with 0 up to `max_length`; masking
# keeps the LSTM from running over the padding after the real tokens
# (the Keras Tokenizer reserves index 0, so it only ever means "padding").
model1.add(Embedding(vocab_size,embedding_dim,mask_zero = True))
model1.add(LSTM(15,dropout = 0.5))
model1.add(Dense(5,activation = 'softmax'))

In [None]:
# One-hot labels -> categorical cross-entropy. `metrics` is passed as a list,
# the documented form; a bare string is not accepted by every Keras version.
model1.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop',metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of model1.
model1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 20)          100000    
                                                                 
 lstm_6 (LSTM)               (None, 15)                2160      
                                                                 
 dense_6 (Dense)             (None, 5)                 80        
                                                                 
Total params: 102,240
Trainable params: 102,240
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 30
# Save the full model whenever validation accuracy improves. The deprecated
# `period=1` argument is replaced by its equivalent, save_freq='epoch'.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', save_freq='epoch',save_weights_only=False)
history = model1.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(val_padded, val_labels), verbose=1,callbacks=[checkpoint1])





Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.49588, saving model to best_model1.hdf5
Epoch 2/30
Epoch 2: val_accuracy did not improve from 0.49588
Epoch 3/30
Epoch 3: val_accuracy did not improve from 0.49588
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.49588
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.49588
Epoch 6/30
Epoch 6: val_accuracy did not improve from 0.49588
Epoch 7/30
Epoch 7: val_accuracy did not improve from 0.49588
Epoch 8/30
Epoch 8: val_accuracy did not improve from 0.49588
Epoch 9/30
Epoch 9: val_accuracy did not improve from 0.49588
Epoch 10/30
Epoch 10: val_accuracy did not improve from 0.49588
Epoch 11/30
Epoch 11: val_accuracy did not improve from 0.49588
Epoch 12/30
Epoch 12: val_accuracy did not improve from 0.49588
Epoch 13/30
Epoch 13: val_accuracy did not improve from 0.49588
Epoch 14/30
Epoch 14: val_accuracy did not improve from 0.49588
Epoch 15/30
Epoch 15: val_accuracy did not improve from 0.49588
Epoch 16/30
Epoch 16: v

In [None]:
# Model 2: embedding -> bidirectional LSTM -> 5-way softmax.
model2 = Sequential()
# mask_zero=True: inputs are post-padded with 0 up to `max_length`; masking
# makes both LSTM directions skip the padding positions.
model2.add(Embedding(vocab_size,40,input_length = max_length,mask_zero = True))
model2.add(Bidirectional(LSTM(20,dropout=0.7)))
model2.add(Dense(5,activation = 'softmax'))

In [None]:
# One-hot labels -> categorical cross-entropy. `metrics` is passed as a list,
# the documented form; a bare string is not accepted by every Keras version.
model2.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop',metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of model2.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 40)           200000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 40)               9760      
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 5)                 205       
                                                                 
Total params: 209,965
Trainable params: 209,965
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 30
# Save model2's best weights to their own file. This checkpoint previously
# wrote to "best_model1.hdf5" and overwrote model1's best checkpoint.
# The deprecated `period=1` argument is replaced by save_freq='epoch'.
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', save_freq='epoch',save_weights_only=False)
history = model2.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(val_padded, val_labels), verbose=1,callbacks=[checkpoint2])



Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.48109, saving model to best_model1.hdf5
Epoch 2/30
Epoch 2: val_accuracy did not improve from 0.48109
Epoch 3/30
Epoch 3: val_accuracy did not improve from 0.48109
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.48109
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.48109
Epoch 6/30
Epoch 6: val_accuracy did not improve from 0.48109
Epoch 7/30
Epoch 7: val_accuracy did not improve from 0.48109
Epoch 8/30
Epoch 8: val_accuracy did not improve from 0.48109
Epoch 9/30
Epoch 9: val_accuracy did not improve from 0.48109
Epoch 10/30
Epoch 10: val_accuracy did not improve from 0.48109
Epoch 11/30
Epoch 11: val_accuracy did not improve from 0.48109
Epoch 12/30
Epoch 12: val_accuracy did not improve from 0.48109
Epoch 13/30
Epoch 13: val_accuracy did not improve from 0.48109
Epoch 14/30
Epoch 14: val_accuracy did not improve from 0.48109
Epoch 15/30
Epoch 15: val_accuracy did not improve from 0.48109
Epoch 16/30
Epoch 16: v

In [None]:
def _regularized_conv():
    """Build one Conv1D block: 20 filters, kernel size 6, ReLU, with L1+L2
    kernel regularisation and L2 bias regularisation (all 2e-3)."""
    return Conv1D(
        20,
        6,
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),
        bias_regularizer=regularizers.l2(2e-3),
    )


# Model 3: embedding -> conv -> max-pool -> conv -> global max-pool -> softmax.
model3 = Sequential([
    Embedding(vocab_size, 40, input_length=max_length),
    _regularized_conv(),
    MaxPooling1D(5),
    _regularized_conv(),
    GlobalMaxPooling1D(),
    Dense(5, activation='softmax'),
])

In [None]:
# Compile model3 with RMSprop and categorical cross-entropy. The metric is
# registered under the short name 'acc', unlike model1/model2 ('accuracy').
model3.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['acc'])

In [None]:
# Train model3 for 30 epochs, validating on the held-out split each epoch.
# No checkpoint callback is attached here; `history` is overwritten.
history = model3.fit(training_padded, training_labels, epochs=30, validation_data=(val_padded, val_labels), verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
