In [None]:
# Mounting drive that has access to dataset
# (the dataset paths used later in this notebook live under /content/gdrive)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Installing keras
# The leading "!" runs pip as a shell command; -q keeps the install output quiet.
!pip install -q keras

In [None]:
# Importing necessary libraries
import keras
import tensorflow as tf
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import string
import numpy as np
import re

In [None]:
# Locations of the test and training CSV files on the mounted Google Drive
test_path = '/content/gdrive/MyDrive/COM3025_Coursework/test.csv'
train_path = '/content/gdrive/MyDrive/COM3025_Coursework/train.csv'

In [None]:
# Read the training and test CSV files into pandas dataframes
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the size of the training set and preview its first ten rows
print(train_df.shape)
train_df.head(10)

(27481, 4)


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive


In [None]:
# Drop every training row that has a missing value in any column,
# then show the per-column count of remaining missing values
train_df = train_df.dropna(axis="index", how="any")
train_df.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [None]:
# Remove NA values from the *test* dataframe.
# The source must be test_df itself: assigning from train_df here would replace
# the held-out test set with a copy of the training data.
test_df = test_df.dropna(axis = 0)
test_df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [None]:
# Removes punctuation, symbols and web URLs from the text in the dataset
def clean_text(data):
    '''Normalise one piece of raw text before tokenisation.

    The input is converted with ``str`` and lowercased, then the following are
    removed, in this order: text in square brackets, links (``http(s)://...``
    and ``www....``), ``<...>`` tags, punctuation, newlines and words that
    contain a digit.

    Whitespace is not collapsed, so a removed token can leave leading or
    doubled spaces behind.

    Args:
        data: The text to clean. Non-string values are passed through ``str``
            first.

    Returns:
        The cleaned, lowercased string.
    '''
    # Raw string literals keep the regex escapes (\[ \S \w \d) out of Python's
    # own string-escape processing, which warns about unrecognised escapes.
    data = str(data).lower()
    data = re.sub(r'\[.*?\]', '', data)              # [bracketed] text
    data = re.sub(r'https?://\S+|www\.\S+', '', data)  # links
    data = re.sub(r'<.*?>+', '', data)               # <tags>
    data = re.sub('[%s]' % re.escape(string.punctuation), '', data)
    data = re.sub(r'\n', '', data)
    data = re.sub(r'\w*\d\w*', '', data)             # words containing digits
    return data


In [None]:
# Run clean_text over both text columns of the training data, then preview
for column in ("text", "selected_text"):
    train_df[column] = train_df[column].apply(clean_text)
train_df.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,id have responded if i were going,neutral
1,549e992a42,sooo sad i will miss you here in san diego,sooo sad,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,sons of why couldnt they put them on the rel...,sons of,negative
5,28b57f3990,some shameless plugging for the best rangers...,some shameless plugging for the best rangers...,neutral
6,6e0c6d75b1,feedings for the baby are fun when he is all ...,fun,positive
7,50e14c0bb8,soooo high,soooo high,neutral
8,e050245fbd,both of you,both of you,neutral
9,fc2cbefa9d,journey wow u just became cooler hehe is tha...,wow u just became cooler,positive


In [None]:
# Run clean_text over the text column of the testing data, then preview
test_df["text"] = test_df["text"].apply(clean_text)
test_df.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,"I`d have responded, if I were going",neutral
1,549e992a42,sooo sad i will miss you here in san diego,Sooo SAD,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,sons of why couldnt they put them on the rel...,"Sons of ****,",negative
5,28b57f3990,some shameless plugging for the best rangers...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,feedings for the baby are fun when he is all ...,fun,positive
7,50e14c0bb8,soooo high,Soooo high,neutral
8,e050245fbd,both of you,Both of you,neutral
9,fc2cbefa9d,journey wow u just became cooler hehe is tha...,Wow... u just became cooler.,positive


In [None]:
# Separate the train, validation and testing fields into their own series.
# Note: despite its name, val_percent is the fraction of rows kept for
# training; the remaining rows form the validation set.
val_percent = 0.8
num_rows = len(train_df)
train_rows = round(num_rows*val_percent)

# Positional split: the first train_rows rows train, the rest validate
train_part = train_df.iloc[:train_rows]
val_part = train_df.iloc[train_rows:num_rows]

train_text = train_part["text"]
train_sent = train_part["sentiment"]
train_targ = train_part["selected_text"]

val_text = val_part["text"]
val_sent = val_part["sentiment"]
val_targ = val_part["selected_text"]

test_text = test_df["text"]
test_sent = test_df["sentiment"]

In [None]:
# Maps a sentiment label onto a numeric code between -1 and 1
def sentiment_to_val(sentiment):
  """Return 1 for 'positive', -1 for 'negative' and 0 for anything else."""
  if sentiment == 'negative':
    return -1
  if sentiment == 'positive':
    return 1
  return 0
    
# Convert the sentiment labels to numeric codes and one-hot encode them
train_sent = pd.get_dummies(train_sent.apply(sentiment_to_val).to_numpy()).values

test_sent = pd.get_dummies(test_sent.apply(sentiment_to_val).to_numpy()).values

In [None]:
# Import keras layers and tools
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SpatialDropout1D
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

In [None]:
# Create a dictionary and tokenize the text from the dataset
max_review_length = 15   # padded/truncated length of each input sequence
max_target_length = 5    # padded/truncated length of each target sequence
corpus_size = 50000      # passed to the Tokenizer as num_words
tokenizer = Tokenizer(num_words=corpus_size)
# Build the vocabulary from the training split only. Fitting on all of
# train_df would let the validation rows influence the word index.
tokenizer.fit_on_texts(train_text)

# Inputs: token-id sequences of the full text
train_x = tokenizer.texts_to_sequences(train_text)
train_x = sequence.pad_sequences(train_x, maxlen= max_review_length)
print('Shape of input data:', train_x.shape)

# Targets: token-id sequences of the selected text
train_y = tokenizer.texts_to_sequences(train_targ)
train_y = sequence.pad_sequences(train_y, maxlen= max_target_length)
print('Shape of target data:', train_y.shape)

Shape of input data: (21984, 15)
Shape of target data: (21984, 5)


In [None]:
# Normalize values
max_val = 9999
min_val = 0
def NormalizeData(data, low=None, high=None):
    """Min-max scale ``data`` so that ``low`` maps to 0 and ``high`` maps to 1.

    Args:
        data: A number or array of values to scale.
        low: Lower bound of the original range; defaults to ``min_val``.
        high: Upper bound of the original range; defaults to ``max_val``.

    Returns:
        ``(data - low) / (high - low)``.

    Raises:
        ValueError: If ``high`` equals ``low`` (the range is empty).
    """
    low = min_val if low is None else low
    high = max_val if high is None else high
    if high == low:
        raise ValueError("high and low must differ")
    # The range must be parenthesised: "/ high - low" would divide by high
    # alone and then subtract low, which is only correct when low is 0.
    return (data - low) / (high - low)

def DenormalizeData(data, low=None, high=None):
    """Invert ``NormalizeData``: map values scaled to 0..1 back to ``low``..``high``.

    Args:
        data: A number or array of scaled values.
        low: Lower bound of the original range; defaults to ``min_val``.
        high: Upper bound of the original range; defaults to ``max_val``.

    Returns:
        ``data * (high - low) + low``.
    """
    low = min_val if low is None else low
    high = max_val if high is None else high
    return data * (high - low) + low

# Scale the padded training inputs and targets
train_x_scale, train_y_scale = NormalizeData(train_x), NormalizeData(train_y)

In [None]:
# Create LSTM Model
# Embedding -> two stacked LSTMs -> Dense layer with max_target_length outputs.
EMBEDDING_DIM = 1024
model = Sequential()
# NOTE(review): an Embedding layer looks rows up by integer token id, but this
# model is later fitted on train_x_scale (token ids divided down by
# NormalizeData). Confirm whether the unscaled train_x was intended as input.
model.add(Embedding(corpus_size, EMBEDDING_DIM, input_length=max_review_length, mask_zero=True))
model.add(LSTM(2048, dropout=0.2, return_sequences=True))
model.add(LSTM(512, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(max_target_length, activation='relu'))
# `learning_rate` is the supported argument name (`lr` is a deprecated alias);
# the deprecated `decay=0.0` argument is dropped because 0 is its default.
opt = keras.optimizers.RMSprop(learning_rate=0.005, rho=0.9, epsilon=1e-08)
# NOTE(review): the targets are scaled token ids rather than class
# probabilities, so 'categorical_crossentropy' and the 'accuracy' metric look
# mismatched with this output layer - TODO confirm the intended loss.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 1024)          51200000  
_________________________________________________________________
lstm (LSTM)                  (None, 15, 2048)          25174016  
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               5244928   
_________________________________________________________________
dense (Dense)                (None, 5)                 2565      
Total params: 81,621,509
Trainable params: 81,621,509
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Fit the model on the scaled training data
epochs = 10
batch_size = 128
# Keras holds back 10% of the rows passed to fit for per-epoch validation
fit_options = dict(epochs=epochs, batch_size=batch_size, validation_split=0.1)
model.fit(train_x_scale, train_y_scale, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faae0202d90>

In [None]:
# Tokenize and pad the validation inputs and targets
val_x = sequence.pad_sequences(tokenizer.texts_to_sequences(val_text), maxlen=max_review_length)
print('Shape of input data:', val_x.shape)

val_y = sequence.pad_sequences(tokenizer.texts_to_sequences(val_targ), maxlen=max_target_length)
print('Shape of target data:', val_y.shape)

# Apply the same scaling that was used for the training arrays
val_x_scale, val_y_scale = NormalizeData(val_x), NormalizeData(val_y)

# Evaluate the model on the validation split and report loss and accuracy
loss, acc = model.evaluate(val_x_scale, val_y_scale, batch_size=batch_size, verbose=2)
print(f"loss: {loss}")
print(f"Validation accuracy: {acc}")

Shape of input data: (5496, 15)
Shape of target data: (5496, 5)
43/43 - 5s - loss: 0.3709 - accuracy: 0.5182
loss: 0.37087470293045044
Validation accuracy: 0.5181950330734253


In [None]:
# Tokenize and pad the testing inputs
test_x = sequence.pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=max_review_length)
print('Shape of input data:', test_x.shape)

# Apply the same scaling that was used for the training inputs
test_x_scale = NormalizeData(test_x)

Shape of input data: (3534, 15)


In [None]:
# Prediction on test dataset using model
pred = model.predict(test_x_scale)
# The model was fitted on targets scaled by NormalizeData, so its outputs are
# on that scale too. Map them back to token-id scale before rounding,
# otherwise the rounded values cannot be looked up as word indices.
pred_rounded = np.round(DenormalizeData(pred)).astype(int)
pred_text = tokenizer.sequences_to_texts(pred_rounded)
pred_text[0]

'in for me so like'

In [None]:
def jaccard(str1, str2):
  """Jaccard similarity between the lowercased word sets of two strings.

  Both strings are lowercased and split on whitespace. The result is the size
  of the intersection of the two word sets divided by the size of their union,
  or 0 when both sets are empty.
  """
  words1 = set(str1.lower().split())
  words2 = set(str2.lower().split())

  # |A| + |B| - |A & B| is the size of the union
  union_size = len(words1 | words2)
  if union_size == 0:
    return 0
  return float(len(words1 & words2) / union_size)

In [None]:
# Plain Python lists of the validation texts and their target selections
val_text_list, val_targ_list = val_text.tolist(), val_targ.tolist()

In [None]:
# Baseline: Jaccard score when the whole (cleaned) text is used as the
# predicted selection. No model output is involved in this number.
jacc_scores = [jaccard(text, target) for text, target in zip(val_text_list, val_targ_list)]

print(len(jacc_scores))
print(np.mean(jacc_scores))

# Model score: decode the model's predictions for the validation inputs back
# into words and compare those with the target selections, so that the trained
# model itself is evaluated with the Jaccard metric.
val_pred = np.round(DenormalizeData(model.predict(val_x_scale))).astype(int)
val_pred_list = tokenizer.sequences_to_texts(val_pred)
model_jacc_scores = [jaccard(predicted, target) for predicted, target in zip(val_pred_list, val_targ_list)]
print(np.mean(model_jacc_scores))

5496
0.603229321631829
