In [1]:
# Author - entire code written by Krishna Sirisha Motamarry
# Importing Keras
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Import necessary libraries
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from gensim.models import Word2Vec;



In [3]:
# Read the training data from the csv file
header = ['label','comment']
data = pd.read_table('train-balanced.csv',
                    sep='\t', 
                   # delimiter=',', 
                    names=header,
                    usecols=[0,1],
                   # usecols=[0,1,9],
                   # dtype={'label':int,'comment':str,'parent_comment':str},
                     dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [4]:
#read the test data from the csv file
header = ['label','comment']
testdata = pd.read_table('test-balanced.csv',
                    sep='\t', 
                   # delimiter=',', 
                    names=header,
                    usecols=[0,1],
                   # usecols=[0,1,9],
                   # dtype={'label':int,'comment':str,'parent_comment':str},
                     dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [None]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [6]:
# Replacing the slangs and converting the comments to sequences
from keras.preprocessing.text import text_to_word_sequence
def comment_clean(user_comment):
    comment_words = re.sub(r"[^a-zA-Z0-9\s\']","",user_comment)         
    comment_words=comment_words.split()
    for word in comment_words:
        if word.upper() in slangdict.keys():
            user_comment = user_comment.replace(word.upper(),slangdict[word.upper()])
        elif word in slangdict.keys():
            user_comment = user_comment.replace(word,slangdict[word])         
    result = text_to_word_sequence(user_comment,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ')
    return result

In [7]:
#clean each comment in train and test data
import time
start_time = time.time()
data['comment'] = data.comment.apply(comment_clean)
testdata['comment'] = testdata.comment.apply(comment_clean)
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  48.830397844314575


In [8]:
#Concatening the train and test data to create embeddings
frames = [data,testdata]
word2vecinput = pd.concat(frames)

In [10]:
#Converting the data to lists
usercomment1 = data['comment'].values.tolist()
usercomment2 = testdata['comment'].values.tolist()
usercomment = word2vecinput['comment'].values.tolist()

In [13]:
# Creating the word2vec embedding model
embedding_dim = 100
model = Word2Vec(usercomment, size=embedding_dim, window=5,workers=4, min_count=1)
words = list(model.wv.vocab)
print(len(words))


207340


In [14]:
# Saving the model to a file
file = "word2vec_embedding_actualtraintestdata.txt"
model.wv.save_word2vec_format(file,binary=False)

In [15]:
# Tokeninzing training data
from tensorflow.python.keras.preprocessing.text import Tokenizer
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(usercomment1)
seq1 = tokenizer1.texts_to_sequences(usercomment1)

In [16]:
# Tokeninzing test data
from tensorflow.python.keras.preprocessing.text import Tokenizer
tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(usercomment2)
seq2 = tokenizer2.texts_to_sequences(usercomment2)

In [17]:
# Word Index of training data
word_index1 = tokenizer1.word_index
print(len(word_index1))

182577


In [18]:
# Word Index of test data
word_index2 = tokenizer2.word_index
print(len(word_index2))

85106


In [19]:
# Comments are padded with the sequences
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
comment_pad1 = pad_sequences(seq1, maxlen=2000)
comment_pad2 = pad_sequences(seq2, maxlen=2000)

In [20]:
# Creating label data for training data
labeldata1 = data['label'].values
print(labeldata1.shape)
print(comment_pad1.shape)

(1010826,)
(1010826, 2000)


In [21]:
# Creating label data for test data
labeldata2 = testdata['label'].values
print(labeldata2.shape)
print(comment_pad2.shape)

(251608,)
(251608, 2000)


In [22]:
# Creating embedding dictionary
import os
import numpy as np
embeddings_dict = {}
f = open("word2vec_embedding_actualtraintestdata.txt", encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_dict[word] = coefs
f.close()

In [23]:
# Creating an embedding matrix 
num_words = len(word_index1)+len(word_index2)+1
embedding_matrix = np.zeros((num_words, 100))

for word,i in word_index1.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
for word,i in word_index2.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [24]:
# Assigning train and test data
X_train_pad = comment_pad1
y_train=labeldata1
X_test_pad = comment_pad2
y_test = labeldata2

In [25]:
# Simple Neural Network with Word2Vec Embedding, 3 Dense layers
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.initializers import Constant
from keras import layers
from keras import callbacks

cb = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, mode='auto')

model = Sequential()
embedding_layer = Embedding(num_words,embedding_dim,embeddings_initializer= Constant(embedding_matrix),input_length=2000,trainable=False)
model.add(embedding_layer)
model.add(Flatten())
model.add(layers.Dense(64, activation='tanh'))
model.add(layers.Dense(64, activation='tanh'))
model.add(layers.Dense(1, activation='sigmoid'))
#model.add(layers.Dense(1, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='mse',
metrics=['accuracy'])
model.fit(X_train_pad, y_train, epochs=6, batch_size=512,verbose=1)#,callbacks=[cb])
results = model.evaluate(X_test_pad, y_test)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [28]:
# Model Summary
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 100)         26768400  
_________________________________________________________________
flatten_1 (Flatten)          (None, 200000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                12800064  
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 39,572,689
Trainable params: 12,804,289
Non-trainable params: 26,768,400
_________________________________________________________________
None


In [29]:
# Printing Accuracy score
print(results[1]*100)

60.646720295251676


In [22]:
#from keras.models import Sequential
#from keras.layers import Dense, Embedding, GRU
#from keras.layers.embeddings import Embedding
#from keras.initializers import Constant

#model = Sequential()
#embedding_layer = Embedding(num_words,embedding_dim,embeddings_initializer= Constant(embedding_matrix),input_length=2000,trainable=False)
#model.add(embedding_layer)
#model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
#model.add(Dense(1,activation='sigmoid'))

#model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 100)         18257800  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 18,270,601
Trainable params: 12,801
Non-trainable params: 18,257,800
_________________________________________________________________
None
