In [1]:
# Author - entire code written by Krishna Sirisha Motamarry
# Import Necessary libraries
import keras
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from gensim.models import Word2Vec;

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Reading train data file
header = ['label','comment']
data = pd.read_table('train-balanced.csv',
                    sep='|', 
                   # delimiter=',', 
                    names=header,
                    usecols=[0,1],
                   # usecols=[0,1,9],
                   # dtype={'label':int,'comment':str,'parent_comment':str},
                    header=0,
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [None]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [4]:
# Replacing the slangs and converting the comments to sequences
from keras.preprocessing.text import text_to_word_sequence
def comment_clean(user_comment):
    comment_words = re.sub(r"[^a-zA-Z0-9\s\']","",user_comment)         
    comment_words=comment_words.split()
    #print(comment_words)
    for word in comment_words:
        if word.upper() in slangdict.keys():
            user_comment = user_comment.replace(word.upper(),slangdict[word.upper()])
        elif word in slangdict.keys():
            user_comment = user_comment.replace(word,slangdict[word]) 
    #print(user_comment)        
    result = text_to_word_sequence(user_comment,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ')
    #print(result)
    return result

In [5]:
#clean each comment in training data
import time
start_time = time.time()
data['comment'] = data.comment.apply(comment_clean)
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  10.039142608642578


In [6]:
# Creating the word2vec embedding model
usercomment = data['comment'].values.tolist()
embedding_dim = 100
model = Word2Vec(usercomment, size=embedding_dim, window=5,workers=4, min_count=1)
words = list(model.wv.vocab)
print(len(words))


166639


In [7]:
# Saving the model to a file
file = "word2vec_embedding1.txt"
model.wv.save_word2vec_format(file,binary=False)

In [8]:
# Tokeninzing training data
from tensorflow.python.keras.preprocessing.text import Tokenizer
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(usercomment)
seq = tokenizer1.texts_to_sequences(usercomment)

In [9]:
word_index = tokenizer1.word_index
print(len(word_index))

166639


In [10]:
# Generation pad sequences for comments
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
comment_pad = pad_sequences(seq, maxlen=2000)

In [11]:
# Label data for training data
labeldata = data['label'].values
print(labeldata.shape)
print(comment_pad.shape)

(978039,)
(978039, 2000)


In [12]:
# Creating embedding dictionary
import os
import numpy as np
embeddings_dict = {}
f = open("word2vec_embedding1.txt", encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_dict[word] = coefs
f.close()

In [13]:
# Creating embedding matrix for the words
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, 100))

for word,i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
# Creating 20% as validation data out of training data to initially run the model
validation_split = 0.2
indices = np.arange(comment_pad.shape[0])
np.random.shuffle(indices)
comment_pad = comment_pad[indices]
labeldata = labeldata[indices]
validation_samples = int(validation_split*comment_pad.shape[0])

X_train_pad = comment_pad[:-validation_samples]
y_train = labeldata[:-validation_samples]
X_test_pad = comment_pad[-validation_samples:]
y_test = labeldata[-validation_samples:]

print(X_train_pad.shape)
print(y_train.shape)
print(X_test_pad.shape)
print(y_test.shape)

(782432, 2000)
(782432,)
(195607, 2000)
(195607,)


In [17]:
# Created LSTM model with embedding layer, LSTM and Dense layer
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras import layers

model = Sequential()
embedding_layer = Embedding(num_words,embedding_dim,embeddings_initializer= Constant(embedding_matrix),input_length=2000,trainable=False)
model.add(embedding_layer)
model.add(LSTM(100))
#model.add(LSTM(128)) 
model.add(layers.Dense(1, activation='sigmoid'))
#model.add(layers.Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.fit(X_train_pad, y_train, epochs=3, batch_size=512,verbose=1)
results = model.evaluate(X_test_pad, y_test)
print(results)

Epoch 1/3
   512/782432 [..............................] - ETA: 23:52:31 - loss: 0.7207 - acc: 0.4121

KeyboardInterrupt: 