In [1]:
# importing all the libraries we will need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,RNN,GRU,Embedding,Flatten,Dropout,Dense,SimpleRNN,SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec

In [2]:
# checking for the Gpu
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please ensure you have installed TensorFlow correctly')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

Default GPU Device: /device:GPU:0


In [3]:
# reading the train dataset and spliting it into X and Y
train_dataset = pd.read_csv(r"D:\Data\New folder\train.csv")
train_comments = train_dataset.iloc[:,1].values   # the training data in one dim array
train_labels = train_dataset.iloc[:,2:].values          # the training labels in 2 dim array

# reading the test dataset without labels
test_inputs = pd.read_csv(r"D:\Data\New folder\test.csv")
test_comments = test_inputs.iloc[:,1:].values

# reading the testset's labels
testLabels = pd.read_csv(r"D:\Data\New folder\test_labels.csv")
test_labels = testLabels.iloc[:,1:].values

# # merging the two sets together for easier use
test_set = pd.merge(test_inputs,testLabels,how='left',on='id')

In [4]:
# making a sum attribute to sum all the values
test_set['sum'] = test_set['toxic'] + test_set['severe_toxic'] + test_set['obscene'] + test_set['threat'] \
+ test_set['insult'] + test_set['identity_hate']

#droping the id attribute
test_set.drop('id',axis=1, inplace=True)

merge = test_set[test_set['sum'] != -6]

Y_test = merge.iloc[:,1:7].values
X_test = merge.iloc[:,0].values

In [5]:
# putting every comment in tokens
tokens = []
tokens = [word_tokenize(str(sentence)) for sentence in train_comments]

# replacing all non characters by " " then spliting on the " "
reg = []
for word in tokens:
    s = re.sub('[^A-Za-z]'," ",str(word))
    x = re.split("\s",s)
    reg.append(x)

# removing all spaces
for i in reg:
    while '' in i:
        i.remove('')

# Lowercasing every word
low_case = []
for i in reg:
    i = [x.lower() for x in i]
    low_case.append(i)

# taking the root form of all the words
lemmatization = []
for sen in low_case:
    root = [WordNetLemmatizer().lemmatize(word) for word in sen]
    lemmatization.append(root)
    
# taking rid of all the stopwords    
filtered_sentences = []
Stopwords = set(stopwords.words('english'))
for sen in lemmatization:
    filterd_sen = [word for word in sen if word not in Stopwords]
    filtered_sentences.append(filterd_sen)

# taking words that are more than 2 letters
words = []
for sen in filtered_sentences:
    x = [word for word in sen if len(word) > 2]
    words.append(x)

#converting the words into vectors
model_wv = Word2Vec(words)
words_vec = model_wv.wv
vocab = words_vec.vocab.items()

embed_matrix = words_vec.vectors
word_to_id = {k:v.index for k,v in vocab}

# grouping the unkown words
UNKs = 0
UNK_index = 0
UNK_token = "UNK"
UNK_vec = embed_matrix.mean(axis=0)

embed_matrix = np.insert(embed_matrix,[UNK_index],[UNK_vec],axis=0)
word_to_id = {word:(index+1) if index >= UNK_index else index for word,index in word_to_id.items()}

word_to_id[UNK_token] = UNK_index

# taking the word's index that is known else take the unknown index 
sequences = []
for sen in words:
    x = []
    for word in sen:
        if word in word_to_id:
            x.append(word_to_id.get(word))
        else:
            x.append(UNK_index)
            UNKs+=1
    sequences.append(x)

# padding the seq to 100    
train_X = pad_sequences(sequences,maxlen=100,padding="post")

In [6]:
# filtring the non characters and numbers
x_test = []
for sentence in X_test:
    x_test.append(text_to_word_sequence(str(sentence),filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' '))

# filtring the stopwords    
filter_test = []
for sent in x_test:
    tokens = [w for w in sent if w not in Stopwords]
    filter_test.append(tokens)

# Converting the text into sequences using 
L = []
for sent in x_test:
    Z = []
    for word in sent:
        if word in word_to_id and len(word)>2:
            Z.append(word_to_id.get(word))
        else:
            Z.append(UNK_index)
            UNKs+=1
    L.append(Z)

test_data = pad_sequences(L, maxlen=100, padding= 'post',dtype='float')

In [7]:
# building the model
input_len = len(embed_matrix)

model = Sequential()
model.add(Embedding(input_len,100,input_length=100,weights=[embed_matrix]))
model.add(LSTM(120,activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(64,activation='tanh'))
model.add(Dense(6,activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          3664400   
_________________________________________________________________
lstm (LSTM)                  (None, 120)               106080    
_________________________________________________________________
dropout (Dropout)            (None, 120)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                7744      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 3,778,614
Trainable params: 3,778,614
Non-trainable params: 0
_________________________________________________________________


In [8]:
# compiling the model choosing adam as the optimizer and crossentropy as the loss function
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [9]:
# training the model 
epochs = 5      # with low number of epochs
batch_size = 256
trained_model = model.fit(train_X,train_labels,epochs=epochs,batch_size=batch_size)

Train on 159571 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# saving the model
# model.save("model_01.h5")

In [11]:
# evaluating
model.evaluate(test_data,Y_test)



[0.35841593709502245, 0.99760854]