In [473]:
import numpy as np
import pandas as pd

In [474]:
df = pd.read_csv('train.csv')

In [475]:
df.shape

(5000, 3)

In [476]:
df.head()

Unnamed: 0,id,comment_text,toxic
0,e617e2489abe9bca,"""\r\n\r\n A barnstar for you! \r\n\r\n The De...",0
1,9250cf637294e09d,"""\r\n\r\nThis seems unbalanced. whatever I ha...",0
2,ce1aa4592d5240ca,"Marya Dzmitruk was born in Minsk, Belarus in M...",0
3,48105766ff7f075b,"""\r\n\r\nTalkback\r\n\r\n Dear Celestia... """,0
4,0543d4f82e5470b6,New Categories \r\n\r\nI honestly think that w...,0


In [477]:
df = df.drop('id',axis=1)

Split Data into Training and Test Data

In [478]:
from sklearn.model_selection import train_test_split

In [479]:
X_train, X_test, y_train, y_test = train_test_split(
    df['comment_text'],
    df['toxic'],
    test_size=0.2, 
    random_state=42
)

In [480]:
X_train.shape

(4000,)

In [481]:
X_test.shape

(1000,)

# Build the Tokenizer

In [482]:
import tensorflow as tf

In [483]:
top_words = 10000 #Vocablury size
t = tf.keras.preprocessing.text.Tokenizer(num_words=top_words) # num_words -> Vocablury size

In [484]:
#Fit tokenizer with actual training data
t.fit_on_texts(X_train.tolist())

In [485]:
#Vocabulary
t.word_index

{'the': 1,
 '\r': 2,
 'to': 3,
 'i': 4,
 'you': 5,
 'of': 6,
 'a': 7,
 'and': 8,
 'is': 9,
 'in': 10,
 'that': 11,
 'it': 12,
 'for': 13,
 'this': 14,
 'on': 15,
 'not': 16,
 'be': 17,
 'as': 18,
 'are': 19,
 'have': 20,
 'your': 21,
 'if': 22,
 'was': 23,
 'with': 24,
 'or': 25,
 'article': 26,
 'but': 27,
 'my': 28,
 'page': 29,
 'wikipedia': 30,
 'an': 31,
 'by': 32,
 'do': 33,
 'at': 34,
 'talk': 35,
 'from': 36,
 'ass': 37,
 'so': 38,
 'about': 39,
 'me': 40,
 'fuck': 41,
 'can': 42,
 'please': 43,
 'there': 44,
 'what': 45,
 'would': 46,
 'has': 47,
 'all': 48,
 'no': 49,
 'will': 50,
 'one': 51,
 'he': 52,
 'just': 53,
 'like': 54,
 'they': 55,
 'been': 56,
 'should': 57,
 'any': 58,
 'which': 59,
 "don't": 60,
 'am': 61,
 'more': 62,
 'some': 63,
 'we': 64,
 'here': 65,
 'also': 66,
 'his': 67,
 'think': 68,
 'who': 69,
 'know': 70,
 'name': 71,
 'see': 72,
 'other': 73,
 'edit': 74,
 'why': 75,
 'how': 76,
 'use': 77,
 'may': 78,
 'when': 79,
 'up': 80,
 'only': 81,
 "i'm": 82

# Prepare Training and Test Data

Get the word index for each of the word in the review

In [486]:
X_train[0:1]

4227    , 5 November 2007 (UTC)\r\n\r\nI don't think I...
Name: comment_text, dtype: object

In [487]:
X_train = t.texts_to_sequences(X_train.tolist())

In [488]:
X_train[0:1]

[[418,
  1096,
  520,
  165,
  2,
  2,
  4,
  60,
  68,
  82,
  1,
  51,
  135,
  10,
  128,
  6,
  7,
  3824,
  39,
  16,
  1472,
  1,
  13,
  1,
  2,
  2,
  155,
  22,
  5,
  119,
  3,
  152,
  15,
  1,
  681,
  6,
  28,
  4,
  42,
  311,
  35,
  3,
  5,
  39,
  12,
  2,
  2,
  27,
  22,
  5,
  119,
  3,
  33,
  38,
  121,
  64,
  128,
  7,
  1097,
  2549,
  15,
  59,
  3,
  2015,
  1,
  942,
  843,
  1408,
  3,
  45,
  9,
  455,
  39,
  75,
  4,
  23,
  178,
  801,
  2,
  4443,
  178,
  40,
  87,
  4,
  3825,
  10,
  7,
  159,
  5389,
  3,
  7,
  943,
  1473,
  15,
  7,
  2550,
  1409,
  11,
  3043,
  23,
  1257,
  8,
  10,
  67,
  682,
  863,
  40,
  64,
  70,
  11,
  87,
  4443,
  581,
  153,
  65,
  8,
  190,
  11,
  23,
  75,
  52,
  23,
  864,
  40,
  2,
  3043,
  178,
  40,
  87,
  4,
  99,
  16,
  1709,
  15,
  30,
  3379,
  3380,
  64,
  70,
  11,
  87,
  52,
  581,
  153,
  65,
  8,
  190,
  11,
  23,
  75,
  52,
  23,
  864,
  40,
  2,
  64,
  66,
  70,
  12,
  87,
  3043,

In [489]:
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each review?

# Pad Sequences - Important

In [490]:
#Define maximum number of words to consider in each review
max_review_length = 300

In [491]:
#Pad training and test reviews
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train,
                                                        maxlen=max_review_length,
                                                        padding='pre')
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, 
                                                       maxlen=max_review_length, 
                                                       padding='pre')

In [492]:
X_train.shape

(4000, 300)

In [493]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,23,75,52,23,864,40,2,3043,178,40,87,4,99,16,1709,15,30,3379,3380,64,70,11,87,52,581,153,65,8,190,11,23,75,52,23,864,40,2,64,66,70,...,3,35,3,40,27,52,641,12,51,1302,404,93,3043,86,4443,86,106,7,200,1474,3,181,4,190,27,83,52,178,40,3,1410,40,36,1800,3,67,163,200,684,713
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4444,85,2,2,94,28,1475,5,4445
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1348,74,284,15,714,730,1618,2,2,6884
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,685,29,144,7,453,59,4,269,113,1098,3,1,153,219,1,3045,2180,2016,57,309,17,136,25,2766,3,1,3045,1260,277,27,4,269,106,12,804,45,61,4,315,243
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,109,681,15,1,405


In [494]:
X_test.shape

(1000, 300)

# Build the Graph

In [495]:
#Initialize model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

Add Embedding layer
 - Embedding Layer Input = Batch_Size * Length of each review

In [496]:
model.add(tf.keras.layers.Embedding(top_words + 1, #Vocablury size
                                    50, #Embedding size
                                    input_length=max_review_length) #Number of words in each review
          )

In [497]:
model.output

<KerasTensor: shape=(None, 300, 50) dtype=float32 (created by layer 'embedding')>

Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add LSTM Layer with 256 as RNN state size

In [498]:
model.add(tf.keras.layers.Dropout(0.4))

In [499]:
model.add(tf.keras.layers.LSTM(64,dropout=0.4)) #RNN State - size of cell state and hidden state

In [500]:
model.output

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'lstm')>

Use Dense layer for output layer

In [501]:
model.add(tf.keras.layers.Dropout(0.4))

In [502]:
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [503]:
#Compile the model
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy',tf.keras.metrics.Recall()])

In [504]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           500050    
_________________________________________________________________
dropout (Dropout)            (None, 300, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 529,555
Trainable params: 529,555
Non-trainable params: 0
_________________________________________________________________


# Execute the graph

In [505]:
model.fit(X_train,y_train,
          epochs=10,
          batch_size=64,          
          validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f363b39f110>

In [506]:
import pandas as pd, numpy as np
import re

In [507]:
train0 = pd.read_csv('train.csv')

In [508]:
train0.head()

Unnamed: 0,id,comment_text,toxic
0,e617e2489abe9bca,"""\r\n\r\n A barnstar for you! \r\n\r\n The De...",0
1,9250cf637294e09d,"""\r\n\r\nThis seems unbalanced. whatever I ha...",0
2,ce1aa4592d5240ca,"Marya Dzmitruk was born in Minsk, Belarus in M...",0
3,48105766ff7f075b,"""\r\n\r\nTalkback\r\n\r\n Dear Celestia... """,0
4,0543d4f82e5470b6,New Categories \r\n\r\nI honestly think that w...,0


In [509]:
train0.describe()

Unnamed: 0,toxic
count,5000.0
mean,0.0874
std,0.282449
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [510]:
#About 22% records are toxic
train0.describe().sum(axis=1)

count    5000.000000
mean        0.087400
std         0.282449
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
dtype: float64

#### Converting to list for easy manipulation

In [511]:
comments = train0.comment_text.values

In [512]:
len(comments)

5000

### Text clean up 
- Using Regular expressions, remove IP addresses  
- Using Regular expressions, remove URLs  
- Normalize the case  
- Remove stop words  
- Remove punctuations

Removing ip address

In [513]:
re.sub('[\d+\.{3}]\d+',"","My ip is 127.0.0.9, friend")

'My ip is , friend'

In [514]:
comments_noip = [re.sub('[\d+\.{3}]\d+',"",txt) for txt in comments]

Normalizing case

In [515]:
comments_lower = [txt.lower() for txt in comments_noip]

In [516]:
comments_lower[2:4]

['marya dzmitruk was born in minsk, belarus in march , . her mother, olga nikolaevna moroz was born in baranovichi, belarus and her father was born in brest, belarus. she is second child in the family. her parents divorced in  and soon after her father remarried and had two more children. \r\nmarya, at the age of 4 began doing gymnastics, but quit two years later because she was denied a medal in a competition, where her age was incorrectly marked. when she turned 6 years old, she got admitted to music school #4 in minsk, class of violin, and to public school # with piano classes as a main course. at the age of , marya starred in belarusfilm movie called “dunechka”. soon after she started to play in theatre and was featured in television shows. by  her mother decided to move to united states. in september of  marya went to her first american school, ingrid b. lacy middle school. she graduated in spring  and traveled back to belarus for 2 months. in august  she went to oceana high schoo

Remove URLs

In [517]:
comments_nourl = [re.sub("\w+://\S+","", txt) for txt in comments_lower]

In [518]:
comments_nourl = [txt.replace("\'","") for txt in comments_nourl]

Remove extra line breaks

#### Tokenize

In [519]:
from nltk.tokenize import word_tokenize

In [520]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [521]:
print(word_tokenize(comments_nourl[0]))

['``', 'a', 'barnstar', 'for', 'you', '!', 'the', 'defender', 'of', 'the', 'wiki', 'barnstar', 'i', 'like', 'your', 'edit', 'on', 'the', 'kayastha', 'page', '.', 'lets', 'form', 'a', 'solidarity', 'group', 'against', 'those', 'who', 'malign', 'the', 'article', 'and', 'its', 'subject', 'matter', '.', 'i', 'propose', 'the', 'folloing', 'name', 'for', 'the', 'group', '.', 'united', 'intellectuals', 'front', 'of', 'kayastha', 'ethinicty', 'against', 'racist', 'or', 'castist', 'abuse', '(', 'uifkearca', ')', '``']


In [522]:
comment_tokens = [word_tokenize(sent) for sent in comments_nourl]
print(comment_tokens[0])

['``', 'a', 'barnstar', 'for', 'you', '!', 'the', 'defender', 'of', 'the', 'wiki', 'barnstar', 'i', 'like', 'your', 'edit', 'on', 'the', 'kayastha', 'page', '.', 'lets', 'form', 'a', 'solidarity', 'group', 'against', 'those', 'who', 'malign', 'the', 'article', 'and', 'its', 'subject', 'matter', '.', 'i', 'propose', 'the', 'folloing', 'name', 'for', 'the', 'group', '.', 'united', 'intellectuals', 'front', 'of', 'kayastha', 'ethinicty', 'against', 'racist', 'or', 'castist', 'abuse', '(', 'uifkearca', ')', '``']


### Remove stop words and punctuations

In [523]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [524]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [525]:
stop_final = stop_nltk + stop_punct + ["...", "``","''", "====", "must"]

In [526]:
def del_stop(sent):
    return [term for term in sent if term not in stop_final]

In [527]:
del_stop(comment_tokens[1])

['seems',
 'unbalanced',
 'whatever',
 'said',
 'mathsci',
 'said',
 'far',
 'extreme',
 'unpleasant',
 'things',
 'mention',
 'others',
 'much',
 'greater',
 'frequency',
 'im',
 'happy',
 'reign',
 'thats',
 'youd',
 'like',
 'ruth',
 'told',
 'trying',
 'get',
 'mathsci',
 'pay',
 'attention',
 'stop',
 'uncivil',
 'would',
 'expect',
 'issue',
 'request',
 'mathsci',
 'intentionally',
 'unbalanced',
 'whatever',
 'reason',
 'please',
 'let',
 'know',
 'voluntarily',
 'close',
 'account',
 'move',
 'things',
 'like',
 'wikipedia',
 'lot',
 'contribute',
 'way',
 'point',
 'contributing',
 'project',
 'editors',
 'administrative',
 'leave',
 'aggressively',
 'rude',
 'im',
 'good',
 'editor',
 'dont',
 'really',
 'deserve',
 'people',
 'riding',
 'ass',
 'every',
 'time',
 'try',
 'certain',
 'things',
 'ill',
 'happily',
 'leave',
 'hands',
 'drama-prone',
 'thats',
 'think',
 'best',
 'ludwigs2']

In [528]:
comments_clean = [del_stop(sent) for sent in comment_tokens]

### Checking out the top terms in the data

In [529]:
from collections import Counter

In [530]:
term_list = []
for sent in comments_clean:
    term_list.extend(sent)

In [531]:
res = Counter(term_list)
res.most_common(20)

[('article', 1655),
 ('page', 1495),
 ('wikipedia', 1338),
 ('talk', 1171),
 ('please', 1038),
 ('ass', 986),
 ('would', 964),
 ('fuck', 907),
 ('one', 858),
 ('like', 836),
 ('dont', 780),
 ('also', 657),
 ('think', 630),
 ('see', 630),
 ('know', 595),
 ('im', 562),
 ('edit', 560),
 ('use', 549),
 ('articles', 549),
 ('people', 538)]

Contextual stop words - "article", "page", "wikipedia", "talk", "articles", "pages"

In [532]:
stop_context = ["article", "page", "wikipedia", "talk", "articles", "pages"]

In [533]:
stop_final = stop_final + stop_context

In [534]:
comments_clean = [del_stop(sent) for sent in comment_tokens]

In [535]:
comments_clean = [" ".join(sent) for sent in comments_clean]
comments_clean[:2]

['barnstar defender wiki barnstar like edit kayastha lets form solidarity group malign subject matter propose folloing name group united intellectuals front kayastha ethinicty racist castist abuse uifkearca',
 'seems unbalanced whatever said mathsci said far extreme unpleasant things mention others much greater frequency im happy reign thats youd like ruth told trying get mathsci pay attention stop uncivil would expect issue request mathsci intentionally unbalanced whatever reason please let know voluntarily close account move things like lot contribute way point contributing project editors administrative leave aggressively rude im good editor dont really deserve people riding ass every time try certain things ill happily leave hands drama-prone thats think best ludwigs2']

##### We'll apply this function later on the test set

### Separate X and Y and perform train test split, 70-30

In [536]:
len(comments_clean)

5000

In [537]:
X = comments_clean
y = train0.toxic

Train test split

In [538]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

# Build the Tokenizer

In [539]:
import tensorflow as tf

In [540]:
top_words = 10000 #Vocablury size
t = tf.keras.preprocessing.text.Tokenizer(num_words=top_words) # num_words -> Vocablury size

In [541]:
#Fit tokenizer with actual training data
t.fit_on_texts(X_train)

In [542]:
#Vocabulary
t.word_index

{'ass': 1,
 'fuck': 2,
 'please': 3,
 'would': 4,
 'one': 5,
 'like': 6,
 'dont': 7,
 'also': 8,
 'name': 9,
 'know': 10,
 'think': 11,
 'see': 12,
 'edit': 13,
 'im': 14,
 'use': 15,
 'may': 16,
 'suck': 17,
 'mexicans': 18,
 'people': 19,
 'time': 20,
 'thanks': 21,
 'user': 22,
 'well': 23,
 'even': 24,
 'deletion': 25,
 'information': 26,
 'good': 27,
 'make': 28,
 'editing': 29,
 'image': 30,
 'get': 31,
 'edits': 32,
 'help': 33,
 'could': 34,
 'wp': 35,
 'want': 36,
 'ytmnd': 37,
 'first': 38,
 'thank': 39,
 'gay': 40,
 'way': 41,
 'new': 42,
 'sources': 43,
 'really': 44,
 'say': 45,
 'need': 46,
 'used': 47,
 'many': 48,
 'section': 49,
 'deleted': 50,
 'work': 51,
 'go': 52,
 'source': 53,
 'find': 54,
 'made': 55,
 'right': 56,
 'discussion': 57,
 'take': 58,
 'ive': 59,
 'read': 60,
 'since': 61,
 'point': 62,
 'youre': 63,
 'look': 64,
 'fucking': 65,
 'still': 66,
 'someone': 67,
 'fact': 68,
 'link': 69,
 'add': 70,
 'list': 71,
 'two': 72,
 'utc': 73,
 'editors': 74,
 '

# Prepare Training and Test Data

Get the word index for each of the word in the review

In [543]:
X_train[0:1]

['p.s.you really something life looked around userpage anda monastic deacon catholic church catholic church wow things work good luck maybe could even one day rise rank archdeacon keyn eynhore sorry forgot speak english jolz gutentag goyishe kup']

In [544]:
X_train = t.texts_to_sequences(X_train)

In [545]:
X_train[0:1]

[[450,
  332,
  890,
  44,
  81,
  311,
  967,
  219,
  1350,
  8569,
  8570,
  8571,
  1351,
  1003,
  1351,
  1003,
  1640,
  122,
  51,
  27,
  826,
  276,
  34,
  24,
  5,
  220,
  3802,
  4542,
  8572,
  8573,
  8574,
  130,
  1352,
  795,
  177,
  8575,
  8576,
  8577,
  8578]]

In [546]:
X_test = t.texts_to_sequences(X_test)

How many words in each review?

# Pad Sequences - Important

In [547]:
#Define maximum number of words to consider in each review
max_review_length = 100

In [548]:
#Pad training and test reviews
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train,
                                                        maxlen=max_review_length,
                                                        padding='pre')
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, 
                                                       maxlen=max_review_length, 
                                                       padding='pre')

In [549]:
X_train.shape

(3500, 100)

In [550]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,450,332,890,44,81,311,967,219,1350,8569,8570,8571,1351,1003,1351,1003,1640,122,51,27,826,276,34,24,5,220,3802,4542,8572,8573,8574,130,1352,795,177,8575,8576,8577,8578
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,144,144,384,138,1045
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,80,56,355,827,15,20,96,402,1527,458,131,3256,390,62,234,704,72,1046,56,131,44,88,250,1198,284,86,68,8579
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,266,3803,1130,377,38,1901,46,52,12,182,3,183,21,1095,208,5797,5799,2102,920,8580,796,1528,208,2299,4544,2299,920,204,2299,4544,2299,920,165,117,920,4545,274,4544,3,1004
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,85,209,568,80,29,3,968,3,968,5,3804,4546


In [551]:
X_test.shape

(1500, 100)

# Build the Graph

In [631]:
#Initialize model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

Add Embedding layer
 - Embedding Layer Input = Batch_Size * Length of each review

In [632]:
model.add(tf.keras.layers.Embedding(top_words + 1, #Vocablury size
                                    50, #Embedding size
                                    input_length=max_review_length) #Number of words in each review
          )

In [633]:
model.output

<KerasTensor: shape=(None, 100, 50) dtype=float32 (created by layer 'embedding')>

Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add LSTM Layer with 256 as RNN state size

In [634]:
model.add(tf.keras.layers.Dropout(0.4))
#model.add(tf.keras.layers.BatchNormalization())

In [635]:
model.add(tf.keras.layers.LSTM(256,return_sequences=True,  activation="tanh", dropout=0.4))
model.add(tf.keras.layers.GRU(256,return_sequences=True,  activation="tanh", dropout=0.4))
model.add(tf.keras.layers.GRU(256,return_sequences=True, activation="tanh", dropout=0.4))
model.add(tf.keras.layers.GRU(256,return_sequences=True, activation="tanh", dropout=0.4))
model.add(tf.keras.layers.GRU(256, activation="tanh", dropout=0.4))

In [636]:
model.output

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'gru_3')>

Use Dense layer for output layer

In [637]:
model.add(tf.keras.layers.Dropout(0.4))
#model.add(tf.keras.layers.BatchNormalization())

In [638]:
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [639]:
#Compile the model
adam_op = tf.keras.optimizers.Adam(learning_rate=0.002)
model.compile(optimizer=adam_op,loss='binary_crossentropy',metrics=['accuracy',tf.keras.metrics.Recall()])

In [640]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           500050    
_________________________________________________________________
dropout (Dropout)            (None, 100, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100, 256)          314368    
_________________________________________________________________
gru (GRU)                    (None, 100, 256)          394752    
_________________________________________________________________
gru_1 (GRU)                  (None, 100, 256)          394752    
_________________________________________________________________
gru_2 (GRU)                  (None, 100, 256)          394752    
_________________________________________________________________
gru_3 (GRU)                  (None, 256)               3

# Execute the graph

In [641]:
model.fit(X_train,y_train,
          epochs=20,
          batch_size=64,          
          validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f360106e2d0>