# Install Dependencies and load data

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [18]:
df = pd.read_csv("/content/drive/MyDrive/Comment-Toxicity/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv")

In [19]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Data Exploration

In [20]:
 df.shape

(159571, 8)

In [21]:
len(df)

159571

In [22]:
df.dtypes

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

> How comments look like

In [30]:
for i in range(3):
   print(df.iloc[i]["comment_text"],print("\n"))



Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 None


D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC) None


Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info. None


In [35]:
for i in range(3):
  print(df[df.columns[2:]].iloc[1])


toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64


# Preprocessing

In [36]:
from tensorflow.keras.layers import TextVectorization

In [37]:
TextVectorization

In [44]:
x=df["comment_text"]
y=df[df.columns[2:]].values

In [45]:
x

0         Explanation\r\nWhy the edits made under my use...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\r\nMore\r\nI can't make any real suggestions...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \r\n\r\nThat...
159568    Spitzer \r\n\r\nUmm, theres no actual article ...
159569    And it looks like it was actually you who put ...
159570    "\r\nAnd ... I really don't think you understa...
Name: comment_text, Length: 159571, dtype: object

In [46]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [47]:
MAX_FEATURES=200000

### Converting our comments into vectors
> We are going to associate each word with a vector
* We are going to specify the maximum numbers of features
* Output_sequence_length tells the maximum length of sentence

In [48]:
vectorizer=TextVectorization(max_tokens=MAX_FEATURES, standardize='lower_and_strip_punctuation',output_sequence_length=1800, split='whitespace', ngrams=None, output_mode='int')

This adapt will help to learn the pattern of data
* We are going to use x.values because x.values change it to numpy array Simple x is only pd.series

In [49]:
vectorizer.adapt(x.values)

In [51]:
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is']

In [52]:
vectorized_text=vectorizer(x.values)

In [53]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [58]:
## MCSHBAP map,cache,shuffle,batch,prefix
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset= dataset.batch(16)
dataset = dataset.prefetch(8)

In [61]:
batch_x , batch_y = dataset.as_numpy_iterator().next()

In [62]:
batch_x

array([[   49,    50,    66, ...,     0,     0,     0],
       [  167,     4, 13900, ...,     0,     0,     0],
       [    7,    19,   382, ...,     0,     0,     0],
       ...,
       [  425, 92976,  1817, ...,     0,     0,     0],
       [  419,  2105,   312, ...,     0,     0,     0],
       [  804,    14,    30, ...,     0,     0,     0]])

In [63]:
batch_y

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [86]:
train = dataset.take(int(len(dataset)*.1))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.1))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [87]:
len(train),len(val),len(test)

(997, 997, 997)

In [88]:
train_generator=train.as_numpy_iterator()

In [89]:
train_generator.next()

(array([[   147,     23,      7, ...,      0,      0,      0],
        [    21,    113,    204, ...,      0,      0,      0],
        [  3166,      7,     72, ...,      0,      0,      0],
        ...,
        [     1, 100977,    179, ...,      0,      0,      0],
        [    12,    184,     15, ...,      0,      0,      0],
        [   742,     26,     15, ...,      0,      0,      0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

## Create Sequential Model

In [90]:
from tensorflow.keras.models import Sequential

In [91]:
from tensorflow.keras.layers import LSTM,Dropout,Bidirectional,Dense,Embedding

In [92]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))


In [93]:
model.compile(loss='BinaryCrossentropy',optimizer='adam')

In [94]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [None]:
model.fit(train,epochs=1,validation_data=val)



<keras.callbacks.History at 0x7fa83db75150>