### Import

In [54]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

### Preprocessing Data

In [55]:
df = pd.read_csv('./data/train.csv')

In [56]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [57]:
from tensorflow.keras.layers import TextVectorization #=> word to number

In [58]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [59]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [60]:
y 

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [61]:
MAX_WORD_IN_VOCABULARY = 200000 # number of words in vocabulary

In [62]:
vectorizer = TextVectorization(max_tokens=MAX_WORD_IN_VOCABULARY, # max number to create vocabulary
                               output_sequence_length=1800,     # vector output length
                               output_mode='int')       # type of number

In [63]:
vectorizer.adapt(X.values) # Build vocabulary base on X values

In [64]:
vectorizer('hi Linh').shape

TensorShape([1800])

In [65]:
vectorized_text = vectorizer(X.values)

In [66]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [67]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()       # cache dataset to speed up training
dataset = dataset.shuffle(160000) # shuffle to avoid overfitting
dataset = dataset.batch(16)     # split dataset to smaller batch (16)
dataset = dataset.prefetch(8) # Load the next batch for training

In [68]:
dataset.as_numpy_iterator().next()

(array([[   451, 128317,   2647, ...,      0,      0,      0],
        [    21,    231,      3, ...,      0,      0,      0],
        [   197,     76,     74, ...,      0,      0,      0],
        ...,
        [    46,    115,     16, ...,      0,      0,      0],
        [   478,      3,  27698, ...,      0,      0,      0],
        [  4860,     14,   6702, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [69]:
batch_X, batch_y = dataset.as_numpy_iterator().next()
batch_X.shape, batch_y.shape

((16, 1800), (16, 6))

In [70]:
len(dataset)

9974

In [71]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [72]:
len(train), len(val), len(test)


(6981, 1994, 997)

### Create model

In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [74]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_WORD_IN_VOCABULARY+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [75]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam') # binary (0,1)

In [76]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [None]:
history = model.fit(train, epochs=1, validation_data=val)

In [81]:
from tensorflow.keras.models import load_model

my_model = load_model('model.h5')


### Prediction

In [116]:

input_text = vectorizer('Fuck you')

In [99]:
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([1109,   81,    8, ...,    0,    0,    0], dtype=int64)>

In [104]:
np.array([input_text])

array([[1109,   81,    8, ...,    0,    0,    0]], dtype=int64)

In [117]:
res = my_model.predict(np.array([input_text]))



In [118]:
res

array([[0.9978797 , 0.33469996, 0.96753347, 0.05115277, 0.87422645,
        0.2266451 ]], dtype=float32)

In [119]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [120]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [123]:
(my_model.predict(batch_X) > 0.5).astype(int)




array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [124]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

### Evaluation

In [125]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [126]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [127]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = my_model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [128]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8269026279449463, Recall:0.6944113969802856, Accuracy:0.4663991928100586


### Interface

In [141]:
import tensorflow as tf
import gradio as gr

In [142]:
input_str = vectorizer('hey i freaken hate you!')

In [143]:
res = my_model.predict(np.expand_dims(input_str,0))



In [144]:
res

array([[0.865442  , 0.03568728, 0.4453451 , 0.02464375, 0.44171998,
        0.08360171]], dtype=float32)

In [145]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = my_model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [146]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')



In [147]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://12d2cecd-6efd-47b9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




