In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

In [12]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/jigsaw-toxic-comment-classification-challenge/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/jigsaw-toxic-comment-classification-challenge/test.csv


In [13]:
train_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/jigsaw-toxic-comment-classification-challenge/train.csv")
test_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/jigsaw-toxic-comment-classification-challenge/test.csv")

In [14]:
train_df.drop(['id'],axis=1,inplace=True)

# Pre Processing

In [15]:
from tensorflow.keras.layers import TextVectorization

In [16]:
X = train_df['comment_text']
y = train_df[train_df.columns[1:]].values

In [17]:
MAX_FEATURES=200000 #number of words in vocab
type(X.values)

numpy.ndarray

In [18]:
# vectorizer=TextVectorization(max_tokens=MAX_FEATURES,
#                              output_sequence_length=1800,
#                              output_mode='int')
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [19]:
X.shape
y.shape

(159571, 6)

In [20]:
vectorizer.adapt(X.values)

In [21]:
vectorized_text=vectorizer(X.values)

In [22]:
#MCSHBAP - map,chache , shuffle ,batch , prefetch from tensor slices
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset=dataset.cache()
dataset=dataset.shuffle(1600000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8) #to prevent bottleneck

In [23]:
batch_x,batch_y=dataset.as_numpy_iterator().next()

In [24]:
batch_y.shape

(16, 6)

In [25]:
train=dataset.take(int(len(dataset)*0.7))
val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# MODEL

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout,Bidirectional,Dense,Embedding

In [27]:
model=Sequential()
#create the embedding layer
model.add(Embedding(MAX_FEATURES+1,32))
#implement bidirectional LSTM layer
model.add(Bidirectional(LSTM(32,activation='tanh')))
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(6,activation='sigmoid'))

In [28]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 6)                 1542      
                                                                 
Total params: 6,459,558
Trainable params: 6,459,558
Non-trainable params: 0
______________________________________________

In [None]:
hostory=model.fit(train,epochs=3,validation_data=val)

In [31]:
input_text=vectorizer('Am comming to hunt you.')
# model.save('toxicity.h5')
model=tf.keras.models.load_model('/kaggle/input/model/tox/toxicity.h5')

In [32]:
res=model.predict(np.expand_dims(input_text,0))



In [33]:
res

array([[0.7184137 , 0.01146405, 0.11608265, 0.04152801, 0.3167597 ,
        0.07679916]], dtype=float32)

In [34]:
train_df.columns[2:]

Index(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], dtype='object')

In [45]:
from matplotlib import pyplot as plt
plt.figure(figsize=(7,8))
pd.DataFrame(history.history).plot()
plt.show()
print('ok')

NameError: name 'history' is not defined

<Figure size 700x800 with 0 Axes>

# **Evaluate Model**

In [35]:
from tensorflow.keras.metrics import Precision, Recall,CategoricalAccuracy

In [36]:
pre=Precision()
re=Recall()
acc=CategoricalAccuracy()

In [37]:
for batch in test.as_numpy_iterator():
    X_true,y_true=batch
    yhat=model.predict(X_true)
    y_true=y_true.flatten()
    yhat=yhat.flatten()
    pre.update_state(y_true,yhat)
    re.update_state(y_true,yhat)
    acc.update_state(y_true,yhat)
    



In [39]:
print(f'precision:{pre.result().numpy()},recall:{re.result().numpy()},accuracy:{acc.result().numpy()}')

precision:0.8290030360221863,recall:0.7688428163528442,accuracy:0.5065195560455322


**Gradio**

In [40]:
!pip install gradio jinja2

[0m

In [41]:
import gradio as gr

In [64]:
def score_comment(comment):
    vectorized_comment=vectorizer(comment)
    res=model.predict(np.expand_dims(vectorized_comment,0))
    tes=''
    for idx,col in enumerate(train_df.columns[2:]):
        tes +='{}:{}\n'.format(col,res[0][idx]>0.5)   
    return tes    

In [65]:
interface=gr.Interface(fn=score_comment,inputs=gr.inputs.Textbox(lines=2,placeholder='comment to score'),outputs='text')

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",


In [66]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7866
Running on public URL: https://1465ec0aedd772ca78.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




