In [None]:
!pip install tensorflow pandas matplotlib scikit-learn

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

# Data Preprocessing

In [2]:
df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge','train.csv','train.csv'))

In [3]:
from tensorflow.keras.layers import TextVectorization

In [4]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [5]:
MAX_FEATURES = 200000 # number of words in the vocab

In [None]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [None]:
vectorizer.adapt(X.values)

In [None]:
vectorizer.get_vocabulary()

In [9]:
vectorized_text = vectorizer(X.values)

In [10]:
#Data preprocess pipeline tensorflow
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y)) #initialize data set with input output labels in tensor slices
dataset = dataset.cache()
dataset = dataset.shuffle(160000) #how many examples to take from dataset, shuffle, and return as the dataset
dataset = dataset.batch(16) #creates sample batches of 16 examples
dataset = dataset.prefetch(8) #helps bottlenecks

In [11]:
batch_X, batch_y = dataset.as_numpy_iterator().next() #iterates through batches

In [12]:
len(dataset) #number of batches

9974

In [13]:
#creates train validation test split
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# Model Creation

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
model = Sequential()
#Embedding Layer
model.add(Embedding(MAX_FEATURES+1,32))
#Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32,activation='tanh')))
#Feature Extractor Fully Connected Layers
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
#Final Output Layer
model.add(Dense(6,activation='sigmoid'))

In [None]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [None]:
model.summary()

In [None]:
training = model.fit(train,epochs=10,validation_data=val)

In [None]:
from matplotlib import pyplot as plt

In [None]:
#loss across epochs
plt.figure(figsize=(8,5))
pd.DataFrame(training.training).plot()
plt.show()

In [None]:
model.save('comment_toxicity.h5')

# Model Evaluation

In [14]:
model = tf.keras.models.load_model('comment_toxicity.h5')

In [15]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [16]:
pre = Precision()
rec = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
  #unpack the batch
  X, y_true = batch
  #make predictions on batches
  yhat = model.predict(X)

  #flatten predictions to 1d array with all values
  y_true = y_true.flatten()
  yhat = yhat.flatten()

  #update the metrics
  pre.update_state(y_true, yhat)
  rec.update_state(y_true, yhat)
  acc.update_state(y_true, yhat)

In [18]:
print(f'Precision: {pre.result().numpy()}, Recall: {rec.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.9336901307106018, Recall: 0.8940774202346802, Accuracy: 0.5305917859077454


# Example Usage

In [19]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    print(text)

In [22]:
score_comment("you suck") #enter comment to test here

toxic: True
severe_toxic: False
obscene: True
threat: False
insult: True
identity_hate: False

