## 0. Install Dependencies and Bring in Data

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the training comments from the local Jigsaw challenge folder.
train_csv_path = os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv')
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Preview the last rows of the training data.
df.tail()

In [None]:
# Show the raw comment text of the row at position 7.
df.iloc[7]['comment_text']

In [None]:
# Show the label columns (every column from index 2 onward) of the row at position 5.
df[df.columns[2:]].iloc[5]

In [None]:
# Show the first rows whose 'toxic' column equals 1.
df[df['toxic'] == 1].head()

## 1. Preprocess

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Inputs are the raw comment strings; targets are all columns from index 2
# onward, taken as a 2-D array.
X = df['comment_text']
y = df.iloc[:, 2:].to_numpy()

In [None]:
# Upper bound on the vocabulary size handed to the text vectorizer.
MAX_FEATURES = 200_000

In [None]:
# Text-to-integer layer: each comment becomes a sequence of 1800 token ids
# drawn from a vocabulary capped at MAX_FEATURES entries.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Build the vectorizer's vocabulary from the training comments.
vectorizer.adapt(X.values)

In [None]:
# Sanity check: the first five token ids produced for a sample sentence.
vectorizer("Hello world, How are you?")[:5]

In [None]:
# Convert every comment to its integer token sequence in a single call.
vectorized_text = vectorizer(X.values)

In [None]:
# Inspect the vectorized comments.
vectorized_text

In [None]:
# MCSHBAP - map, cache, shuffle, batch, prefetch from tensor_slices and list_files
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()  # Cache the elements so later passes do not recompute them.
# Shuffle once (buffer of 160000 elements) and keep that order on every pass.
# The train/val/test partitions are cut from this dataset with take()/skip();
# with the default reshuffle_each_iteration=True every pass would draw a new
# order, so samples would move between partitions and leak from the
# evaluation sets into training.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)  # Group the samples into batches of 16.
dataset = dataset.prefetch(8)  # Prepare up to 8 batches ahead to avoid input bottlenecks.

In [None]:
# Pull one (features, labels) batch out of the pipeline for inspection.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [None]:
# Inspect the feature half of the sampled batch.
batch_X

In [None]:
# Shape of the sampled feature batch.
batch_X.shape

In [None]:
# Inspect the label half of the sampled batch.
batch_y

In [None]:
# Shape of the sampled label batch.
batch_y.shape

In [None]:
# Number of batches that make up 70% of the dataset, rounded down.
int(len(dataset)*.7)

In [None]:
# Partition the batched dataset 70% / 20% / 10% into train / validation / test.
# All sizes are counted in batches and rounded down.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)
n_test = int(n_batches * .1)
n_before_test = int(n_batches * .9)

train = dataset.take(n_train)                    # Leading 70% of the batches.
val = dataset.skip(n_train).take(n_val)          # Skip the training part, then take 20%.
test = dataset.skip(n_before_test).take(n_test)  # Skip the first 90%, then take 10%.

In [None]:
# Number of batches in each partition.
len(train), len(val), len(test)

In [None]:
# Iterator that yields the training batches as NumPy arrays.
train_generator = train.as_numpy_iterator()

In [None]:
train_generator.next() # Each call returns the next batch, so re-running this cell shows a different batch every time.

## 2. Create Sequential Model

In [None]:
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.models import Sequential

In [None]:
# Classifier: token ids -> 32-d embeddings -> bidirectional LSTM ->
# three dense feature layers -> six sigmoid outputs.
model = Sequential([
    # Embedding table sized for the vocabulary plus one extra index.
    Embedding(MAX_FEATURES + 1, 32),
    # Bidirectional LSTM with 32 units per direction and tanh activation.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extractor layers.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: six independent sigmoid scores.
    Dense(6, activation='sigmoid'),
])

In [None]:
# Train with the Adam optimizer against a binary cross-entropy loss.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Fit for 15 epochs on the training partition, validating on `val` after each epoch.
history = model.fit(train, validation_data=val, epochs=15)

## 3. Make Predictions

In [None]:
# Vectorize a hand-written sample comment for a quick prediction check.
input_text = vectorizer('You freaking suck! I am going to hurt you')

In [None]:
# The model expects a batch, so add a leading axis to form a batch of one.
res = model.predict(np.expand_dims(input_text, axis=0))

In [None]:
# The same input with a leading axis of size 1 added, i.e. a batch holding one sequence.
np.expand_dims(input_text, 0)

In [None]:
# Grab one (features, labels) batch from the test partition.
batch = next(test.as_numpy_iterator())

In [None]:
# Unpack one test batch into its features and labels.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Threshold the predicted scores at 0.5 and show them as 0/1 integers.
(model.predict(batch_X) > 0.5).astype(int)

## 4. Evaluate Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics that are accumulated batch by batch over the test set.
pre = Precision()
re = Recall()
# The model emits independent per-label sigmoid scores and the evaluation loop
# feeds flattened 0/1 label vectors, so accuracy must be computed element-wise
# against a 0.5 threshold. CategoricalAccuracy compares argmax positions, which
# is meaningless for flattened multi-label vectors.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Feed every test batch through the model and accumulate the metrics.
for batch in test.as_numpy_iterator():
    X_true, y_true = batch

    # Predict, then collapse both labels and predictions to 1-D vectors so
    # each (sample, label) pair is scored as one binary decision.
    y_true = y_true.flatten()
    yhat = model.predict(X_true).flatten()

    for metric in (pre, re, acc):
        metric.update_state(y_true, yhat)

In [None]:
# Report the metric values aggregated over the whole test partition.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value},Recall: {recall_value},Accuracy: {accuracy_value}')

## 5. Test and Gradio

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
# Persist the trained model to the file 'htccolab.h5'.
model.save('htccolab.h5')

In [None]:
# Reload the model from the file written above, replacing the in-memory one.
model = tf.keras.models.load_model('htccolab.h5')

In [None]:
# Vectorize a sample comment for a quick prediction check.
input_str = vectorizer('Hey I freaken hate you!')

In [None]:
# Predict on a batch of one by adding a leading axis to the vectorized comment.
res = model.predict(np.expand_dims(input_str, axis=0))

In [None]:
# Boolean flags marking which predicted scores exceed the 0.5 threshold.
res > 0.5

In [None]:
def score_comment(comment):
    """Score a single comment and return a text report, one line per label.

    The comment is vectorized as a batch of one and passed through the model.
    Each predicted score is compared against 0.5, and the outcome is written
    as ``<label column>: <True/False>`` followed by a newline. Label names are
    the DataFrame columns from index 2 onward.
    """
    scores = model.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]
    return ''.join(
        '{}: {}\n'.format(name, scores[position] > 0.5)
        for position, name in enumerate(label_names)
    )

In [None]:
# Web UI: a two-line text box in, the per-label text report from score_comment out.
# gr.Textbox is the top-level component class; the older gr.inputs.* namespace
# is deprecated in Gradio 3 and removed in Gradio 4.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text',
)

In [None]:
# Start the Gradio app; share=True asks Gradio for a publicly reachable link.
interface.launch(share = True)