#0. Install Dependencies and Bring in Data

In [1]:
!pip install tensorflow pandas matplotlib scikit-learn



In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [4]:
df = pd.read_csv(os.path.join('/content/drive/MyDrive/ColabNotebooks/commenttoxic/train.csv'))

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#1. Preprocess

In [6]:
from tensorflow.keras.layers import TextVectorization

In [None]:
TextVectorization?? #A preprocessing layer which maps text features to integer sequences.

In [11]:
X = df['comment_text']

In [12]:
y = df[df.columns[2:]].values

In [13]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [14]:
MAX_FEATURES = 200000 # number of words in the vocab (dictionary) you can change this number(if model is too big could run out of RAM)

In [15]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800, #max length of sentence
                               output_mode='int') # map each word to integer

In [16]:
vectorizer.adapt(X.values) #teach vectorizer our vocablury, adapt learns every word in X (our vocablury)

In [17]:
vectorizer.get_vocabulary # shows mapping of vacabulary

In [18]:
vectorizer('Hello there, how are you?')[:5] #example

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([288,  41,  73,  20,   7])>

In [19]:
vectorized_text = vectorizer(X.values)

In [None]:
vectorized_text  #shape=(input size,output_sequence_length )

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [20]:
#now create a tensorflow data pipeline - useful when you got data that cant feed into memory
#MCSHBAP - map, chache, shuffle, batch, prefetch |  from_tensor_slices, list_file | this is basic data pipeline generation set
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y)) #this line create dataset, other 4 lines data preprocessing
dataset = dataset.cache() # Caching allows for faster data loading by storing elements in memory after the first epoch.
dataset = dataset.shuffle(160000) #shuffles the elements of the dataset. buffer size for shuffle -160000,   number of elements from the dataset that the shuffle operation uses to randomly sample and produce a shuffled result.
dataset = dataset.batch(16) # each batch has 16
dataset = dataset.prefetch(8) # helps bottlenecks,  enables the dataset to asynchronously fetch batches while the model is training on the current batch, reducing the time spent waiting for data.

In [21]:
batchX, batchy = dataset.as_numpy_iterator().next()  #this shows us one batch, next moves to the next batch
#as_numpy_iterator() is a method in TensorFlow that converts a dataset into an iterator that produces NumPy arrays. It's often used when you want to iterate over the elements of a TensorFlow dataset and obtain the data in NumPy format.

In [None]:
len(batchX)

16

In [None]:
batchX.shape

(16, 1800)

In [31]:
train = dataset.take(int(len(dataset)*.7))  #70%
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))  #20%
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))  #10%

In [None]:
len(dataset)  #9974 batches

9974

In [None]:
len(train)

6981

In [None]:
len(test)

997

In [None]:
len(val)

1994

In [None]:
train.as_numpy_iterator().next()

(array([[ 3195,   111,    14, ...,     0,     0,     0],
        [    2, 19683, 33825, ...,     0,     0,     0],
        [ 6963, 58621,     2, ...,     0,     0,     0],
        ...,
        [  232, 11640,   581, ...,     0,     0,     0],
        [54386,    19, 29607, ...,     0,     0,     0],
        [  248,   111,     4, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

#2. Create Sequential Model

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

Using an embedding vector in natural language processing (NLP) or other similar tasks offers several advantages:

1. **Dimensionality Reduction:** Embedding vectors reduce the dimensionality of the input space, making it more computationally efficient to process and analyze large datasets.

2. **Semantic Representation:** Embedding vectors capture semantic relationships between words or entities. Words with similar meanings are often represented by similar vectors in the embedding space, allowing models to learn semantic similarities.

3. **Generalization:** Embedding vectors generalize well to unseen words or entities. By learning representations from large corpora of text, embeddings can capture the underlying structure of the language and apply it to new data.

4. **Feature Learning:** Embedding vectors automatically learn meaningful features from the input data. This eliminates the need for manual feature engineering and allows models to focus on higher-level tasks.

5. **Improved Model Performance:** Models trained with embedding vectors often achieve better performance compared to models using raw input data. Embeddings provide rich representations that capture nuanced relationships, leading to more accurate predictions.

6. **Efficient Training:** Embedding layers are trainable parameters in neural networks, allowing models to update embeddings during training to better fit the data. This enables the model to adapt to the specific task at hand and improve over time.

Overall, embedding vectors serve as powerful tools for representing textual data in a way that is meaningful, efficient, and conducive to effective machine learning.

In [23]:
model = Sequential()
# Create the embedding layer
model.add(Embedding(MAX_FEATURES+1, 32)) #32-size of the embedding vector for each word.
# Bidirectional LSTM Layer - bidirectional pass info both ways, previous words also important
model.add(Bidirectional(LSTM(32, activation='tanh'))) #lastm layer has 32 different lstm units #tanh because GPU acceleration required for lstm layer needs to be tanh
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer
model.add(Dense(6, activation='sigmoid')) #sigmoid - output between 0 and 1

In [None]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam') #becaue 6 different binary classifiers at one time - not one '1' o/p, all six can be '1's

In [None]:
model.summary()  #64 because bidirectional

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
history = model.fit(train, epochs=1, validation_data=val)



In [None]:
model.save('/content/drive/MyDrive/Colab Notebooks/commenttoxic/toxic_model.h5')

  saving_api.save_model(


In [25]:
from tensorflow.keras.models import load_model

loaded_model = load_model('/content/drive/MyDrive/ColabNotebooks/commenttoxic/toxic_model.h5')

In [None]:
history.history

{'loss': [0.0613848902285099], 'val_loss': [0.04592128470540047]}

#3. Make Predictions


In [26]:
input_text = vectorizer('You freaking suck! I am going to hit you.')  #before feed into the model need to encode it

In [27]:
input_text #check the shape

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([   7, 7158,  397, ...,    0,    0,    0])>

In [None]:
res = model.predict(input_text) #input is not as a batch, input shape is wrong

In [28]:
res = loaded_model.predict(np.array([input_text])) #converting text into numpy array and predict



In [None]:
res

array([[0.98900473, 0.15501066, 0.8992594 , 0.02974626, 0.80624086,
        0.15117283]], dtype=float32)

In [None]:
np.array([input_text]).shape

(1, 1800)

In [None]:
np.expand_dims(input_text,0).shape

(1, 1800)

In [None]:
np.expand_dims(input_text,0)

array([[   7, 7158,  397, ...,    0,    0,    0]])

In [29]:
loaded_model.predict(np.expand_dims(input_text,0))



array([[0.98900473, 0.15501066, 0.8992594 , 0.02974626, 0.80624086,
        0.15117283]], dtype=float32)

In [32]:
batchX , batchy = test.as_numpy_iterator().next()

In [None]:
loaded_model.predict(batchX)

In [34]:
(loaded_model.predict(batchX) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [35]:
batchy

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

#4.Evaluate Model

In [36]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy


In [37]:
pre = Precision() # Precision = True Positives / (True Positives + False Positives)
re = Recall() #Recall = True Positives / (True Positives + False Negatives)
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): #iterate through every single batch in our pipeline
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = loaded_model.predict(X_true)

    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat) #calculate for the current batch and update the overall kpi
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [39]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8231622576713562, Recall:0.6748152375221252, Accuracy:0.48946839570999146
