<a href="https://colab.research.google.com/github/Nancy123a/Multilingual_Toxicity/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, GRU, SimpleRNN, Dense, Activation, Dropout, Embedding, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D, BatchNormalization
from keras.utils import to_categorical
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import sequence,text
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [3]:
import csv
# Load the three CSV files from the local `sample_data/` folder
# (paths are relative to the notebook's working directory).
train = pd.read_csv('sample_data/jigsaw-toxic-comment-train.csv')  # comments with six toxicity label columns
validation = pd.read_csv('sample_data/validation.csv')  # comment_text, lang, toxic
test = pd.read_csv('sample_data/test.csv')  # content, lang (no label column)

In [None]:
# Preview the first five training rows to see which columns were loaded.
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# (rows, columns) of the training frame as loaded, before any subsetting.
train.shape

(223549, 8)

In [None]:
# Preview the test set: the text column is named `content` here and there is no label.
test.head(5)

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr


In [None]:
# Preview the validation set: it carries a `lang` code and a binary `toxic` label.
validation.head(5)

Unnamed: 0,id,comment_text,lang,toxic
0,0,Este usuario ni siquiera llega al rango de ...,es,0
1,1,Il testo di questa voce pare esser scopiazzato...,it,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,es,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,tr,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,tr,0


- Drop the other label columns, because we treat this as a binary classification problem (toxic vs. non-toxic).
- Use a smaller subset of the dataset (about 12,000 data points) to make it easier to train the models.

In [None]:
# Only the binary `toxic` label is used, so the five finer-grained label columns are removed.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell safe to re-run:
# on a second execution the columns are already gone and the call is a no-op, not a KeyError.
train = train.drop(columns=['severe_toxic','obscene','threat','insult','identity_hate'], errors='ignore')

In [None]:
# Confirm that only id, comment_text and toxic remain after dropping the other labels.
train.head(5)

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


In [None]:
# Keep only a small prefix of the data so the recurrent models below train in reasonable time.
# NOTE: `.loc` slices by label and includes the end label, so with the integer index shown
# above this keeps rows 0..12000, i.e. 12001 rows rather than 12000 (see the shape output).
train = train.loc[:12000,:]
train.shape

(12001, 3)

We will check the maximum number of words that can be present in a comment; this will help us choose the padding length later.

In [None]:
def _count_words(comment):
    """Number of whitespace-separated tokens in one comment (non-strings are stringified first)."""
    return len(str(comment).split())

# Longest comment, measured in whitespace-separated words.
train['comment_text'].map(_count_words).max()

1403

In [None]:
def roc_auc(predictions,target):
    """Return the area under the ROC curve for a set of binary predictions.

    Parameters:
        predictions: predicted scores for the positive class.
        target: ground-truth binary labels aligned with `predictions`.

    Returns:
        The area under the ROC curve built from `target` and `predictions`.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

- `xtrain` and `xvalid` hold the `comment_text` values.
- `ytrain` and `yvalid` hold the corresponding `toxic` labels.

In [None]:
# Hold out 20% of the subset for validation. Stratifying on the label keeps the
# toxic / non-toxic ratio the same in both splits; the fixed seed makes the split repeatable.
comments = train['comment_text'].values
labels = train['toxic'].values
xtrain, xvalid, ytrain, yvalid = train_test_split(
    comments,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=42,
)

## Simple RNN

Recurrent Neural Networks (RNNs) are a type of neural network in which the output from the previous step is fed as input to the current step. In traditional neural networks, all inputs and outputs are independent of each other, but in cases such as predicting the next word of a sentence, the previous words are required, and hence there is a need to remember them. RNNs were introduced to solve this issue with the help of a hidden layer whose state is carried from one step to the next.



In [None]:
# Fixed length every comment is padded/truncated to; chosen above the longest
# comment found earlier (1403 whitespace-separated words).
max_len = 1500

# Keras tokenizer with no vocabulary cap: num_words=None keeps every unique word.
token = text.Tokenizer(num_words=None)

# Build the word -> integer vocabulary from the train and validation comments together.
# Indices are assigned by frequency, so the most frequent word gets index 1.
corpus = list(xtrain) + list(xvalid)
token.fit_on_texts(corpus)

# word_index is the resulting dictionary, e.g. word_index['the'] might be 1.
word_index = token.word_index

# Replace every comment by the list of its word indices,
# e.g. "the cat" -> [1, 2] if "the" has index 1 and "cat" has index 2.
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# Models need fixed-length input, so bring every sequence to exactly max_len entries:
# shorter sequences are filled with zeros and longer ones are cut. With the Keras
# defaults used here (padding='pre', truncating='pre') both happen at the beginning.
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

In [None]:
# Baseline: a SimpleRNN on top of embeddings learned from scratch, followed by one dense unit.
# The layers are passed to Sequential as a list; they are stacked in the order given.
model = Sequential([
    # Maps each word index to a trainable 300-dimensional vector.
    # The vocabulary size is len(word_index) + 1 because index 0 is used for padding.
    # input_length=max_len declares the (padded) length of every input sequence.
    Embedding(len(word_index) + 1, 300, input_length=max_len),

    # 100 recurrent units that read the embedded sequence one time step at a time,
    # carrying their state forward; the layer outputs a single 100-dimensional vector
    # summarising the whole sequence.
    SimpleRNN(100),

    # One sigmoid unit squashes that vector to a value in (0, 1), read as the
    # probability that the comment belongs to the positive (toxic) class.
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy is the standard loss for 0/1 labels; Adam adapts the learning
# rate during training; accuracy is reported as the training metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [None]:
# Train the SimpleRNN on the padded training sequences only (no validation data is
# passed, so the metrics logged below are training metrics).
model.fit(x=xtrain_pad, y=ytrain, batch_size=64, epochs=5)

Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 3s/step - accuracy: 0.8342 - loss: 0.4168
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 2s/step - accuracy: 0.9499 - loss: 0.1384
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 3s/step - accuracy: 0.9967 - loss: 0.0192
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 3s/step - accuracy: 0.9998 - loss: 0.0036
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 3s/step - accuracy: 1.0000 - loss: 0.0013


<keras.src.callbacks.history.History at 0x79ca48fc3af0>

## Importance of padding

When training a neural network, we usually feed an input into the network, take the output, compute the loss, backpropagate, and then repeat with the next input. In practice, it is far more efficient to process data in batches rather than one by one, i.e. feed 64 inputs and get 64 outputs. We do this by using matrices of shape [batch_size x sequence_length]. If the sequences have variable length, sequence_length corresponds to the longest sequence, and we fill the shorter sequences with a pad token (usually 0) to fit the matrix size. This special token is then masked so that it is not taken into account in the loss calculation, etc.

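If it is implemented right, it shouldn’t have consequences, because such pad tokens are masked. Still, there are some interesting side effects… (Note that the models in this notebook do not enable masking — the `Embedding` layers are created without `mask_zero=True` — so the padded zeros are processed like any other token.)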

The first thing one may realize is that we should minimize the amount of padding. Pad token does not impact the model learning, but it is still processed, and therefore it is basically a waste of computational resource. Recall that we pad when sequences are of different length. Therefore, sorting our data by length is a good way to limit padding.

But here’s the trap: neural networks do not like sorted data, especially for large datasets. Intuitively, it kind of makes sense. The neural network may locally overfit to the current length of the sequences. When the dataset is large, the epoch begins with only short sequences, so the network may adapt to them, and then progressively adapt to longer ones. A good strategy is therefore to sort the data, split it into batches, and then shuffle the order of the batches. Doing this allows you to minimize padding while providing data in no particular order, so the network won’t try to specialize.

To sum up, padding does not change anything, theoretically. In practice, it makes it possible to use batches during training, improving performance. One still needs to be careful about the batching implementation, since it may create side effects.

## AUC (Area under the curve):


AUC stands for **Area Under the Curve** and is often used in the context of ROC (Receiver Operating Characteristic) curves to evaluate the performance of a binary classification model. Here's a detailed explanation:

### 1. **ROC Curve:**
The ROC curve is a graphical representation of a classifier's performance across different thresholds. It plots two metrics:
- **True Positive Rate (TPR):** Also known as Sensitivity or Recall, it is the proportion of actual positives that are correctly identified by the model.

$\text{TPR} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Negatives (FN)}}$

- **False Positive Rate (FPR):** It is the proportion of actual negatives that are incorrectly identified as positive by the model.
$\text{FPR} = \frac{\text{False Positives (FP)}}{\text{False Positives (FP)} + \text{True Negatives (TN)}}$

As the classification threshold is varied, the TPR and FPR values change, resulting in a curve when TPR is plotted against FPR.

### 2. **AUC - Area Under the Curve:**
- **AUC** refers to the area under the ROC curve. It provides a single scalar value to summarize the overall performance of the classifier.
- **Interpretation of AUC:**
  - **AUC = 1.0:** Perfect classifier. The model distinguishes between all positive and negative instances perfectly.
  - **AUC = 0.5:** No discriminative power. The model performs no better than random guessing.
  - **AUC < 0.5:** The model is worse than random guessing, which usually indicates that the model is systematically wrong.

### 3. **Why AUC is Useful:**
- **Threshold Independence:** AUC considers all possible classification thresholds, providing a more comprehensive evaluation of the model's performance than accuracy, which relies on a specific threshold.
- **Robustness to Imbalanced Classes:** AUC is particularly useful in situations where the classes are imbalanced (e.g., a dataset where 90% of the samples are negative and 10% are positive). Unlike accuracy, which can be misleading in such cases, AUC focuses on the model's ability to rank positive instances higher than negative ones.

### 4. **How to Use AUC:**
In practice, AUC is often used as a metric when training and evaluating models, especially in binary classification problems. Many machine learning libraries, like Scikit-learn, provide functions to calculate the ROC AUC score directly.

### 5. **AUC in Multi-Class Classification:**
For multi-class classification, AUC can be extended by calculating the AUC for each class separately (using a one-vs-rest approach) and then averaging the results.

### Summary
AUC is a powerful metric that evaluates the ability of a binary classifier to distinguish between classes across various thresholds, offering a robust measure of model performance, especially in imbalanced datasets.

In [None]:
# Predicted probability of the toxic class for every validation comment.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction between 0 and 1, so print it as a plain number;
# appending a percent sign would display e.g. 0.88 as "0.88%".
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 278ms/step
Auc: 0.88%


In [None]:
# Running list of {'Model', 'AUC_Score'} records, one per trained model,
# started here with the SimpleRNN validation AUC.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)}]

We can see our model reaches a training accuracy of 1.0, which is just insane. We are clearly overfitting, I know, but this was the simplest model of all; we can tune a lot of hyperparameters such as the number of RNN units, and we can add batch normalization, dropout, etc. to get better results. The point is that we got an AUC score of 0.88 without much effort, and we now have learnt about RNNs. Deep learning is really revolutionary.


## Word Embeddings:
**GloVe** (Global Vectors for Word Representation) and **FastText** are two popular methods for generating word embeddings, but they have different approaches and characteristics. Here's a comparison of the two:

### **GloVe (Global Vectors for Word Representation)**

#### **Overview:**
- **GloVe** is an unsupervised learning algorithm designed to learn word embeddings by capturing global statistical information from a corpus.
- Developed by Stanford researchers, it is based on matrix factorization techniques applied to word co-occurrence matrices.

#### **Key Features:**
1. **Co-occurrence Matrix:**
   - GloVe constructs a word co-occurrence matrix from the corpus, which counts how often words appear together within a specified context window.
   - The co-occurrence matrix is then factorized to learn word vectors.

2. **Objective Function:**
   - The objective of GloVe is to find word vectors such that their dot product approximates the log of the word’s co-occurrence probability.
   - The loss function is designed to ensure that the similarity between word vectors reflects the co-occurrence statistics.

3. **Fixed Embeddings:**
   - Once trained, GloVe embeddings are fixed and do not change with new data. The model learns a set of embeddings for words based on the training corpus.

4. **Performance:**
   - GloVe embeddings are known for capturing semantic meaning and word relationships effectively. However, the quality of embeddings depends heavily on the corpus used for training.

5. **Training Speed:**
   - Training GloVe can be computationally expensive, especially with large corpora, due to the matrix factorization step.

### **FastText**

#### **Overview:**
- **FastText** is an extension of Word2Vec developed by Facebook's AI Research (FAIR) team.
- It improves upon Word2Vec by considering subword information, allowing it to generate embeddings for out-of-vocabulary (OOV) words.

#### **Key Features:**
1. **Subword Information:**
   - FastText represents each word as a bag of character n-grams (subwords). For example, the word "apple" might be represented by n-grams such as "app", "ppl", "ple", etc.
   - This approach helps capture morphological information and better handle rare or OOV words.

2. **Training Process:**
   - FastText is based on a skip-gram model, similar to Word2Vec, but incorporates subword information to improve the quality of embeddings.
   - It uses a hierarchical softmax or negative sampling for efficient training.

3. **Dynamic Embeddings:**
   - FastText embeddings can be updated with new data. The subword approach allows it to generate embeddings for previously unseen words based on their character n-grams.

4. **Performance:**
   - FastText tends to perform better on morphologically rich languages and in scenarios with many OOV words because it leverages subword information.
   - It also often yields better results on tasks requiring a finer understanding of word structure.

5. **Training Speed:**
   - FastText is generally faster to train than GloVe, especially on large datasets, due to its more efficient training algorithm.

### **Comparison Summary:**

- **Embedding Generation:**
  - **GloVe:** Uses matrix factorization on word co-occurrence matrices.
  - **FastText:** Uses subword information with a skip-gram model or CBOW (Continuous Bag of Words).

- **Handling OOV Words:**
  - **GloVe:** Does not handle OOV (Out-of-Vocabulary) words directly; embeddings are fixed once trained.
  - **FastText:** Can generate embeddings for OOV (Out-of-Vocabulary) words based on subword information.

- **Corpus Dependency:**
  - **GloVe:** Relies heavily on the quality and size of the co-occurrence matrix built from the training corpus.
  - **FastText:** Can adapt to new words and changes in the data better due to its subword approach.

- **Performance and Application:**
  - **GloVe:** Good for capturing global word relationships and semantic meaning.
  - **FastText:** Better for handling morphologically rich languages and dealing with rare or OOV words.

In summary, GloVe is useful for its ability to capture global word relationships based on word co-occurrences, while FastText enhances the representation of words by incorporating subword information, making it more robust to OOV words and morphological variations.

In [None]:
# All vocabulary words known to the tokenizer, in word_index order.
words = list(word_index)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# Load the pre-trained Wiki-words-250 text-embedding module from TensorFlow Hub.
# NOTE(review): this does not appear to be GloVe — the TF Hub model card describes
# Wiki-words-250 as a word2vec skip-gram embedding trained on English Wikipedia; confirm.
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

def get_word_embeddings(words):
    """
    Given a list of words, return their embeddings using the TensorFlow Hub model `embed`.

    Every input word receives an entry in the result, so the size of the returned
    dictionary equals the number of distinct input words and says nothing about how
    many of them the pre-trained model actually knows.

    Parameters:
        words (iterable of str): Words to get embeddings for. May be empty.

    Returns:
        embeddings (dict): A dictionary with words as keys and their embeddings
        (NumPy vectors) as values; empty if `words` is empty.
    """
    words = list(words)
    if not words:
        # tf.constant([]) would produce a float32 tensor, which the text module
        # cannot consume; there is nothing to embed anyway.
        return {}

    # Pin the dtype so the module always receives a string tensor.
    word_tensor = tf.constant(words, dtype=tf.string)

    # One embedding row per input word, in input order.
    embeddings = embed(word_tensor).numpy()

    return dict(zip(words, embeddings))

# Embed every word of the tokenizer vocabulary collected in `words`.
embeddings_index = get_word_embeddings(words)

print('Found %s word vectors.' % len(embeddings_index))


Found 43496 word vectors.


## LSTMs



In [None]:
# Build the lookup table for the Embedding layer: row i holds the pre-trained
# 250-dimensional vector of the word whose tokenizer index is i. Row 0 and any
# word without a vector in embeddings_index stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 250))
for vocab_word, vocab_id in tqdm(word_index.items()):
    pretrained_vector = embeddings_index.get(vocab_word)
    if pretrained_vector is None:
        continue
    embedding_matrix[vocab_id] = pretrained_vector

100%|██████████| 43496/43496 [00:00<00:00, 320682.17it/s]


## Key differences between `dropout` and `recurrent_dropout`:

- dropout affects the input to the LSTM layer, including the outputs from the previous layer (such as the embedding or dense layer).
- recurrent_dropout affects the recurrent connections within the LSTM layer, specifically the connections that maintain and update the hidden state over time.

**Purpose:**

- dropout helps prevent overfitting by regularizing the input features to the LSTM.
- recurrent_dropout helps prevent overfitting by regularizing the recurrent connections and states within the LSTM.

In [None]:
# LSTM classifier on top of the frozen pre-trained embeddings.
model = Sequential([
    # weights=[embedding_matrix] initialises the layer with the pre-trained vectors;
    # trainable=False keeps them fixed, so they are not updated during training.
    Embedding(len(word_index) + 1, 250, weights=[embedding_matrix], input_length=max_len, trainable=False),
    # dropout regularises the layer inputs, recurrent_dropout the recurrent connections.
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train the LSTM on the padded training sequences (metrics logged are training metrics).
model.fit(x=xtrain_pad, y=ytrain, batch_size=64, epochs=5)

Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 3s/step - accuracy: 0.8857 - loss: 0.3481
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 3s/step - accuracy: 0.9191 - loss: 0.2245
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m485s[0m 3s/step - accuracy: 0.9301 - loss: 0.1985
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 3s/step - accuracy: 0.9288 - loss: 0.1966
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 3s/step - accuracy: 0.9302 - loss: 0.1903


<keras.src.callbacks.history.History at 0x79ca558a55d0>

In [None]:
# Predicted probability of the toxic class for every validation comment.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction between 0 and 1, so print it as a plain number;
# appending a percent sign would display e.g. 0.93 as "0.93%".
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 804ms/step
Auc: 0.93%


In [None]:
# Record the LSTM's validation AUC next to the earlier models.
lstm_auc = roc_auc(scores, yvalid)
scores_model.append({'Model': 'LSTM', 'AUC_Score': lstm_auc})

We now see that the model is not overfitting and achieves an AUC score of 0.93, which is quite commendable; the gap between training accuracy and AUC has also closed. In this case we used dropout, which helped prevent overfitting the data.

Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) networks are both advanced types of Recurrent Neural Networks (RNNs) designed to handle sequential data. They improve upon traditional RNNs by addressing some of their limitations, such as the vanishing gradient problem.

### **1. Benefits of LSTM Over Traditional RNN**

**Traditional RNNs:**
- **Issue:** Traditional RNNs suffer from the vanishing gradient problem, where gradients used for training become very small, causing the network to learn very slowly or not at all. This makes it difficult for RNNs to capture long-term dependencies in sequences.
- **Issue:** RNNs also struggle with the exploding gradient problem, where gradients become excessively large, leading to unstable training.

**LSTM Networks:**
- **Solution to Vanishing Gradient:** LSTMs are designed with a more complex architecture that includes gates to regulate the flow of information, which helps mitigate the vanishing gradient problem. They have memory cells that can maintain information over long periods.
- **Components:**
  - **Cell State:** LSTMs have a cell state that carries long-term dependencies through the network. This state is modified by gates but is largely preserved over time.
  - **Gates:** LSTMs use three types of gates to control the information flow:
    - **Forget Gate:** Decides what information from the cell state should be discarded.
    - **Input Gate:** Determines what new information should be added to the cell state.
    - **Output Gate:** Controls what information from the cell state should be output.

**Benefits:**
- **Captures Long-Term Dependencies:** LSTMs are capable of learning long-term dependencies in sequences more effectively than traditional RNNs.
- **Better Gradient Flow:** The gating mechanisms help maintain stable gradients, making training more efficient and effective.

### **2. Benefits of GRU Over LSTM**

**Gated Recurrent Unit (GRU):**
- **Architecture:** GRUs simplify the LSTM architecture by combining the forget and input gates into a single update gate and by using a reset gate instead of a separate cell state.
- **Components:**
  - **Update Gate:** Determines how much of the past information (previous hidden state) needs to be passed along to the future.
  - **Reset Gate:** Controls how much of the past information should be ignored.

**Benefits Over LSTM:**
- **Fewer Parameters:** GRUs have fewer parameters than LSTMs because they combine the forget and input gates into a single update gate and do not use a separate cell state. This results in faster computation and less memory usage.
- **Simpler Architecture:** GRUs have a simpler structure compared to LSTMs, which can lead to faster training times and easier implementation.
- **Performance:** In many cases, GRUs perform comparably to LSTMs and can be preferred for certain tasks due to their simplicity and reduced computational overhead.

### **Summary of Benefits**

**LSTM Over RNN:**
- **Long-Term Memory:** Capable of learning long-term dependencies due to the memory cell and gating mechanisms.
- **Stable Training:** Better handling of vanishing and exploding gradient problems.

**GRU Over LSTM:**
- **Reduced Complexity:** Fewer gates and parameters result in a simpler and faster model.
- **Comparable Performance:** Often achieves similar performance to LSTMs with less computational cost.

### **When to Use Which:**

- **Use LSTM:** When your model needs to capture very long-term dependencies and you have sufficient computational resources.
- **Use GRU:** When you need a simpler model with fewer parameters and faster training time, especially if performance is comparable to LSTM for your specific task.

In practice, choosing between LSTM and GRU often depends on empirical testing and specific use cases. Both have proven effective in various applications, and the best choice may vary depending on the nature of your data and task.

## **GRU** (Gated Recurrent Unit):


In [None]:
# GRU classifier on top of the frozen pre-trained embeddings.
model = Sequential([
    Embedding(len(word_index) + 1, 250, weights=[embedding_matrix], input_length=max_len, trainable=False),
    # rate is the fraction of the input units to drop.
    SpatialDropout1D(rate=0.3),
    GRU(300),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Train the GRU on the padded training sequences (metrics logged are training metrics).
model.fit(x=xtrain_pad, y=ytrain, batch_size=64, epochs=5)

Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1469s[0m 10s/step - accuracy: 0.8843 - loss: 0.3226
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1480s[0m 9s/step - accuracy: 0.9324 - loss: 0.1953
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1827s[0m 12s/step - accuracy: 0.9212 - loss: 0.2442
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1819s[0m 12s/step - accuracy: 0.9380 - loss: 0.1699
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1632s[0m 11s/step - accuracy: 0.9480 - loss: 0.1500


<keras.src.callbacks.history.History at 0x79ca55179d20>

In [None]:
# Predicted probability of the toxic class for every validation comment.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction between 0 and 1, so print it as a plain number;
# appending a percent sign would display e.g. 0.97 as "0.97%".
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 3s/step
Auc: 0.97%


In [None]:
# Record the GRU's validation AUC next to the earlier models.
gru_auc = roc_auc(scores, yvalid)
scores_model.append({'Model': 'GRU', 'AUC_Score': gru_auc})

## **Bidirectional RNN:**

 - A type of Recurrent Neural Network (RNN) in which the input sequence is processed in both the forward and backward directions. This allows the model to capture context from both past and future states.
 - It processes the sequence in two directions:
    1. Forward Pass: Processes the sequence from the first to the last timestep.
    2. Backward Pass: Processes the sequence from the last to the first timestep.
 - Advantages of a Bidirectional RNN:
   1. Improved Context Understanding: it understands the context of each word in a sentence by considering both the previous and the next words.
   2. Enhanced Performance: commonly used in text classification, speech recognition, and machine translation.


## **Seq2Seq Model:**
Seq2Seq is a neural network architecture designed for tasks where input sequences are transformed into output sequences. It is used for machine translation, text summarization, and chatbot responses.

- How it works:
  1. Encoder: The encoder RNN processes the input sequence and compresses it into a fixed-length context vector (also called thought vector or hidden state), which summarizes the information from the input sequence.
  2. Decoder: The decoder RNN uses this context vector to generate the output sequence, one step at a time.
- Advantages of Seq2Seq:
  1. Versatility: Effective for various sequence-to-sequence tasks like translation, summarization, and question answering.
  2. Attention Mechanism: Enhances the model's ability to handle long sequences by focusing on relevant parts of the input.

In [None]:
# Bidirectional LSTM classifier on top of the frozen pre-trained embeddings.
model = Sequential([
    Embedding(len(word_index) + 1, 250, weights=[embedding_matrix], input_length=max_len, trainable=False),
    # The wrapped LSTM is run over the sequence in both directions.
    Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Train the bidirectional LSTM on the padded training sequences (metrics logged are training metrics).
model.fit(x=xtrain_pad, y=ytrain, batch_size=64, epochs=5)

Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2583s[0m 17s/step - accuracy: 0.8981 - loss: 0.3331
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2578s[0m 17s/step - accuracy: 0.9185 - loss: 0.2353
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2601s[0m 17s/step - accuracy: 0.9213 - loss: 0.2134
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2615s[0m 17s/step - accuracy: 0.9289 - loss: 0.1998
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2582s[0m 17s/step - accuracy: 0.9378 - loss: 0.1749


<keras.src.callbacks.history.History at 0x7eff786a4340>

In [None]:
# Predicted probability of the toxic class for every validation comment.
scores = model.predict(xvalid_pad)
# roc_auc returns a fraction between 0 and 1, so print it as a plain number;
# appending a percent sign would display e.g. 0.94 as "0.94%".
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 4s/step
Auc: 0.94%


## BERT

In [4]:
# Loading Dependencies
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers

from tokenizers import BertWordPieceTokenizer

In [5]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode texts into fixed-length sequences of integer token ids for BERT input.

    Note that this reconfigures `tokenizer` in place: truncation and padding to
    `maxlen` stay enabled on it after the call.

    Parameters:
        texts: sliceable collection of strings whose slices provide `.tolist()`.
        tokenizer: object exposing `enable_truncation`, `enable_padding` and `encode_batch`.
        chunk_size: number of texts handed to `encode_batch` at a time.
        maxlen: length every id sequence is truncated or padded to.

    Returns:
        NumPy array of token ids with one row per input text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    encoded_ids = []
    # Encode in chunks so that only `chunk_size` texts are materialised as a list at once.
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start+chunk_size].tolist()
        encoded_ids += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_ids)

In [6]:
# Shared configuration for the BERT section

# Passed to Dataset.prefetch so tf.data chooses the buffer size itself.
# tf.data.AUTOTUNE is the stable name for the constant previously reached
# through tf.data.experimental.
AUTO = tf.data.AUTOTUNE


# Configuration
EPOCHS = 3        # number of epochs passed to model.fit
BATCH_SIZE = 16   # batch size used by the tf.data pipelines
MAX_LEN = 192     # truncation/padding length for fast_encode and the model's input length

## Tokenization:

- **distilbert-base-multilingual-cased:** This model is designed to handle multiple languages and respects the case of the text (i.e., it distinguishes between uppercase and lowercase).
- **BertWordPieceTokenizer('vocab.txt', lowercase=False):** This line initializes a tokenizer from the `tokenizers` library, a fast tokenization library developed by Hugging Face. Specifically, it loads a `BertWordPieceTokenizer` from the `vocab.txt` file written by `tokenizer.save_pretrained('.')`.
- **BertWordPieceTokenizer** is a tokenizer based on the WordPiece algorithm, which is used in BERT models. It performs subword tokenization efficiently, breaking words down into smaller components.

In [7]:
# Fetch the pretrained multilingual DistilBERT tokenizer.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(
    'distilbert-base-multilingual-cased'
)

# Write the tokenizer's files into the working directory so the vocabulary
# file read on the next line is available locally.
tokenizer.save_pretrained('.')

# Rebuild it as a `tokenizers` WordPiece tokenizer (case preserved) and leave
# it as the last expression so the cell displays its settings.
fast_tokenizer = BertWordPieceTokenizer(
    'vocab.txt',
    lowercase=False,
)
fast_tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

In [8]:
# Token-id arrays for the three splits; each text column is cast to str
# before being encoded with the shared tokenizer and MAX_LEN.
x_train, x_valid, x_test = (
    fast_encode(column.astype(str), fast_tokenizer, maxlen=MAX_LEN)
    for column in (train.comment_text, validation.comment_text, test.content)
)

# Labels taken from the `toxic` column of the two labelled splits.
y_train, y_valid = train.toxic.values, validation.toxic.values

100%|██████████| 874/874 [01:03<00:00, 13.71it/s]
100%|██████████| 32/32 [00:01<00:00, 18.26it/s]
100%|██████████| 250/250 [00:17<00:00, 14.51it/s]


In [9]:
# Training pipeline: (ids, label) pairs, repeated indefinitely, shuffled with
# a 2048-element buffer, batched, then prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batched once, cached after batching, then prefetched.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache()
valid_dataset = valid_dataset.prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

## Starting Training
To use a different model, replace the model class taken from `transformers` (here `TFDistilBertModel`) and the pretrained model name, and adapt the rest of the code accordingly.

In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from transformers import TFDistilBertModel

def build_model(transformer, max_len=512):
    """
    Build and compile a binary classifier on top of a transformer encoder.

    The returned Keras model takes an int32 sequence of ``max_len`` token ids
    (input named "input_word_ids"), passes it through ``transformer``, keeps
    the output at sequence position 0 and feeds that to a single sigmoid
    unit. It is compiled with Adam (learning rate 1e-5), binary
    cross-entropy loss and an accuracy metric. No training happens here.

    NOTE(review): the transformer is called from inside a Lambda layer;
    confirm that its weights are tracked and updated by the returned model
    during fit.
    """
    hidden_size = transformer.config.hidden_size

    token_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # First element of the transformer's output; its per-example shape is
    # declared explicitly as (max_len, hidden_size).
    hidden_states = Lambda(
        lambda ids: transformer(ids)[0],
        output_shape=(max_len, hidden_size),
    )(token_ids)

    # Position 0 of the sequence carries the [CLS] token's output.
    cls_vector = hidden_states[:, 0, :]
    probability = Dense(1, activation='sigmoid')(cls_vector)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        metrics=['accuracy'],
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    )
    return classifier

# Load the transformer model
transformer_layer = TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

# Build the model with the MAX_LEN from the configuration cell, i.e. the same
# length the inputs were encoded with. It is deliberately not redefined here:
# a second definition could drift from the encoded sequence length.
model = build_model(transformer_layer, max_len=MAX_LEN)

# Print the model summary
model.summary()


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [11]:
# train_dataset repeats indefinitely, so the length of an epoch has to be
# given explicitly: the number of full batches in the training set.
n_steps = len(x_train) // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

Epoch 1/3
[1m13971/13971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1454s[0m 103ms/step - accuracy: 0.8141 - loss: 0.4254 - val_accuracy: 0.8462 - val_loss: 0.4401
Epoch 2/3
[1m13971/13971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1439s[0m 103ms/step - accuracy: 0.9037 - loss: 0.2940 - val_accuracy: 0.8462 - val_loss: 0.4269
Epoch 3/3
[1m13971/13971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1438s[0m 103ms/step - accuracy: 0.9031 - loss: 0.2829 - val_accuracy: 0.8465 - val_loss: 0.4180
