# Setup

## Download Pretrained Embeddings

In [None]:
# Download the glove.840B.300d archive from the stanfordnlp/glove repository.
# -nc (no-clobber) skips the download when the zip is already on disk, so
# re-running the notebook does not save a duplicate "*.zip.1" copy.
# Delete a partially downloaded zip by hand before re-running this cell.
!wget -nc https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip

In [None]:
# Download the glove.6B archive from the stanfordnlp/glove repository.
# -nc (no-clobber) skips the download when the zip is already on disk, so
# re-running the notebook does not save a duplicate "*.zip.1" copy.
# Delete a partially downloaded zip by hand before re-running this cell.
!wget -nc https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip

In [None]:
# Download the glove.twitter.27B archive from the stanfordnlp/glove repository.
# -nc (no-clobber) skips the download when the zip is already on disk, so
# re-running the notebook does not save a duplicate "*.zip.1" copy.
# Delete a partially downloaded zip by hand before re-running this cell.
!wget -nc https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip

In [None]:
# Extract the 840B archive into the working directory.
# -n never overwrites files that already exist, so a re-run skips them
# instead of stopping on unzip's interactive "replace?" prompt.
!unzip -n glove.840B.300d.zip

In [None]:
# Extract the 6B archive into the working directory.
# -n never overwrites files that already exist, so a re-run skips them
# instead of stopping on unzip's interactive "replace?" prompt.
!unzip -n glove.6B.zip

In [None]:
# Extract the Twitter 27B archive into the working directory.
# -n never overwrites files that already exist, so a re-run skips them
# instead of stopping on unzip's interactive "replace?" prompt.
!unzip -n glove.twitter.27B.zip

## Python Imports

In [None]:
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Embedding, Flatten, Dense

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Global Functions

In [None]:
def load_glove_vectors(glove_file, embedding_dim=None):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-empty line is expected to hold a token followed by its vector
    components, all separated by single spaces.

    Args:
        glove_file: Path to the GloVe ``.txt`` file (assumed UTF-8 encoded).
        embedding_dim: Number of vector components per line. When ``None``
            (the default) it is inferred from the first non-empty line,
            assuming that line's token contains no spaces.

    Returns:
        dict mapping each token (``str``) to a 1-D ``float32`` NumPy array of
        length ``embedding_dim``.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    dim = embedding_dim
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which is not UTF-8 everywhere (e.g. Windows).
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the single-space delimiter only: a bare split() would
            # also break on Unicode whitespace (e.g. a no-break space) that
            # occurs inside some tokens and corrupt the vector.
            values = line.rstrip().split(' ')
            if len(values) < 2:
                # Blank line, or a token with no vector: nothing to store.
                continue
            if dim is None:
                dim = len(values) - 1
            if len(values) <= dim:
                # Too few fields to hold a token plus a full vector.
                continue
            # The vector is always the last `dim` fields; whatever precedes it
            # is the token, which may itself contain spaces (e.g. ". . .").
            word = ' '.join(values[:-dim])
            vector = np.asarray(values[-dim:], dtype='float32')
            embeddings[word] = vector
    return embeddings

In [None]:
def plot_history(history):
    """Show training and validation curves for accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch series ``'acc'``, ``'val_acc'``, ``'loss'`` and
            ``'val_loss'`` (e.g. the value returned by ``model.fit``).
    """
    curves = history.history

    # One entry per panel: (train key, validation key, title, y-axis label).
    panels = (
        ('acc', 'val_acc', 'Model accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model loss', 'Loss'),
    )

    plt.figure(figsize=(12, 6))
    for position, (train_key, val_key, title, y_label) in enumerate(panels, start=1):
        # Left panel is accuracy, right panel is loss.
        plt.subplot(1, 2, position)
        plt.plot(curves[train_key])
        plt.plot(curves[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')

    plt.tight_layout()
    plt.show()


In [None]:
def fill_embedding_matrix(embeddings, word_index, nb_words, embedding_dim):
    """Build a dense lookup table of pretrained vectors for a vocabulary.

    Args:
        embeddings: Mapping from word to its pretrained vector.
        word_index: Mapping from word to its integer row index.
        nb_words: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        A ``(nb_words, embedding_dim)`` NumPy array. Row ``i`` holds the
        vector of the word whose index is ``i``; rows of words that have no
        entry in ``embeddings`` stay all-zero.
    """
    matrix = np.zeros((nb_words, embedding_dim))

    for token, row in word_index.items():
        # Indices at or beyond nb_words have no row in the matrix.
        if row < nb_words:
            pretrained = embeddings.get(token)
            # Words without a pretrained vector keep their zero row.
            if pretrained is not None:
                matrix[row] = pretrained

    return matrix

# Prepare Dataset

In [None]:
# Columns holding the six binary labels
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the training data, the test comments and the test labels
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
test_labels_df = pd.read_csv('dataset/test_labels.csv')

# Flag the test rows whose six labels are all -1, then keep only the others
mask = test_labels_df[label_columns].eq(-1).all(axis=1)
test_labels_df = test_labels_df.loc[~mask]

# Join the test comments with the remaining labels on 'id';
# the default inner join keeps only ids present in both frames
test_df = test_df.merge(test_labels_df, on='id')

In [None]:
# Names of the six binary target columns
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Comment text of the train and test sets, coerced to str
train_text = train_df['comment_text'].astype(str)
test_text = test_df['comment_text'].astype(str)

# Training labels
train_labels = train_df[label_columns]

# Take the test labels from the merged test_df rather than from test_labels_df:
# test_text comes from the merged frame, so this keeps every label row on the
# same row as its comment whatever the row order or id coverage of the two
# CSV files.
# NOTE(review): assumes test.csv has no label columns of its own (otherwise the
# merge would suffix them) - confirm.
test_labels = test_df[label_columns]

In [None]:
# Build a single vocabulary from the train and test comments together
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([train_text, test_text]))

# Encode each comment as the list of integer indices of its words in tokenizer.word_index
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_text, test_text)
)

# No maxlen is given, so every training sequence is padded to the length of the
# longest training sequence and nothing is truncated
train_data = pad_sequences(train_sequences)

# Pad or truncate the test sequences to the same width as train_data
padded_length = train_data.shape[1]
test_data = pad_sequences(test_sequences, maxlen=padded_length)

# GloVe

In [None]:
# Dimensionality of the vectors in the file loaded below
embedding_dim = 300
embeddings = load_glove_vectors("glove.6B.300d.txt")

# Vocabulary built by the tokenizer; the matrix gets one row more than there are words
# NOTE(review): presumably the extra row is for index 0, which the Keras Tokenizer
# leaves unassigned - confirm.
word_index = tokenizer.word_index
nb_words = len(word_index) + 1

# One row per word index, filled with the pretrained vector when one exists
embedding_matrix = fill_embedding_matrix(
    embeddings=embeddings,
    word_index=word_index,
    nb_words=nb_words,
    embedding_dim=embedding_dim,
)

## Pre-trained

In [None]:
# Classifier on top of frozen GloVe embeddings: the embedding layer starts from
# embedding_matrix and is excluded from training; its flattened output feeds
# six sigmoid units, one per label.
model = Sequential([
    Embedding(nb_words, embedding_dim, weights=[embedding_matrix], trainable=False),
    Flatten(),
    Dense(6, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for two epochs, holding out 10% of the training data for validation
history = model.fit(
    train_data,
    train_labels,
    epochs=2,
    batch_size=300,
    validation_split=0.1,
)

# Score the trained model on the test set
loss, accuracy = model.evaluate(test_data, test_labels)
print(f'Test accuracy: {accuracy}')

# Accuracy and loss curves for this run
plot_history(history)

# Persist the trained model
model.save('glove-6B-300d-Pre-trained.keras')

# To reload the saved model later:
# loaded_model = load_model('glove-6B-300d-Pre-trained.keras')

## Re-trained

In [None]:
# Same architecture as the frozen variant, but the embedding layer (initialised
# from embedding_matrix) is left trainable so its weights are updated in training.
model = Sequential([
    Embedding(nb_words, embedding_dim, weights=[embedding_matrix]),
    Flatten(),
    Dense(6, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for two epochs, holding out 10% of the training data for validation
history = model.fit(
    train_data,
    train_labels,
    epochs=2,
    batch_size=300,
    validation_split=0.1,
)

# Score the trained model on the test set
loss, accuracy = model.evaluate(test_data, test_labels)
print(f'Test accuracy: {accuracy}')

# Accuracy and loss curves for this run
plot_history(history)

# Persist the trained model
model.save('glove-6B-300d-Re-trained.keras')

# To reload the saved model later:
# loaded_model = load_model('glove-6B-300d-Re-trained.keras')