# Pretrained Google Word2Vec with Pure RNN (GRU)

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, GRU, Dropout, Dense
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import KeyedVectors

# Hyperparameters
MAX_FEATURES = 20000  # Max number of words in vocabulary
MAX_TEXT_LENGTH = 100  # Max length of each comment
EMBEDDING_DIM = 300  # Google Word2Vec uses 300-dimensional vectors
BATCH_SIZE = 32
EPOCHS = 4
VALIDATION_SPLIT = 0.1
CLASSES_LIST = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Load Data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Load pre-trained Google News Word2Vec
word2vec_path = "./input/GoogleNews-vectors-negative300.bin"  # Update with the actual path
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Function to tokenize and pad text sequences
def get_X_train_X_test(train, test):
    train_raw_text = train["comment_text"].fillna("MISSINGVALUE").values
    test_raw_text = test["comment_text"].fillna("MISSINGVALUE").values

    tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
    tokenizer.fit_on_texts(list(train_raw_text))

    train_tokenized = tokenizer.texts_to_sequences(train_raw_text)
    test_tokenized = tokenizer.texts_to_sequences(test_raw_text)

    return (
        sequence.pad_sequences(train_tokenized, maxlen=MAX_TEXT_LENGTH),
        sequence.pad_sequences(test_tokenized, maxlen=MAX_TEXT_LENGTH),
        tokenizer.word_index
    )

# Function to get target labels
def get_Y(train):
    return train[CLASSES_LIST].values

# Function to create embedding matrix from pre-trained Word2Vec
def create_embedding_matrix(word_index, embedding_dim, word2vec_model):
    vocab_size = min(MAX_FEATURES, len(word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in word_index.items():
        if i >= MAX_FEATURES:
            continue
        if word in word2vec_model:
            embedding_matrix[i] = word2vec_model[word]

    return embedding_matrix

# Function to define the Pure RNN (GRU) Model
def get_model(embedding_matrix, vocab_size):
    inp = Input(shape=(MAX_TEXT_LENGTH,))
    main = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, 
                     weights=[embedding_matrix], trainable=False)(inp)  # Freeze pre-trained embeddings
    main = Dropout(0.2)(main)
    main = GRU(64, return_sequences=True)(main)
    main = GRU(32, return_sequences=False)(main)
    main = Dense(16, activation="relu")(main)
    main = Dense(6, activation="sigmoid")(main)  # Multi-label classification (Toxic Comments)
    
    model = Model(inputs=inp, outputs=main)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Function to train model and make predictions
def train_fit_predict(model, X_train, X_test, y):
    file_path = "weights_best.keras"  # Use .keras instead of .hdf5

    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

    callbacks_list = [checkpoint, early]

    model.fit(
        X_train, y, batch_size=BATCH_SIZE, epochs=EPOCHS,
        verbose=1, validation_split=VALIDATION_SPLIT, callbacks=callbacks_list
    )

    model.load_weights(file_path)  # Ensure filename matches .keras
    return model.predict(X_test)

# Function to create submission file
def submit(y_test):
    sample_submission = pd.read_csv("./input/sample_submission.csv")
    sample_submission[CLASSES_LIST] = y_test
    sample_submission.to_csv("baseline_word2vec_rnn.csv", index=False)

# Process text data
X_train, X_test, word_index = get_X_train_X_test(train, test)
y = get_Y(train)

# Create Word2Vec embedding matrix from pre-trained embeddings
embedding_matrix = create_embedding_matrix(word_index, EMBEDDING_DIM, word2vec)

# Get model
vocab_size = min(MAX_FEATURES, len(word_index) + 1)
model = get_model(embedding_matrix, vocab_size)

# Train model and predict
y_test = train_fit_predict(model, X_train, X_test, y)

# Submit predictions
submit(y_test)


Epoch 1/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.9638 - loss: 0.0862
Epoch 1: val_loss improved from inf to 0.05219, saving model to weights_best.keras
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 37ms/step - accuracy: 0.9638 - loss: 0.0862 - val_accuracy: 0.9939 - val_loss: 0.0522
Epoch 2/4
[1m4487/4488[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 35ms/step - accuracy: 0.9858 - loss: 0.0492
Epoch 2: val_loss improved from 0.05219 to 0.04918, saving model to weights_best.keras
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 36ms/step - accuracy: 0.9858 - loss: 0.0492 - val_accuracy: 0.9450 - val_loss: 0.0492
Epoch 3/4
[1m4487/4488[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 35ms/step - accuracy: 0.9826 - loss: 0.0457
Epoch 3: val_loss did not improve from 0.04918
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 36ms/step - accuracy: 0.9826 - loss

# Non-pretrained Word2Vec with Pure RNN (GRU)

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, GRU, Dropout, Dense
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec

# Hyperparameters
MAX_FEATURES = 20000  
MAX_TEXT_LENGTH = 100  
EMBEDDING_DIM = 100  
BATCH_SIZE = 32
EPOCHS = 4
VALIDATION_SPLIT = 0.1
CLASSES_LIST = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Load Data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Train Word2Vec from scratch
sentences = train["comment_text"].fillna("MISSINGVALUE").apply(lambda x: x.split()).tolist()
word2vec_model = Word2Vec(sentences, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4)

# Tokenization
def get_X_train_X_test(train, test):
    tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
    tokenizer.fit_on_texts(train["comment_text"].fillna("MISSINGVALUE"))

    train_seq = tokenizer.texts_to_sequences(train["comment_text"].fillna("MISSINGVALUE"))
    test_seq = tokenizer.texts_to_sequences(test["comment_text"].fillna("MISSINGVALUE"))

    return sequence.pad_sequences(train_seq, maxlen=MAX_TEXT_LENGTH), sequence.pad_sequences(test_seq, maxlen=MAX_TEXT_LENGTH), tokenizer.word_index

# Convert text to vectors
X_train, X_test, word_index = get_X_train_X_test(train, test)
y_train = train[CLASSES_LIST].values

# Create Word2Vec embedding matrix
def create_embedding_matrix(word_index, word2vec_model):
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]
    return embedding_matrix

embedding_matrix = create_embedding_matrix(word_index, word2vec_model)

# Define Pure RNN Model (GRU)
def get_rnn_model(embedding_matrix):
    inp = Input(shape=(MAX_TEXT_LENGTH,))
    x = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], trainable=True)(inp)
    x = Dropout(0.2)(x)
    x = GRU(64, return_sequences=True)(x)
    x = GRU(32, return_sequences=False)(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Train Model
model = get_rnn_model(embedding_matrix)
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=VALIDATION_SPLIT, callbacks=[EarlyStopping(monitor="val_loss", patience=3)])


Epoch 1/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m730s[0m 162ms/step - accuracy: 0.9037 - loss: 0.0946 - val_accuracy: 0.9935 - val_loss: 0.0479
Epoch 2/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m739s[0m 165ms/step - accuracy: 0.9936 - loss: 0.0437 - val_accuracy: 0.9931 - val_loss: 0.0466
Epoch 3/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m736s[0m 164ms/step - accuracy: 0.9911 - loss: 0.0390 - val_accuracy: 0.9882 - val_loss: 0.0473
Epoch 4/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m740s[0m 165ms/step - accuracy: 0.9867 - loss: 0.0346 - val_accuracy: 0.9930 - val_loss: 0.0489


<keras.src.callbacks.history.History at 0x2c959f2f700>

# Non-pretrained Word2Vec with Pure CNN

In [8]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

# Define Pure CNN Model
def get_cnn_model(embedding_matrix):
    inp = Input(shape=(MAX_TEXT_LENGTH,))
    x = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], trainable=True)(inp)
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Flatten()(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Train Model
model = get_cnn_model(embedding_matrix)
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=VALIDATION_SPLIT, callbacks=[EarlyStopping(monitor="val_loss", patience=3)])


Epoch 1/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m582s[0m 130ms/step - accuracy: 0.8321 - loss: 0.1038 - val_accuracy: 0.9917 - val_loss: 0.0552
Epoch 2/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m572s[0m 128ms/step - accuracy: 0.9733 - loss: 0.0497 - val_accuracy: 0.9924 - val_loss: 0.0523
Epoch 3/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 128ms/step - accuracy: 0.9789 - loss: 0.0442 - val_accuracy: 0.9772 - val_loss: 0.0539
Epoch 4/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m574s[0m 128ms/step - accuracy: 0.9548 - loss: 0.0405 - val_accuracy: 0.9917 - val_loss: 0.0538


<keras.src.callbacks.history.History at 0x2c9d430c5e0>

# Non-pretrained Word2Vec with Combination RNN+CNN

In [9]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Bidirectional

# Define Hybrid CNN + RNN Model
def get_cnn_rnn_model(embedding_matrix):
    inp = Input(shape=(MAX_TEXT_LENGTH,))
    x = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], trainable=True)(inp)
    
    # CNN Feature Extraction
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    
    # RNN for Sequential Learning
    x = Bidirectional(GRU(64, return_sequences=True))(x)
    x = GRU(32, return_sequences=False)(x)
    
    x = Dense(16, activation="relu")(x)
    x = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Train Model
model = get_cnn_rnn_model(embedding_matrix)
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=VALIDATION_SPLIT, callbacks=[EarlyStopping(monitor="val_loss", patience=3)])


Epoch 1/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 146ms/step - accuracy: 0.9613 - loss: 0.0844 - val_accuracy: 0.9940 - val_loss: 0.0534
Epoch 2/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m655s[0m 146ms/step - accuracy: 0.9943 - loss: 0.0469 - val_accuracy: 0.9940 - val_loss: 0.0507
Epoch 3/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m662s[0m 148ms/step - accuracy: 0.9940 - loss: 0.0424 - val_accuracy: 0.9935 - val_loss: 0.0499
Epoch 4/4
[1m4488/4488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m680s[0m 151ms/step - accuracy: 0.9937 - loss: 0.0390 - val_accuracy: 0.9934 - val_loss: 0.0497


<keras.src.callbacks.history.History at 0x2c94f7d3430>