# Import

In [None]:
import os
import pickle
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, LSTM, Dense ,GRU ,SimpleRNN,Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Data Loading & Preparation


In [None]:

# Make sure the NLTK stop-word corpus is available locally, then load it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Seed the Python, NumPy and TensorFlow RNGs once, up front, so that
# Restart Kernel -> Run All produces comparable training runs.
SEED = 42
set_random_seed(SEED)

train_df = pd.read_csv('train_prepro.csv')
test_df = pd.read_csv('test_prepro.csv')

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a tokenizer pickle that was produced by a trusted pipeline.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Tokenization parameters
max_words = 10000        # Maximum number of words to consider
max_len = 100            # Maximum sequence length
embedding_dim = 100      # Dimensionality of GloVe embeddings

# pandas reads empty CSV fields back as NaN (a float), which is not text the
# tokenizer can split; normalise every entry to a string first.
train_texts = train_df['clean_tweet'].fillna('').astype(str)
test_texts = test_df['clean_tweet'].fillna('').astype(str)

# Convert training text to sequences and pad them
sequences_train = tokenizer.texts_to_sequences(train_texts)
X_train = pad_sequences(sequences_train, maxlen=max_len)
y_train = train_df['class'].values

# Convert test text to sequences and pad them
sequences_test = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(sequences_test, maxlen=max_len)

# Determine vocabulary size
num_words = min(max_words, len(tokenizer.word_index) + 1)

# Pretrained embedding weights; fail early with a clear message if the file
# does not match the embedding width configured above.
embedding_matrix = np.load('embedding_matrix.npy')
if embedding_matrix.ndim != 2 or embedding_matrix.shape[1] != embedding_dim:
    raise ValueError(
        f"embedding_matrix has shape {embedding_matrix.shape}; "
        f"expected (vocab_size, {embedding_dim})"
    )

print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training data shape: (19826, 100)
Test data shape: (4957, 100)


# Model Building Functions

## LSTM Model – Building, Training, and Saving

In [None]:

# Size the embedding layer from the pretrained matrix itself instead of
# re-hard-coding 10000/100 here: the Embedding layer is initialised with
# weights=[embedding_matrix], so its (input_dim, output_dim) must equal the
# matrix shape exactly or weight loading fails.
num_words, embedding_dim = embedding_matrix.shape
max_len = 100  # padded sequence length; matches pad_sequences(maxlen=100) used for X_train/X_test
def create_finetune_lstm_model(units=128, dropout=0.1, learning_rate=1e-3, num_classes=3):
    """Build and compile an Embedding -> LSTM -> softmax classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and left trainable so the pretrained vectors are
    fine-tuned together with the rest of the network.

    Args:
        units: Number of LSTM units.
        dropout: Rate used for both input dropout and recurrent dropout.
        learning_rate: Adam learning rate (1e-3 is Adam's default).
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled, not-yet-built ``Sequential`` model. Labels are expected as
        integer class ids because the loss is sparse categorical cross-entropy.
    """
    model = Sequential()
    # `input_length` is intentionally not passed: it is deprecated in Keras 3.
    # The sequence length is fixed when the caller builds the model with
    # input_shape=(None, max_len) or on the first call to fit().
    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        trainable=True))
    model.add(LSTM(units, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(num_classes, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# Instantiate the LSTM classifier, fix its input length and show the layer table.
model_ft = create_finetune_lstm_model()
model_ft.build(input_shape=(None, max_len))
model_ft.summary()

# Stop once val_loss has not improved for 3 epochs (keeping the best weights),
# and halve the learning rate after 2 epochs without improvement.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

# Train on X_train, with 20% of it set aside as validation data.
history_ft = model_ft.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)



Epoch 1/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 141ms/step - accuracy: 0.7762 - loss: 0.5778 - val_accuracy: 0.8901 - val_loss: 0.3120 - learning_rate: 0.0010
Epoch 2/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 311ms/step - accuracy: 0.8858 - loss: 0.3054 - val_accuracy: 0.9027 - val_loss: 0.2733 - learning_rate: 0.0010
Epoch 3/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 199ms/step - accuracy: 0.9119 - loss: 0.2482 - val_accuracy: 0.9060 - val_loss: 0.2763 - learning_rate: 0.0010
Epoch 4/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 139ms/step - accuracy: 0.9273 - loss: 0.1946 - val_accuracy: 0.9044 - val_loss: 0.2638 - learning_rate: 0.0010
Epoch 5/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 138ms/step - accuracy: 0.9401 - loss: 0.1717 - val_accuracy: 0.9027 - val_loss: 0.2800 - learning_rate: 0.0010
Epoch 6/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0

Save the model

In [None]:
# Persist the trained LSTM in the native Keras format for the evaluation cell.
model_ft.save(filepath='offensive_detection_finetune.keras')
print("Fine-tuned LSTM model trained and saved")

# GRU Model – Building, Training, and Saving

In [None]:
def create_finetune_gru_model(units=128, dropout=0.1, learning_rate=1e-3, num_classes=3):
    """Build and compile an Embedding -> GRU -> softmax classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and left trainable so the pretrained vectors are
    fine-tuned together with the rest of the network.

    Args:
        units: Number of GRU units.
        dropout: Rate used for both input dropout and recurrent dropout.
        learning_rate: Adam learning rate.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled, not-yet-built ``Sequential`` model. Labels are expected as
        integer class ids because the loss is sparse categorical cross-entropy.
    """
    model = Sequential()
    # `input_length` is intentionally not passed: it is deprecated in Keras 3.
    # The sequence length is fixed when the caller builds the model with
    # input_shape=(None, max_len) or on the first call to fit().
    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        trainable=True))
    model.add(GRU(units, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(num_classes, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# Instantiate the GRU classifier, fix its input length and show the layer table.
model_ft_gru = create_finetune_gru_model()
model_ft_gru.build(input_shape=(None, max_len))
model_ft_gru.summary()

# Stop once val_loss has not improved for 3 epochs (keeping the best weights),
# and halve the learning rate after 2 epochs without improvement.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

# Train on X_train, with 20% of it set aside as validation data.
history_ft_gru = model_ft_gru.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Persist the trained GRU for the evaluation cell.
model_ft_gru.save(filepath='offensive_detection_finetune_gru.keras')
print("Fine-tuned GRU model trained and saved")


Epoch 1/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 109ms/step - accuracy: 0.7880 - loss: 0.5633 - val_accuracy: 0.8886 - val_loss: 0.3055 - learning_rate: 0.0010
Epoch 2/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 113ms/step - accuracy: 0.8856 - loss: 0.2987 - val_accuracy: 0.9012 - val_loss: 0.2677 - learning_rate: 0.0010
Epoch 3/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 113ms/step - accuracy: 0.9107 - loss: 0.2479 - val_accuracy: 0.9049 - val_loss: 0.2619 - learning_rate: 0.0010
Epoch 4/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 107ms/step - accuracy: 0.9291 - loss: 0.1996 - val_accuracy: 0.9082 - val_loss: 0.2659 - learning_rate: 0.0010
Epoch 5/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.9376 - loss: 0.1755
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

## Simple RNN Model – Building, Training, and Saving

In [None]:
def create_finetune_rnn_model(units=128, dropout=0.1, learning_rate=1e-3, num_classes=3):
    """Build and compile an Embedding -> SimpleRNN -> softmax classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and left trainable so the pretrained vectors are
    fine-tuned together with the rest of the network.

    Args:
        units: Number of SimpleRNN units.
        dropout: Rate used for both input dropout and recurrent dropout.
        learning_rate: Adam learning rate.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled, not-yet-built ``Sequential`` model. Labels are expected as
        integer class ids because the loss is sparse categorical cross-entropy.
    """
    model = Sequential()
    # `input_length` is intentionally not passed: it is deprecated in Keras 3.
    # The sequence length is fixed when the caller builds the model with
    # input_shape=(None, max_len) or on the first call to fit().
    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        trainable=True))
    model.add(SimpleRNN(units, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(num_classes, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# Instantiate the SimpleRNN classifier, fix its input length and show the layer table.
model_ft_rnn = create_finetune_rnn_model()
model_ft_rnn.build(input_shape=(None, max_len))
model_ft_rnn.summary()

# Stop once val_loss has not improved for 3 epochs (keeping the best weights),
# and halve the learning rate after 2 epochs without improvement.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

# This run uses a smaller batch (32) and a 15% validation split, unlike the
# other architectures in this notebook (128 / 20%).
history_ft_rnn = model_ft_rnn.fit(
    x=X_train,
    y=y_train,
    validation_split=0.15,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Persist the trained SimpleRNN for the evaluation cell.
model_ft_rnn.save(filepath='offensive_detection_finetune_rnn.keras')
print("Fine-tuned Simple RNN model trained and saved")


Epoch 1/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.7907 - loss: 0.5631 - val_accuracy: 0.8796 - val_loss: 0.3278 - learning_rate: 0.0010
Epoch 2/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.8622 - loss: 0.3803 - val_accuracy: 0.8924 - val_loss: 0.3222 - learning_rate: 0.0010
Epoch 3/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - accuracy: 0.9007 - loss: 0.2930 - val_accuracy: 0.9035 - val_loss: 0.2928 - learning_rate: 0.0010
Epoch 4/10
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9021 - loss: 0.2741 - val_accuracy: 0.9048 - val_loss: 0.3025 - learning_rate: 0.0010
Epoch 5/10
[1m526/527[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.9208 - loss: 0.2286
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

## 1D CNN Model – Building, Training, and Saving

In [None]:
def create_finetune_cnn_model(filters=128, kernel_size=5, pool_size=4,
                              dense_units=128, dropout=0.3,
                              learning_rate=1e-5, num_classes=3):
    """Build and compile an Embedding -> Conv1D -> MaxPool -> Dense classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and left trainable so the pretrained vectors are
    fine-tuned together with the rest of the network.

    Args:
        filters: Number of Conv1D filters.
        kernel_size: Width of the convolution window, in tokens.
        pool_size: Window size of the max-pooling layer.
        dense_units: Width of the hidden dense layer.
        dropout: Dropout rate applied after the hidden dense layer.
        learning_rate: Adam learning rate.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled, not-yet-built ``Sequential`` model. Labels are expected as
        integer class ids because the loss is sparse categorical cross-entropy.
    """
    model = Sequential()
    # `input_length` is intentionally not passed: it is deprecated in Keras 3.
    # Flatten needs a known sequence length, which the caller supplies by
    # building the model with input_shape=(None, max_len).
    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        trainable=True))
    model.add(Conv1D(filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(num_classes, activation='softmax'))

    # NOTE(review): the 1e-5 default is 100x lower than the 1e-3 used by the
    # recurrent builders in this notebook, and the recorded training log shows
    # val_accuracy stuck at 0.7844 for the first five epochs. Consider passing
    # a larger learning_rate; the default is kept to preserve existing results.
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# Instantiate the 1D-CNN classifier, fix its input length and show the layer table.
model_ft_cnn = create_finetune_cnn_model()
model_ft_cnn.build(input_shape=(None, max_len))
model_ft_cnn.summary()

# Stop once val_loss has not improved for 3 epochs (keeping the best weights),
# and halve the learning rate after 2 epochs without improvement.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

# Train on X_train, with 20% of it set aside as validation data.
history_ft_cnn = model_ft_cnn.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Persist the trained CNN for the evaluation cell.
model_ft_cnn.save(filepath='offensive_detection_finetune_cnn.keras')
print("Fine-tuned 1D CNN model trained and saved")


Epoch 1/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.7445 - loss: 0.9012 - val_accuracy: 0.7844 - val_loss: 0.7170 - learning_rate: 1.0000e-05
Epoch 2/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.7719 - loss: 0.7220 - val_accuracy: 0.7844 - val_loss: 0.6500 - learning_rate: 1.0000e-05
Epoch 3/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.7758 - loss: 0.6674 - val_accuracy: 0.7844 - val_loss: 0.6239 - learning_rate: 1.0000e-05
Epoch 4/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.7660 - loss: 0.6584 - val_accuracy: 0.7844 - val_loss: 0.6068 - learning_rate: 1.0000e-05
Epoch 5/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.7692 - loss: 0.6420 - val_accuracy: 0.7844 - val_loss: 0.5934 - learning_rate: 1.0000e-05
Epoch 6/10
[1m124/124[0m [32m━━━━━━━━━━━━━

# Evaluation and Comparison

In [None]:
# Hold out the TAIL of the training arrays for evaluation instead of a random
# train_test_split over X_train.
#
# Every training cell in this notebook fits on X_train with validation_split
# set to 0.2 or 0.15, and Keras takes validation samples from the end of the
# arrays (before shuffling). The last 15% of rows were therefore never used
# for weight updates by any of the four models. A random split over X_train
# would mostly draw rows the models were already trained on and report
# inflated scores.
#
# Caveat: these rows still drove early stopping / LR scheduling, so the scores
# are validation scores, not a pristine test estimate.
EVAL_HOLDOUT_FRACTION = 0.15  # must not exceed the smallest validation_split used for training
_split_at = int(len(X_train) * (1.0 - EVAL_HOLDOUT_FRACTION))  # same boundary rule Keras applies
X_tr, X_val = X_train[:_split_at], X_train[_split_at:]
y_tr, y_val = y_train[:_split_at], y_train[_split_at:]

# Per-model metric summaries, keyed by model name; filled by evaluate_model().
results = {}

def evaluate_model(model, name, fine_tune_epochs=5):
    """Optionally continue training `model`, then score it on the hold-out set.

    The model is fitted on the module-level ``(X_tr, y_tr)`` for
    ``fine_tune_epochs`` epochs (this mutates `model` in place), after which
    predictions on ``X_val`` are compared with ``y_val``. Weighted-average
    precision/recall/F1 and overall accuracy are stored in the module-level
    ``results`` dict under `name`, and the full per-class report is printed.

    The scores are only meaningful if ``X_val`` contains rows the model has
    not been trained on; that is the caller's responsibility.

    Args:
        model: A compiled Keras model.
        name: Key under which the metrics are stored in ``results``.
        fine_tune_epochs: Extra training epochs on ``(X_tr, y_tr)`` before
            scoring. Pass 0 to evaluate the model exactly as given.

    Returns:
        The metrics dict that was stored in ``results[name]``.
    """
    if fine_tune_epochs > 0:
        # Extra training happens on the training part (X_tr); X_val is only
        # used here to compute validation metrics during fit.
        model.fit(X_tr, y_tr, batch_size=64, epochs=fine_tune_epochs,
                  validation_data=(X_val, y_val), verbose=0)

    y_pred = model.predict(X_val).argmax(axis=1)

    # zero_division=0 reports 0.0 for classes the model never predicts, which
    # is the same value sklearn falls back to by default, but without emitting
    # an UndefinedMetricWarning for every affected metric.
    report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
    metrics = {
        'accuracy': report['accuracy'],
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1_score': report['weighted avg']['f1-score']
    }
    results[name] = metrics

    print(f"{name} Evaluation:")
    print(classification_report(y_val, y_pred, zero_division=0))
    print("\n")
    return metrics


# Reload each saved model from disk and score it. Load/evaluate pairs are kept
# in the original order because evaluate_model() also trains the model.

# LSTM
loaded_lstm = load_model('offensive_detection_finetune.keras')
evaluate_model(model=loaded_lstm, name='LSTM')

# GRU
loaded_gru = load_model('offensive_detection_finetune_gru.keras')
evaluate_model(model=loaded_gru, name='GRU')

# Simple RNN
loaded_rnn = load_model('offensive_detection_finetune_rnn.keras')
evaluate_model(model=loaded_rnn, name='RNN')

# 1D CNN
loaded_cnn = load_model('offensive_detection_finetune_cnn.keras')
evaluate_model(model=loaded_cnn, name='CNN')


# One summary line per evaluated model, in insertion order of `results`.
print("Model Performance :")
for model_name, metrics in results.items():
    summary_line = (
        f"{model_name}: Accuracy={metrics['accuracy']:.4f}, "
        f"Precision={metrics['precision']:.4f}, "
        f"Recall={metrics['recall']:.4f}, "
        f"F1 Score={metrics['f1_score']:.4f}"
    )
    print(summary_line)


[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
LSTM Evaluation:
              precision    recall  f1-score   support

           0       0.64      0.57      0.60       119
           1       0.95      0.97      0.96      1577
           2       0.93      0.90      0.92       287

    accuracy                           0.93      1983
   macro avg       0.84      0.81      0.83      1983
weighted avg       0.93      0.93      0.93      1983



[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
GRU Evaluation:
              precision    recall  f1-score   support

           0       0.72      0.47      0.57       119
           1       0.95      0.97      0.96      1577
           2       0.88      0.92      0.90       287

    accuracy                           0.93      1983
   macro avg       0.85      0.79      0.81      1983
weighted avg       0.93      0.93      0.93      1983



[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Submission

In [None]:

# Pick the model with the highest weighted F1 recorded in `results`.
if not results:
    raise RuntimeError(
        "No evaluation results found; run the evaluation cell before building a submission."
    )

best_model_name = max(results, key=lambda x: results[x]['f1_score'])
print("Best model based on F1 Score:", best_model_name)

# Map the winning name back to its loaded model object.
if best_model_name == 'LSTM':
    best_model = loaded_lstm
elif best_model_name == 'GRU':
    best_model = loaded_gru
elif best_model_name == 'RNN':
    best_model = loaded_rnn
elif best_model_name == 'CNN':
    best_model = loaded_cnn
else:
    # Fail here rather than printing and continuing: otherwise `best_model`
    # would be unbound (NameError below) or, worse, silently left over from a
    # previous run of this cell.
    raise ValueError(f"Unexpected model name: {best_model_name!r}")


# Predict class probabilities for the test set and take the most likely class.
test_predictions = best_model.predict(X_test)
predicted_labels = test_predictions.argmax(axis=1)


# One row per test sample: its row index in test_df and the predicted class id.
submission = pd.DataFrame({
    'id': test_df.index,
    'sentiment': predicted_labels
})


submission.to_csv('submission2.csv', index=False)
print("Submission file saved as 'submission2.csv'.")


Best model based on F1 Score: LSTM
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
Submission file saved as 'submission2.csv'.
