In [8]:
# import libraries
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight

In [9]:
VOCAB_SIZE = 88584
MAXLEN = 250
BATCH_SIZE = 64

In [10]:
train_file_path = "hate_train.csv"
test_file_path = "hate_test.csv"
validation_file_path = "hate_validation.csv"

In [11]:
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)
validation_data = pd.read_csv(validation_file_path)

In [12]:
train_data['text'] = train_data['text'].astype(str)
test_data['text'] = test_data['text'].astype(str)

In [14]:
train_text = train_data.text.values
test_text = test_data.text.values
train_labels = train_data.label.values
test_labels = test_data.label.values

In [15]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_text)

train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = pad_sequences(train_sequences, maxlen=MAXLEN)

test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_sequences, maxlen=MAXLEN)

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32, input_length=MAXLEN),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation="sigmoid")
])


# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),  # Fixed learning rate
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Print model summary to check architecture
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 32)           2834688   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              164864    
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3,016,065
Trainable params: 3,016,065
Non-trainable params: 0
______________________________________________

In [17]:
# Define callbacks for early stopping and model checkpointing
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,  # Increased patience
    restore_best_weights=True,
    verbose=1
)

# Optionally, define a learning rate scheduler (to decay the learning rate during training)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-3 * 0.9 ** epoch,  # Decreases the learning rate by 10% every epoch
    verbose=1
)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weight_dict = dict(enumerate(class_weights))

# Model training with the callbacks added
history = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    class_weight=class_weight_dict,  # Class weights added
    callbacks=[early_stopping]
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping


In [18]:
results = model.evaluate(test_padded, test_labels)
print(results)

[1.462143063545227, 0.4599326550960541]


In [19]:
# Function to predict whether a message is positive (0) or negative (1)
def predict_message(message):
    # Convert the message into a sequence of tokens
    pred_sequence = tokenizer.texts_to_sequences([message])
    
    # Pad the sequence to the same length as the model expects
    pred_padded = pad_sequences(pred_sequence, maxlen=MAXLEN)
    
    # Make the prediction using the model
    prediction = model.predict(pred_padded)  # The model outputs probabilities for class 1 (negative)
    
    # Extract the predicted probability for 'negative' (class 1)
    negative_probability = prediction[0][0]  # For a binary classifier, this will be a number between 0 and 1
    
    # Determine the label based on the threshold (usually 0.5)
    if negative_probability >= 0.5:  # If probability >= 0.5, classify as negative (1)
        label = "negative"  # Class 1
    else:
        label = "positive"  # Class 0
    
    # Return the probability and the corresponding label
    return [negative_probability, label]

# Example usage
text = "I love this product, it's amazing!"
result = predict_message(text)
print(result)  # Output will show the probability and the label ("positive" or "negative")


[0.22383575, 'positive']


In [20]:
# Function to test the model on the validation set
def test_predictions():
    # Extract the validation messages and labels
    validation_text = validation_data['text'].astype(str).values
    validation_labels = validation_data['label'].values
    
    # Initialize variables to track the results
    correct = 0
    total = len(validation_text)
    
    # Loop through the validation set
    for msg, actual_label in zip(validation_text, validation_labels):
        prediction = predict_message(msg)
        predicted_label = prediction[1]
        
        # Map actual label to "positive" (0) or "negative" (1)
        if actual_label == 0:
            actual_label_str = "positive"
        else:
            actual_label_str = "negative"
        
        # Compare the predicted label with the actual label
        if predicted_label == actual_label_str:
            correct += 1
    
    # Print the accuracy
    accuracy = correct / total
    print(f"Model accuracy on validation set: {accuracy * 100:.2f}%")

# Call this function to test your model
test_predictions()














Model accuracy on validation set: 68.00%
