In [1]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, TransformerMixin
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib as plt
import matplotlib.pyplot as plt
import plotly.express as px
from collections import Counter
import emoji
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score


In [2]:
df = pd.read_csv("Hate Speech.tsv", sep= "\t", index_col='id')
df.head(100)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is so...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation
...,...,...
96,0,@user i'll always hope that one day i'll get t...
97,0,#model i love u take with u all the time in ...
98,0,couple having sex fat naked japanese girls
99,0,#hump on that #hump day #humpersð© @ edwa...


In [3]:
X=df.drop("label",axis=1)
y=df.label

In [4]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)

In [5]:
class TextCleaningTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column='tweet', download_nltk=True, return_tokens=True,join_tokens=True):
        """
        A custom transformer for cleaning text data

        Parameters:
        -----------
        text_column : str, default='tweet'
            The name of the column containing text to clean
        download_nltk : bool, default=True
            Whether to download NLTK resources
        return_tokens : bool, default=True
            Whether to return tokenized text or just cleaned text
        """
        self.text_column = text_column
        self.download_nltk = download_nltk
        self.return_tokens = return_tokens
        self.join_tokens = join_tokens

        # Download NLTK resources if needed
        if self.download_nltk:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)

        self.stop_words = set(stopwords.words('english'))

    def fix_encoding(self, text):
        """Fix potential encoding issues"""
        if not isinstance(text, str):
            return ""
        try:
            return text.encode('latin1').decode('utf-8')
        except Exception:
            return text

    def clean_text(self, text):
        """Clean text by removing unwanted elements"""
        if not isinstance(text, str):
            return ""

        # Fix encoding
        text = self.fix_encoding(text)
        # Lowercase
        text = text.lower()
        # Remove @user
        text = re.sub(r'@[\w_]+', '', text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        # Remove hashtags (keep word)
        text = re.sub(r'#', '', text)
        # Convert emojis to text
        text = emoji.demojize(text, language='en')
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove special characters except colons in emojis
        text = re.sub(r'[^\w\s:]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize_and_remove_stopwords(self, text):
        """Tokenize text and remove stopwords"""
        tokens = word_tokenize(text)
        filtered_tokens = [word for word in tokens if word not in self.stop_words]
        return filtered_tokens

    def fit(self, X, y=None):
        """Fit method (does nothing but required by sklearn API)"""
        return self

    def transform(self, X):
        """Transform the input data by cleaning the text"""
        X_transformed = X.copy()

        # Step 1: Fix encoding
        X_transformed['clean_text'] = X_transformed[self.text_column].apply(self.fix_encoding)

        # Step 2: Clean the text
        X_transformed['clean_text'] = X_transformed['clean_text'].apply(self.clean_text)

        # Step 3: Tokenize and remove stopwords if requested
        if self.return_tokens:
            X_transformed['tokens'] = X_transformed['clean_text'].apply(self.tokenize_and_remove_stopwords)
        if self.join_tokens:
            X_transformed['final'] = X_transformed['tokens'].apply(lambda tokens: ' '.join(tokens))

        return X_transformed

    def fit_transform(self, X, y=None):
        """Combine fit and transform methods"""
        return self.fit(X).transform(X)

In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
text_cleaner = TextCleaningTransformer(text_column='tweet', download_nltk=True, return_tokens=True)

# Apply fit_transform on x_train
x_train_cleaned = text_cleaner.fit_transform(x_train)

# Apply only transform on x_test (using the fitted transformer)
x_test_cleaned = text_cleaner.transform(x_test)

In [8]:
df_train=x_train_cleaned["final"]
df_test=x_test_cleaned["final"]

In [9]:

tokenizer = Tokenizer(num_words = 50000, split = ' ', lower = True, oov_token = 'UNK')
tokenizer.fit_on_texts(df_train)
train = tokenizer.texts_to_sequences(df_train)
print(train[0])
x = pad_sequences(train)
print(x[0])
print(x.shape[1])

[122, 1288, 10, 962, 3146, 13165, 2107, 815, 815]
[   0    0    0 ... 2107  815  815]
1316


In [10]:

train_padded_sequences = pad_sequences(
    train,
    maxlen=50,
    padding='post',
    truncating='post'
)

print(train_padded_sequences.shape)
print(train_padded_sequences[0])

(25228, 50)
[  122  1288    10   962  3146 13165  2107   815   815     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [11]:
test= tokenizer.texts_to_sequences(df_test)

In [12]:
test_padded_sequences = pad_sequences(test, maxlen=50, padding='post', truncating='post')

In [13]:
gru_bi = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(50000, 16),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [14]:


early_stop = EarlyStopping(
    monitor='val_loss',     # you can also use 'val_accuracy'
    patience=8,             # stop if no improvement for 5 epochs
    restore_best_weights=True,  # restores best model (not just last)
    verbose=1
)


In [15]:
gru_bi.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(0.001),
              metrics=['accuracy'])

In [None]:
y_test=y_test.to_numpy()
y_train=y_train.to_numpy()

In [16]:
history = gru_bi.fit(train_padded_sequences, y_train, epochs=30, batch_size=32,
                    validation_data=(test_padded_sequences, y_test),callbacks=[early_stop])


Epoch 1/30
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 37ms/step - accuracy: 0.9320 - loss: 0.2483 - val_accuracy: 0.9573 - val_loss: 0.1288
Epoch 2/30
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 37ms/step - accuracy: 0.9820 - loss: 0.0581 - val_accuracy: 0.9581 - val_loss: 0.1273
Epoch 3/30
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.9938 - loss: 0.0215 - val_accuracy: 0.9474 - val_loss: 0.1927
Epoch 4/30
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 37ms/step - accuracy: 0.9966 - loss: 0.0118 - val_accuracy: 0.9317 - val_loss: 0.2659
Epoch 5/30
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.9985 - loss: 0.0066 - val_accuracy: 0.9475 - val_loss: 0.3102
Epoch 6/30
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 41ms/step - accuracy: 0.9990 - loss: 0.0042 - val_accuracy: 0.9445 - val_loss: 0.2299
Epoch 7/30
[1m7

In [17]:
p = gru_bi.predict(test_padded_sequences,verbose=1)
predicted = [int(round(x[0])) for x in p]


[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step


In [18]:
f1 = f1_score(y_test, predicted,average="macro")
print("F1 Score:", f1)


F1 Score: 0.8380695411460728


In [19]:
# Save the model
gru_bi.save("hate_speech_Gru_model.h5")

# Save tokenizer
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)




In [22]:
def predict_hate(text, threshold=0.2):
    """
    Predict if the given text is hate speech or not.

    Args:
        text (str): Input text.
        threshold (float): Probability threshold for classification.

    Returns:
        dict: Contains label and probability.
    """
    # Convert to sequence and pad
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=50, padding='post', truncating='post')

    # Predict
    prob = gru_bi.predict(padded)[0][0]
    label = "Hate Speech" if prob >= threshold else "Not Hate Speech"

    return {
        "label": label,
        "probability": float(round(prob, 4))
    }

# 💡 Example usage:
print(predict_hate("You people are nothing but a plague to this country. Always whining and ruining everything. Go back to where you came from!"))
print(predict_hate("Hope you have a great day!"))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
{'label': 'Hate Speech', 'probability': 0.6592000126838684}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
{'label': 'Not Hate Speech', 'probability': 0.19470000267028809}
