#### Hate Speech Detection and Classification using Bidirectional LSTM and Bidirectional GRU

In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import re
import plotly.express as px

In [2]:
file_path = '/content/drive/MyDrive/hate_speech_data.csv'
data = pd.read_csv(file_path)

**Exploring the Dataset:**

In [3]:
data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [5]:
data.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [6]:
# Selecting only two columns for further process:
data = data[['class', 'tweet']]

In [7]:
data

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


In [8]:
# Extracting text and class columns
text = data['tweet'].tolist()
clas = data['class'].tolist()
df = pd.DataFrame({'tweet': text, 'class': clas})

**0 - hate speech | 1 - offensive language | 2 - neither**

In [9]:
# Display unique values in the "class" column
unique_classes = data['class'].unique()
print("Unique values in the 'class' column:")
print(unique_classes)

Unique values in the 'class' column:
[2 1 0]


In [11]:
# Display unique values and their counts in the "class" column
class_counts = data['class'].value_counts()
print("Class Label Descriptions:")
print(class_counts)

Class Label Descriptions:
1    19190
2     4163
0     1430
Name: class, dtype: int64


In [12]:
# Checking for Null Values
print(data.isnull().sum())

class    0
tweet    0
dtype: int64


In [13]:
# Convert the 'tweet' column to lowercase
data['tweet'] = data['tweet'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tweet'] = data['tweet'].str.lower()


In [14]:
data

Unnamed: 0,class,tweet
0,2,!!! rt @mayasolovely: as a woman you shouldn't...
1,1,!!!!! rt @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...
3,1,!!!!!!!!! rt @c_g_anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! rt @shenikaroberts: the shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@lifeasking: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like i ain...
24781,1,youu got wild bitches tellin you lies


In [15]:
import string
# Remove punctuation from the 'tweet' column
punctuation_signs = string.punctuation
data['tweet'] = data['tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation_signs)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tweet'] = data['tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation_signs)))


In [16]:
data

Unnamed: 0,class,tweet
0,2,rt mayasolovely as a woman you shouldnt compl...
1,1,rt mleew17 boy dats coldtyga dwn bad for cuff...
2,1,rt urkindofbrand dawg rt 80sbaby4life you eve...
3,1,rt cganderson vivabased she look like a tranny
4,1,rt shenikaroberts the shit you hear about me ...
...,...,...
24778,1,yous a muthafin lie 8220lifeasking 20pearls co...
24779,2,youve gone and broke the wrong heart baby and ...
24780,1,young buck wanna eat dat nigguh like i aint fu...
24781,1,youu got wild bitches tellin you lies


In [17]:
# Define a function to clean text
def clean_text(text):
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('"', '').replace("'", '')
    text = text.replace("'s", "")
    return text

In [18]:
# Apply the cleaning function to the 'tweet' column
data['tweet'] = data['tweet'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tweet'] = data['tweet'].apply(clean_text)


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from joblib import dump
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from nltk.corpus import stopwords

In [20]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
# Remove stopwords from the 'tweet' column
stop_words = set(stopwords.words('english'))
for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    data['tweet'] = data['tweet'].str.replace(regex_stopword, '')

  data['tweet'] = data['tweet'].str.replace(regex_stopword, '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tweet'] = data['tweet'].str.replace(regex_stopword, '')


In [22]:
# Using Bag of Words approach for final data Preparation
cv = CountVectorizer(max_features=75)
X = cv.fit_transform(df['tweet']).toarray()
y = data['class']

In [23]:
# Splitting the Data using Stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [24]:
# Train the Decision Tree Classifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Classifier Accuracy: {accuracy:.2f}")

Decision Tree Classifier Accuracy: 0.79


In [25]:
# Train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Classifier Accuracy: {:.2f}".format(accuracy))

Random Forest Classifier Accuracy: 0.83


In [26]:
# Train the AdaBoost Classifier
clf = AdaBoostClassifier(n_estimators=100).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"AdaBoost Classifier Accuracy: {accuracy:.2f}")

AdaBoost Classifier Accuracy: 0.84


In [27]:
# Convert labels to categorical format
y_train = to_categorical(y_train, num_classes=3, dtype='float32')
y_test = to_categorical(y_test, num_classes=3, dtype='float32')

**Train Bidirectional LSTM:**

In [28]:
from tensorflow.keras.layers import Bidirectional

# Create and compile the Bidirectional LSTM model
model_bidirectional = Sequential([
    Embedding(232337, 100, input_length=X_train.shape[1]),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(20, dropout=0.2, recurrent_dropout=0.2)),
    Dense(3, activation='softmax')
])

In [29]:
model_bidirectional.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the Bidirectional LSTM model
history_bidirectional_lstm = model_bidirectional.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=25, batch_size=64)

# Save the Bidirectional LSTM model
model_bidirectional.save('bidirectional_lstm.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


  saving_api.save_model(


In [35]:
# Extracting training and validation accuracy, loss from the history
acc = history_bidirectional_lstm.history['accuracy']
val_acc = history_bidirectional_lstm.history['val_accuracy']
loss = history_bidirectional_lstm.history['loss']
val_loss = history_bidirectional_lstm.history['val_loss']
epochs = range(1, len(acc) + 1)

In [36]:
# Convert the range object to a list for Plotly Scatter plot
epochs_list = list(epochs)

# Create a Plotly subplot for accuracy
fig = go.Figure()

# Plotting Training and Validation Accuracy
fig.add_trace(go.Scatter(x=epochs_list, y=acc, mode='markers', name='Training accuracy'))
fig.add_trace(go.Scatter(x=epochs_list, y=val_acc, mode='lines', name='Validation accuracy'))

fig.update_layout(
    title='Training and Validation Accuracy',
    xaxis=dict(title='Epochs'),
    yaxis=dict(title='Accuracy'),
    grid=dict(),
    legend=dict(x=0, y=1, traceorder='normal'),
)

# Create a Plotly subplot for loss
fig2 = go.Figure()

# Plotting Training and Validation Loss
fig2.add_trace(go.Scatter(x=epochs_list, y=loss, mode='markers', name='Training loss'))
fig2.add_trace(go.Scatter(x=epochs_list, y=val_loss, mode='lines', name='Validation loss'))

fig2.update_layout(
    title='Training and Validation Loss',
    xaxis=dict(title='Epochs'),
    yaxis=dict(title='Loss'),
    grid=dict(),
    legend=dict(x=0, y=1, traceorder='normal'),
)
# Display the figures
fig.show()
fig2.show()