**HATE SPEECH DETECTION USING LONG SHORT TERM MEMORY (LSTM)**

In [1]:
import pandas as pd
import spacy
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras


In [2]:
df = pd.read_csv('/content/dataset.csv')

In [3]:
df.shape

(24783, 7)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
df.columns

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

In [6]:
# dropping unwanted columns
df.drop(columns = ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],
       inplace = True)

In [7]:
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [8]:
# checking for null values
df.isna().sum()

Unnamed: 0,0
class,0
tweet,0


In [9]:
df['tweet'].iloc[0]

"!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."

In [10]:
df['tweet'].iloc[100]

'"@ClicquotSuave: LMAOOOOOOOOOOO this nigga @Krillz_Nuh_Care http://t.co/AAnpSUjmYI" &lt;bitch want likes for some depressing shit..foh'

In [11]:
df['tweet'].iloc[1000]

'&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;"@betysweetcocker: That pussy is just....&#128561; imma assume she just had a baby like..the day before"'

In [12]:
# deleting unwanted symbols and numeric data
df['processed_tweet'] = df['tweet'].str.replace(r'[^a-zA-Z]',' ',regex=True)

In [13]:
df.head()

Unnamed: 0,class,tweet,processed_tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldn t...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew boy dats cold tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT C G Anderson viva based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you...


In [14]:
df['processed_tweet'].iloc[1000]

'                                                                 betysweetcocker  That pussy is just              imma assume she just had a baby like  the day before '

In [15]:
# handling unwanted space
df['processed_tweet_2'] = df['processed_tweet'].str.replace(r'\s+', ' ', regex=True)

In [16]:
df['processed_tweet_2'].iloc[1000]

' betysweetcocker That pussy is just imma assume she just had a baby like the day before '

In [17]:
# deleting unwanted columns
df.drop(columns=['tweet', 'processed_tweet'], inplace=True)


In [18]:
df.head()

Unnamed: 0,class,processed_tweet_2
0,2,RT mayasolovely As a woman you shouldn t comp...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...
3,1,RT C G Anderson viva based she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...


In [19]:
# NLP
nlp = spacy.load('en_core_web_sm')

In [20]:
# lemmatization
def lemmatization(text):
    doc = nlp(text)
    lemmaList = (word.lemma_ for word in doc)
    return ' '.join(lemmaList)

In [21]:
df['lemma_tweet'] = df['processed_tweet_2'].apply(lemmatization)

In [22]:
df.head()

Unnamed: 0,class,processed_tweet_2,lemma_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely as a woman you shouldn t com...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life you ever...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva base she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts the shit you hear about I ...


In [23]:
df['processed_tweet_2'].iloc[67]

' Allyhaaaaa Lemmie eat a Oreo amp do these dishes One oreo Lol'

In [24]:
df['lemma_tweet'].iloc[67]

'  Allyhaaaaa Lemmie eat a Oreo amp do these dish one oreo lol'

In [25]:
# removing stop words
def remove_stopwords(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return ' '.join(tokens)

In [26]:
df['final_tweet'] = df['lemma_tweet'].apply(remove_stopwords)

In [27]:
df['final_tweet'].iloc[67]

'   Allyhaaaaa Lemmie eat Oreo amp dish oreo lol'

In [28]:
df.head()

Unnamed: 0,class,processed_tweet_2,lemma_tweet,final_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely as a woman you shouldn t com...,RT mayasolovely woman shouldn t complain cl...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad cuffin d...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life you ever...,RT UrKindOfBrand Dawg RT sbaby life fuck bi...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva base she look like a tr...,RT C G Anderson viva base look like tranny
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts the shit you hear about I ...,RT ShenikaRoberts shit hear true faker bitc...


In [29]:
# one hot representation
vocab_size = 10000
one_hot_rep = [one_hot(words, vocab_size) for words in df['final_tweet']]

In [30]:
df['final_tweet'].iloc[0]

'   RT mayasolovely woman shouldn t complain clean house amp man trash'

In [31]:
one_hot_rep[0]

[7568, 2882, 2886, 4738, 5614, 7861, 7174, 225, 7946, 806, 5904]

In [32]:
for i in range(0, 4):
    print(df['final_tweet'].iloc[i])

   RT mayasolovely woman shouldn t complain clean house amp man trash
   RT mleew boy dat cold tyga dwn bad cuffin dat hoe st place
   RT UrKindOfBrand Dawg RT sbaby life fuck bitch start cry confuse shit
   RT C G Anderson viva base look like tranny


In [33]:
for i in range(0, 4):
   print(one_hot_rep[i])

[7568, 2882, 2886, 4738, 5614, 7861, 7174, 225, 7946, 806, 5904]
[7568, 2003, 8087, 7121, 9809, 1086, 1508, 5935, 6576, 7121, 2227, 76, 2622]
[7568, 5223, 6694, 7568, 2105, 1876, 8130, 5823, 7239, 4516, 495, 4474]
[7568, 1436, 9774, 6502, 59, 8689, 6794, 6510, 3365]


In [34]:
sentence_length = 20
embedded_tweet = pad_sequences(one_hot_rep, padding='pre', maxlen=sentence_length)

In [35]:
for i in range(0, 4):
   print(embedded_tweet[i])

[   0    0    0    0    0    0    0    0    0 7568 2882 2886 4738 5614
 7861 7174  225 7946  806 5904]
[   0    0    0    0    0    0    0 7568 2003 8087 7121 9809 1086 1508
 5935 6576 7121 2227   76 2622]
[   0    0    0    0    0    0    0    0 7568 5223 6694 7568 2105 1876
 8130 5823 7239 4516  495 4474]
[   0    0    0    0    0    0    0    0    0    0    0 7568 1436 9774
 6502   59 8689 6794 6510 3365]


In [36]:
X = np.array(embedded_tweet)
y = np.array(df['class'])

In [37]:
'''0 - Negative speech
   1 - offensive speech
   2 - neither '''
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,19190
2,4163
0,1430


In [38]:
smote = SMOTE(sampling_strategy='minority')
X, y = smote.fit_resample(X, y)

In [39]:
# train - test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
X.shape, X_train.shape, X_test.shape

((42543, 20), (34034, 20), (8509, 20))

In [41]:
# Creating model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Use the vocab_size you used during one-hot encoding
vocab_size = 10000  # This should match the vocab_size used for one_hot_rep

embedding_dim = 128
max_length = sentence_length # Use sentence_length from previous code (20)
lstm_units = 64

# Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(lstm_units, return_sequences=True),
    LSTM(lstm_units, return_sequences=True),
    LSTM(lstm_units),
    Dense(3, activation='softmax') # Changed to 3 outputs with softmax for 3 classes
])

# Force model building by specifying input shape
model.build(input_shape=(None, max_length))

# Print model summary
model.summary()





In [42]:
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [43]:
model.fit(X_train, y_train, epochs=10, batch_size=32 )

Epoch 1/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.7940 - loss: 0.4719
Epoch 2/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.9502 - loss: 0.1581
Epoch 3/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.9659 - loss: 0.1088
Epoch 4/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - accuracy: 0.9785 - loss: 0.0753
Epoch 5/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9864 - loss: 0.0457
Epoch 6/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9895 - loss: 0.0311
Epoch 7/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.9945 - loss: 0.0187
Epoch 8/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.9963 - loss: 0.0122
Epoch 9/10
[1m1

<keras.src.callbacks.history.History at 0x7d492da18b90>

In [44]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Model Accuracy : {accuracy * 100}')

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9008 - loss: 0.5408
Model Accuracy : 89.57574367523193


In [45]:
pred = np.argmax(model.predict(X_test),axis = -1)

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [46]:
y_test[:15]

array([1, 1, 0, 1, 2, 0, 0, 1, 0, 1, 2, 0, 1, 1, 0])

In [47]:
pred[:15]

array([1, 1, 0, 1, 2, 0, 0, 1, 0, 1, 2, 0, 1, 1, 0])