# Hate Speech Detection Using LSTM in PyTorch

In [260]:
import pandas as pd
import numpy as np

In [261]:
df = pd.read_csv('data/labeled_data.csv')

df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [262]:
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer

In [263]:
text = []
clas = []

text = df['tweet'].tolist()
clas = df['class'].tolist()

In [264]:
df = pd.DataFrame({'tweet': text, 'class': clas})

In [265]:
print(df.isnull().sum())

tweet    0
class    0
dtype: int64


In [266]:
df['tweet'] = df['tweet'].apply(lambda x:x.lower())

In [267]:
punctuation_signs = list("?:!.,;")
df['tweet'] = df['tweet']

for punct_sign in punctuation_signs:
    df['tweet'] = df['tweet'].str.replace(punct_sign, '')

  df['tweet'] = df['tweet'].str.replace(punct_sign, '')


In [268]:
df['tweet'] = df['tweet'].apply(lambda x: x.replace('\n', ' '))
df['tweet'] = df['tweet'].apply(lambda x: x.replace('\t', ' '))
df['tweet'] = df['tweet'].str.replace("    ", " ")
df['tweet'] = df['tweet'].str.replace('"', '')
df['tweet'] = df['tweet'].str.replace("'s", "")

In [269]:
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))
for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df['tweet'] = df['tweet'].str.replace(regex_stopword, '')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df['tweet'] = df['tweet'].str.replace(regex_stopword, '')


In [270]:
cv = CountVectorizer(max_features = 75)
X = cv.fit_transform(df['tweet']).toarray()
y = df['class']

In [271]:
X, y

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 0        2
 1        1
 2        1
 3        1
 4        1
         ..
 24778    1
 24779    2
 24780    1
 24781    1
 24782    2
 Name: class, Length: 24783, dtype: int64)

In [272]:
import torch
from torch import nn
import torch.optim as optim
import matplotlib.pyplot as plt


print(torch.__version__)

# Setting up device agnostic
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

2.1.0+cu121
cuda


## Spliting Datasets

In [273]:
X.dtype, y.dtype

(dtype('int64'), dtype('int64'))

In [274]:
# Conversion from int64 to long
X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

X, y

(tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([2, 1, 1,  ..., 1, 1, 2]))

In [275]:
train_split = int(0.8* len(X))
X_train, y_train = X[:train_split], y[:train_split]
X_test, y_test = X[train_split:], y[train_split:]

len(X_train),len(y_train), len(X_test), len(y_test)

(19826, 19826, 4957, 4957)

In [276]:
# y_train = y_train.astype('float32')
# y_test = y_test.astype('float32')

y_train.dtype, y_test.dtype

(torch.int64, torch.int64)

## Neural Network Model

In [277]:
class LSTMModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
    super(LSTMModel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.dropout = nn.Dropout(0.2)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)
    lstm_out, _ = self.lstm(embedded)
    lstm_out = lstm_out[:, -1, :]
    fc_out = self.fc(lstm_out)
    output = self.softmax(fc_out)
    return output

In [278]:
vocab_size = 232337
embedding_dim = 100
hidden_dim = 20
output_dim = 3

model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model

LSTMModel(
  (embedding): Embedding(232337, 100)
  (lstm): LSTM(100, 20, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=20, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)

In [282]:
next(model.parameters()).device

device(type='cpu')

In [283]:
# Set model usage to CUDA
model.to(device)
next(model.parameters()).device

device(type='cuda', index=0)

In [289]:
# Loss function and optimizers
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [291]:
epochs = 1000
batch_size = 64

# Placing data on CUDA
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)

# Train loop
for epoch in range(epochs):
  model.train()
  y_pred = model(X_train)
  loss = loss_fn(y_pred, y_train)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Testing
  model.eval()
  with torch.inference_mode():
    test_pred = model(X_test)
    test_loss = loss_fn(test_pred, y_test)

  if epoch % 100 == 0 or epoch == epochs-1:
    print(f'Epoch: {epoch} | Train Loss: {loss:.4f} | Test Loss: {test_loss:.4f}')

Epoch: 0 | Train Loss: 0.7817 | Test Loss: 0.7591
Epoch: 100 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 200 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 300 | Train Loss: 0.7816 | Test Loss: 0.7590
Epoch: 400 | Train Loss: 0.7816 | Test Loss: 0.7590
Epoch: 500 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 600 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 700 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 800 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 900 | Train Loss: 0.7817 | Test Loss: 0.7590
Epoch: 999 | Train Loss: 0.7817 | Test Loss: 0.7590
