## Hate Speech and Offensive Language Detection

In [1]:
import pandas as pd
import numpy as np

In [2]:
#importing the dataset
df = pd.read_csv("hatespeech.csv")

In [3]:
df.head(10)

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [4]:
df.shape

(24783, 6)

In [5]:
df.columns

Index(['count', 'hate_speech_count', 'offensive_language_count',
       'neither_count', 'class', 'tweet'],
      dtype='object')

In [6]:
df=df[['tweet','class']]

In [7]:
df

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1
...,...,...
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1
24779,"you've gone and broke the wrong heart baby, an...",2
24780,young buck wanna eat!!.. dat nigguh like I ain...,1
24781,youu got wild bitches tellin you lies,1


### Data Pre-processing using NLTK Library

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KUMUDHAA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KUMUDHAA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KUMUDHAA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
import re
def preprocess(text):
  text = re.sub('[^a-zA-Z]', ' ',text)
  tokens = word_tokenize(text.lower())
  return " ".join(WordNetLemmatizer().lemmatize(word) for word in tokens if word not in stopwords.words('english'))

# Apply preprocessing
df['tweet'] = df["tweet"].apply(preprocess)

In [11]:
corpus = df['tweet'].tolist()

In [12]:
corpus[5]

'madison x shit blow claim faithful somebody still fucking hoe'

In [13]:
df

Unnamed: 0,tweet,class
0,rt mayasolovely woman complain cleaning house ...,2
1,rt mleew boy dat cold tyga dwn bad cuffin dat ...,1
2,rt urkindofbrand dawg rt sbaby life ever fuck ...,1
3,rt c g anderson viva based look like tranny,1
4,rt shenikaroberts shit hear might true might f...,1
...,...,...
24778,muthaf lie lifeasking pearl corey emanuel righ...,1
24779,gone broke wrong heart baby drove redneck crazy,2
24780,young buck wan na eat dat nigguh like aint fuc...,1
24781,youu got wild bitch tellin lie,1


### Converting text into numerical vector representations

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=50000) #setting the maximum number of words to be used to 50000 (most frequent)
tokenizer.fit_on_texts(corpus)
tokenized_tweet = tokenizer.texts_to_sequences(corpus)
vector =pad_sequences(tokenized_tweet, maxlen=30, padding='post')
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index)+1

In [15]:
vocab_size

33166

In [16]:
vector

array([[    2, 11802,    97, ...,     0,     0,     0],
       [    2,  7670,    87, ...,     0,     0,     0],
       [    2,  5800,   656, ...,     0,     0,     0],
       ...,
       [  277,  1954,    63, ...,     0,     0,     0],
       [ 5495,     9,   860, ...,     0,     0,     0],
       [ 9980, 33161, 33162, ...,     0,     0,     0]])

In [17]:
X=vector

In [18]:
Y = pd.get_dummies(df['class'], dtype=int).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (24783, 3)


In [19]:
Y

array([[0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       ...,
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [20]:
Y[50]

array([0, 1, 0])

In [21]:
df['tweet']

0        rt mayasolovely woman complain cleaning house ...
1        rt mleew boy dat cold tyga dwn bad cuffin dat ...
2        rt urkindofbrand dawg rt sbaby life ever fuck ...
3              rt c g anderson viva based look like tranny
4        rt shenikaroberts shit hear might true might f...
                               ...                        
24778    muthaf lie lifeasking pearl corey emanuel righ...
24779      gone broke wrong heart baby drove redneck crazy
24780    young buck wan na eat dat nigguh like aint fuc...
24781                       youu got wild bitch tellin lie
24782    ruffled ntac eileen dahlia beautiful color com...
Name: tweet, Length: 24783, dtype: object

In [22]:
y=np.array(Y)

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [24]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [25]:
from tensorflow.keras.layers import Dropout, Bidirectional

embedding_vector_dim=50 ##features representation
model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_dim,input_length=30))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(50)))
model.add(Dropout(0.3))
#model.add(LSTM(50))
#model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())



None


In [26]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min', restore_best_weights=True)
model.fit(X_train, y_train, validation_split=0.33, epochs=10, batch_size=64, callbacks=[early_stopping])

Epoch 1/10
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.7608 - loss: 0.6790 - val_accuracy: 0.8916 - val_loss: 0.3365
Epoch 2/10
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9022 - loss: 0.3111 - val_accuracy: 0.9024 - val_loss: 0.3128
Epoch 3/10
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9252 - loss: 0.2291 - val_accuracy: 0.8951 - val_loss: 0.3364
Epoch 4/10
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9342 - loss: 0.1868 - val_accuracy: 0.8845 - val_loss: 0.3721
Epoch 5/10
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9656 - loss: 0.1212 - val_accuracy: 0.8907 - val_loss: 0.4517
Epoch 6/10
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9769 - loss: 0.0794 - val_accuracy: 0.8790 - val_loss: 0.4518
Epoch 7/10
[1m174/174

<keras.src.callbacks.history.History at 0x1fae4c2a080>

In [27]:
y_pred=model.predict(X_test)
y_pred=np.where(y_pred > 0.5, 1,0)
from sklearn.metrics import accuracy_score, classification_report
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy score : ", accuracy_score(y_test,y_pred))

[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       465
           1       0.92      0.96      0.94      6335
           2       0.84      0.80      0.82      1379

   micro avg       0.91      0.88      0.89      8179
   macro avg       0.59      0.59      0.59      8179
weighted avg       0.86      0.88      0.87      8179
 samples avg       0.88      0.88      0.88      8179

Accuracy score :  0.8792028365325835


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
def predict_on_user_input():
  user_text = input("Enter your text: ")
  user_text = preprocess(user_text)
  tokens = user_text.split()
  encoded_sequence = [word_index.get(token, 0) for token in tokens]  
  padded_sequence = pad_sequences([encoded_sequence], maxlen=30, padding='post')

 
  prediction = model.predict(padded_sequence)
  if prediction[0][0] > 0.5:
    print("Highly offensive and Hate speech detected")
  elif prediction[0][1] > 0.5:
    print("Hate speech detected")
  else:
    print("Positive")
  

predict_on_user_input()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Hate speech detected


In [30]:
import pickle 
pickle.dump(model,open('model.pkl','wb'))

In [31]:
pickle.dump(word_index,open('word_index.pkl','wb'))