### Importing necessary libraries

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#to data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#NLP tools
import re
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

#train split and fit models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#model selection
from sklearn.metrics import confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mridu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Importing the dataset

In [3]:
data = pd. read_csv("D:\\Kaggle Project\\NLP hate speech detection\\labeled_data.csv")

### Previewing the data

In [4]:
print(data. head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [5]:
data["labels"] = data["class"]. map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
data = data[["tweet", "labels"]]
print(data. head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  
0  No Hate and Offensive Speech  
1              Offensive Speech  
2              Offensive Speech  
3              Offensive Speech  
4              Offensive Speech  


In [6]:
stopword=set(stopwords.words('english'))
stemmer = nltk. SnowballStemmer("english")

In [7]:
def clean (text):
    text = str(text). lower()
    text = re.sub('[.?]', '', text)
    text = re.sub('https?://\S+|www.\S+', '', text)
    text = re.sub('<.?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w\d\w', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["tweet"] = data["tweet"].apply(clean)

In [20]:
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x)

### Splitting the Data

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

### Finding the best model to predict hate speech

##### Decision Tree 

In [37]:
dt = DecisionTreeClassifier()

dt.fit(X_train, y_train)

DecisionTreeClassifier()

##### Random Forest 

In [38]:
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

RandomForestClassifier()

##### Logistic Regression

In [41]:
lr = LogisticRegression()

lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Making the confusion matrix for each model

##### Decision Tree

In [43]:
y_pred_dt = dt.predict(X_test)
cm = confusion_matrix(y_test, y_pred_dt)
print(cm)

[[ 155   50  260]
 [  40 1174  165]
 [ 240  246 5849]]


##### Random Forest

In [44]:
y_pred_rf = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)

[[ 123   35  307]
 [  14 1097  268]
 [ 111  160 6064]]


##### Logistic Regression

In [45]:
y_pred_lr = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)

[[ 113   53  299]
 [   7 1190  182]
 [ 113  203 6019]]


In [46]:
rf_score = accuracy_score(y_test, y_pred_rf)
lr_score = accuracy_score(y_test, y_pred_lr)
dt_score = accuracy_score(y_test, y_pred_dt)

print('Random Forest Accuracy: ', str(rf_score))
print('Logistic Regression Accuracy: ', str(lr_score))
print('Decision Tree Accuracy: ', str(dt_score))

Random Forest Accuracy:  0.8905734197334637
Logistic Regression Accuracy:  0.8952194644822106
Decision Tree Accuracy:  0.8776134001711701


We can see that Logistic Regression gives the best accuracy of all the models. But it is important to notice that Random Forest and Decision Tree had great results as well.

### Predicting the outcome

In [69]:
inp = "You are too bad and I dont like your attitude"
inp = cv.transform([inp]).toarray()
print(lr.predict(inp))

['Offensive Speech']


In [67]:
inp = "It is really awesome"
inp = cv.transform([inp]).toarray()
print(lr.predict(inp))

['No Hate and Offensive Speech']


We can see that the logistic regression model predicts the input correctly.