In [231]:
#importing all the required libraries
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split 

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output

import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [232]:
from sklearn.model_selection import KFold, cross_val_score

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split


###  Grid-search
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [234]:
#Load data
data = pd.read_csv('TwitterHate.csv')

In [235]:
data.shape

(31962, 3)

In [236]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [237]:
data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [238]:
data = data[['label', 'tweet']]
data

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
...,...,...
31957,0,ate @user isz that youuu?ðððððð...
31958,0,to see nina turner on the airwaves trying to...
31959,0,listening to sad songs on a monday morning otw...
31960,1,"@user #sikh #temple vandalised in in #calgary,..."


In [239]:
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [240]:
#checking null values
data.isnull().sum()

label    0
tweet    0
dtype: int64

In [241]:
data['tweet'].head(10)

0     @user when a father is dysfunctional and is s...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model   i love u take with u all the time in ...
4               factsguide: society now    #motivation
5    [2/2] huge fan fare and big talking before the...
6     @user camping tomorrow @user @user @user @use...
7    the next school year is the year for exams.ð...
8    we won!!! love the land!!! #allin #cavs #champ...
9     @user @user welcome here !  i'm   it's so #gr...
Name: tweet, dtype: object

In [242]:
#Removing user handles @ 
data['tweet'] =data['tweet'].apply(lambda x : ' '.join([tweet for tweet in x.split()if not tweet.startswith("@")]))

In [243]:
#removing '#' hashtag while retaining the term
data['tweet'] =data['tweet'].str.replace('#','')

In [244]:
#Removing all the greek characters using unidecode library
import unidecode
data['tweet'] = data['tweet'].apply(lambda x : ' '.join([unidecode.unidecode(word) for word in x.split()]))

In [245]:
#regular expressions, remove URLs.
data['tweet']=data['tweet'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

In [246]:
data['tweet']

0        when a father is dysfunctional and is so selfi...
1        thanks for lyft credit i can't use cause they ...
2                                      bihday your majesty
3        model i love u take with u all the time in urd...
4                       factsguide: society now motivation
                               ...                        
31957                    ate isz that youuu?ddddddddda$?i,
31958    to see nina turner on the airwaves trying to w...
31959    listening to sad songs on a monday morning otw...
31960    sikh temple vandalised in in calgary, wso cond...
31961                             thank you for you follow
Name: tweet, Length: 31962, dtype: object

In [247]:
#get all the tokenized terms into one large list
from nltk.tokenize import word_tokenize
tok=list(data['tweet'])
totlist=[]
for i in tok:
    totlist.append(list(word_tokenize(i)))
flat_list = [item for sublist in totlist for item in sublist if len(item)>1]
flat_list[:10]

['when',
 'father',
 'is',
 'dysfunctional',
 'and',
 'is',
 'so',
 'selfish',
 'he',
 'drags']

In [248]:
#find the 10 most common terms
from collections import Counter
tenmostcommonwords = Counter(flat_list)
tenmostcommonwords.most_common(10)

[('the', 10153),
 ('to', 9831),
 ('you', 5851),
 ('and', 4891),
 ('in', 4638),
 ('for', 4488),
 ('is', 4286),
 ('of', 4166),
 ('my', 3678),
 ('it', 3550)]

In [249]:
stopwords = nltk.corpus.stopwords.words('english')

In [250]:
ps = nltk.PorterStemmer()

In [251]:
#removing punctuations,case normalizing tockenizing the words stemming and checking stop words and returing the string
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords if len(word)>1]
    return text

In [252]:
#implementing vectorization
tfidf_vect = TfidfVectorizer(analyzer=clean_text,  max_features =5000)
X_tfidf = tfidf_vect.fit_transform(data['tweet'])

In [253]:
X_tfidf

<31962x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 199245 stored elements in Compressed Sparse Row format>

In [254]:
X_features = pd.DataFrame(X_tfidf.toarray())

In [255]:
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [256]:
X_features.shape

(31962, 5000)

In [257]:
feauture_name = tfidf_vect.get_feature_names()
pd.DataFrame(X_tfidf.toarray(), columns = feauture_name)

Unnamed: 0,01,02,03,05,10,100,1000,10k,11,11th,...,zara,zealand,zelda,zen,zero,zionazi,zionism,zone,zoo,zoro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], train_size = 0.8, random_state = 123)

In [259]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

25569
6393
25569
6393


In [260]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
4039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
#applying simple logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
lr = LogisticRegression(solver = 'liblinear', random_state = 42, max_iter=1000)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print("Accuracy: ",round(metrics.accuracy_score(y_test,y_pred),3))
print("F1: ",round(metrics.f1_score(y_test, y_pred),3))

Accuracy:  0.955
F1:  0.528


In [262]:
#test data metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[5947   13]
 [ 273  160]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5960
           1       0.92      0.37      0.53       433

    accuracy                           0.96      6393
   macro avg       0.94      0.68      0.75      6393
weighted avg       0.95      0.96      0.95      6393



In [263]:
#train data metrics
y_pred_train = lr.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

[[23724    36]
 [ 1079   730]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     23760
           1       0.95      0.40      0.57      1809

    accuracy                           0.96     25569
   macro avg       0.95      0.70      0.77     25569
weighted avg       0.96      0.96      0.95     25569



In [264]:
#balancing the class
lr = LogisticRegression(solver = 'liblinear',class_weight='balanced', random_state = 42, max_iter=1000)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print("Accuracy: ",round(metrics.accuracy_score(y_test,y_pred),3))
print("F1: ",round(metrics.f1_score(y_test, y_pred),3))

Accuracy:  0.925
F1:  0.601


In [265]:
#test data metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[5551  409]
 [  71  362]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      5960
           1       0.47      0.84      0.60       433

    accuracy                           0.92      6393
   macro avg       0.73      0.88      0.78      6393
weighted avg       0.95      0.92      0.93      6393



In [266]:
#train data metrics
y_pred_train = lr.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

[[22461  1299]
 [   51  1758]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     23760
           1       0.58      0.97      0.72      1809

    accuracy                           0.95     25569
   macro avg       0.79      0.96      0.85     25569
weighted avg       0.97      0.95      0.95     25569



In [267]:
# define models and parameters with Regularization and Hyperparameter tuning:
model = LogisticRegression(class_weight='balanced')
solvers = ['liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='recall',error_score=0)
grid_result = grid_search.fit(X_train,y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.793984 using {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.693938 (0.024717) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.737973 (0.020786) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.791593 (0.019726) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.793984 (0.024251) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.764135 (0.028463) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
#so the best score: 0.793984 and best parameters using {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}

In [268]:
y_pred = lr.predict(X_test)

In [269]:
#test data metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[5551  409]
 [  71  362]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      5960
           1       0.47      0.84      0.60       433

    accuracy                           0.92      6393
   macro avg       0.73      0.88      0.78      6393
weighted avg       0.95      0.92      0.93      6393



In [None]:
#recall on the toxic comments is 0.93
#f1-score is 0.96