In [1]:
import pandas as pd
from math import log

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from numpy import reshape,asarray,shape

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, classification_report
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier


import re

tweet = pd.read_csv('D:\\Projects\\AI\\Twitter Hate Speech\\TwitterHate.csv')
tweet.drop('id',axis=1,inplace=True)
df = tweet.copy()


In [2]:
df['tweet'] = df['tweet'].str.lower()

df['tweet'].replace(r'@\w+','',regex=True,inplace=True)

df['tweet'].replace(r'http\S+','',regex=True,inplace=True)

def rem_hash_punct(text):
    text = ''.join(text)
    clean_text = re.sub(r"#",'',text)
    clean_text = re.sub(r"[^\w\s]", "", text)
    return clean_text

def rem_digits(text):
    no_digits = []
    for word in text:
        no_digits.append(re.sub(r'\d','',word))
    return ''.join(no_digits)

df['tweet'] = df['tweet'].apply(rem_hash_punct)
df['tweet'] = df['tweet'].apply(rem_digits)


tokenizer = TweetTokenizer(preserve_case=True)
df['tweet'] = df['tweet'].apply(tokenizer.tokenize)

stop_words = stopwords.words('english')

def remove_stopwords(text):
    clean_text = [word for word in text if not word in stop_words]
    return clean_text  

df['tweet'] = df['tweet'].apply(remove_stopwords)

In [3]:
wordnet_lemmatizer = WordNetLemmatizer()

def remove_Lemm(tokens):
    for token in tokens:
        token = wordnet_lemmatizer.lemmatize(token, pos="v")
    return tokens
        
df['tweet'] = df['tweet'].apply(remove_Lemm)

def rem_nonalpha(text):
    text = [word for word in text if word.isalpha()]
    return text

df['tweet'] = df['tweet'].apply(rem_nonalpha)

In [4]:
def listToString(s):
    str1 = ""

    for ele in s:
        str1 += ele
        str1 += " "

    return str1

df['tweet'] = df['tweet'].apply(listToString)

In [5]:
cnt_Normal = df[df['label'] == 0]['tweet'].count()
Hate = df[df['label'] == 1]
Normal = df[df['label'] == 0]
Hate_oversample = Hate.sample(cnt_Normal, replace=True)
oversampled = pd.concat([Normal, Hate_oversample], axis=0)

In [6]:
X = oversampled['tweet']
y = oversampled["label"]

count_vect = CountVectorizer(stop_words='english')
transformer  = TfidfTransformer(norm='l2',sublinear_tf=True)

X_counts = count_vect.fit_transform(X)
X_Vec = transformer.fit_transform(X_counts)

X_Vec = pd.DataFrame.sparse.from_spmatrix(X_Vec)

In [7]:
def Weight(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    N = tn + fp + fn + tp
    C = tn+tp

    E = abs(C-N)/N
    W = 0.5 * log((1-E)/E)
    return(W)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_Vec,y, test_size=0.33, random_state=44, shuffle =True)

In [9]:
W = []
y_predict = []

print("Linear Regression:")
Linear_Regression = LinearRegression()
Linear_Regression.fit(X_train,y_train)
pre = Linear_Regression.predict(X_train)

y1_predict = []
for i in pre:
    if i>0.5:
        y1_predict.append(1)
    else:
        y1_predict.append(0)

print(accuracy_score(y_train,y1_predict))

W.append(Weight(y_train,y1_predict))
y_predict.append(y1_predict)


print("============================================================")
print("Random Forest:")
Random_Forest = RandomForestClassifier(n_estimators=500,max_depth=10,n_jobs=1)
Random_Forest.fit(X_train,y_train)
print(Random_Forest.score(X_train,y_train))

y2_predict = Random_Forest.predict(X_train)
W.append(Weight(y_train,y2_predict))
y_predict.append(y2_predict)


print("============================================================")
print("Naive Bayes:")
Naive_Bayes = MultinomialNB()
Naive_Bayes.fit(X_train,y_train)
print(Naive_Bayes.score(X_train,y_train))

y3_predict = Naive_Bayes.predict(X_train)
W.append(Weight(y_train,y3_predict))
y_predict.append(y3_predict)

print("============================================================")
print("Hard Voting:")
KNeighbors_Hard = KNeighborsClassifier(n_neighbors=10)
Decision_Tree_Hard = DecisionTreeClassifier(criterion = 'gini',max_depth=10,random_state = 33)
SGD_Hard = SGDClassifier(loss='log')
Hard_Voting = VotingClassifier(estimators=[('KN_H', KNeighbors_Hard),('DT_H', Decision_Tree_Hard),
                                           ('SGD_H',SGD_Hard)],voting='hard')

Hard_Voting.fit(X_train,y_train)
print(Hard_Voting.score(X_train,y_train))

y4_predict = Hard_Voting.predict(X_train)
W.append(Weight(y_train,y4_predict))
y_predict.append(y4_predict)


print("============================================================")
print("Soft Voting:")
KNeighbors_Soft = KNeighborsClassifier(n_neighbors=10)
Decision_Tree_Soft = DecisionTreeClassifier(criterion = 'gini',max_depth=10,random_state = 33)
SGD_Soft = SGDClassifier(loss='log')
Soft_Voting = VotingClassifier(estimators=[('KN_S', KNeighbors_Soft),('DT_S', Decision_Tree_Soft),
                                           ('SGD_S',SGD_Soft)],voting='soft')
Soft_Voting.fit(X_train,y_train)
print(Soft_Voting.score(X_train,y_train))

y5_predict = Soft_Voting.predict(X_train)
W.append(Weight(y_train,y5_predict))
y_predict.append(y5_predict)


Linear Regression:
0.9988951386098834
Random Forest:
0.8477551225391724
Naive Bayes:
0.9684612294094014
Hard Voting:
0.9770490156689433
Soft Voting:
0.9804138208115709


In [10]:
X_meta = []

y_predict = asarray(y_predict)
y_predict = y_predict.T

for XX in y_predict:
    tt = []
    for k in range(0,5):
        tt.append(XX[k]*W[k])
    
    X_meta.append(tt)

In [11]:
#Meta = MLPClassifier(hidden_layer_sizes=(1024,128,64), activation='relu',solver='adam', max_iter=200)
Meta = RandomForestClassifier(n_estimators=500, criterion = 'gini',max_depth=10,random_state = 33)
Meta.fit(X_meta,y_train)

RandomForestClassifier(max_depth=10, n_estimators=500, random_state=33)

In [12]:
def SWE_Model():
    yy_predict = []
    
    print("Linear_Regression")
    pre = Linear_Regression.predict(X_test)
    yy1_predict = []
    for i in pre:
        if i>0.5:
            yy1_predict.append(1)
        else:
            yy1_predict.append(0)
    yy_predict.append(yy1_predict)
    
    print("accuracy: ",accuracy_score(y_test,yy1_predict))
    print("precision: ",precision_score(y_test,yy1_predict))
    print("f1: ",f1_score(y_test,yy1_predict))
    print("recall: ",recall_score(y_test,yy1_predict))
    print("confusion_matrix: ",confusion_matrix(y_test,yy1_predict))
    print("============================================================")
    
    
    print("Random_Forest")
    yy2_predict = Random_Forest.predict(X_test)
    yy_predict.append(yy2_predict)
    
    print("accuracy: ",accuracy_score(y_test,yy2_predict))
    print("precision: ",precision_score(y_test,yy2_predict))
    print("f1: ",f1_score(y_test,yy2_predict))
    print("recall: ",recall_score(y_test,yy2_predict))
    print("confusion_matrix: ",confusion_matrix(y_test,yy2_predict))
    print("============================================================")
    
    
    print("Naive_Bayes")
    yy3_predict = Naive_Bayes.predict(X_test)
    yy_predict.append(yy3_predict)
    
    print("accuracy: ",accuracy_score(y_test,yy3_predict))
    print("precision: ",precision_score(y_test,yy3_predict))
    print("f1: ",f1_score(y_test,yy3_predict))
    print("recall: ",recall_score(y_test,yy3_predict))
    print("confusion_matrix: ",confusion_matrix(y_test,yy3_predict))
    print("============================================================")
    
    
    print("Hard_Voting")
    yy4_predict = Hard_Voting.predict(X_test)
    yy_predict.append(yy4_predict)
    
    print("accuracy: ",accuracy_score(y_test,yy4_predict))
    print("precision: ",precision_score(y_test,yy4_predict))
    print("f1: ",f1_score(y_test,yy4_predict))
    print("recall: ",recall_score(y_test,yy4_predict))
    print("confusion_matrix: ",confusion_matrix(y_test,yy4_predict))
    print("============================================================")
    
    
    print("Soft_Voting")
    yy5_predict = Soft_Voting.predict(X_test)
    yy_predict.append(yy5_predict)
    
    print("accuracy: ",accuracy_score(y_test,yy5_predict))
    print("precision: ",precision_score(y_test,yy5_predict))
    print("f1: ",f1_score(y_test,yy5_predict))
    print("recall: ",recall_score(y_test,yy5_predict))
    print("confusion_matrix: ",confusion_matrix(y_test,yy5_predict))

    
    XX_meta = []

    yy_predict = asarray(yy_predict)
    yy_predict = yy_predict.T

    for XX in yy_predict:
        tt = []
        for k in range(0,5):
            tt.append(XX[k]*W[k])

        XX_meta.append(tt)

    return(Meta.predict(XX_meta))

pred_arr = SWE_Model()

Linear_Regression
accuracy:  0.9415783034257749
precision:  0.8970132944818794
f1:  0.9450306983883345
recall:  0.9984796270018245
confusion_matrix:  [[8619 1131]
 [  15 9851]]
Random_Forest
accuracy:  0.8470636215334421
precision:  0.9475880052151239
f1:  0.8289233576642336
recall:  0.7366713967159944
confusion_matrix:  [[9348  402]
 [2598 7268]]
Naive_Bayes
accuracy:  0.9515191680261011
precision:  0.9225519006540904
f1:  0.9534166054371784
recall:  0.9864180012162984
confusion_matrix:  [[8933  817]
 [ 134 9732]]
Hard_Voting
accuracy:  0.9676794453507341
precision:  0.9765641131530043
f1:  0.9675736497545008
recall:  0.9587472126495034
confusion_matrix:  [[9523  227]
 [ 407 9459]]
Soft_Voting
accuracy:  0.970942088091354
precision:  0.9549725920125294
f1:  0.9716163728712279
recall:  0.9888505980133793
confusion_matrix:  [[9290  460]
 [ 110 9756]]


In [13]:
print("accuracy: ",accuracy_score(y_test,pred_arr))
print("precision: ",precision_score(y_test,pred_arr))
print("f1: ",f1_score(y_test,pred_arr))
print("recall: ",recall_score(y_test,pred_arr))
print("confusion_matrix: ",confusion_matrix(y_test,pred_arr))

accuracy:  0.9415783034257749
precision:  0.8970132944818794
f1:  0.9450306983883345
recall:  0.9984796270018245
confusion_matrix:  [[8619 1131]
 [  15 9851]]


In [25]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42344,42345,42346,42347,42348,42349,42350,42351,42352,42353
43978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
Soft_Voting.transform(X_test).shape


(19616, 6)

In [29]:
print(Random_Forest.decision_path(X_test))

(<19616x33384 sparse matrix of type '<class 'numpy.int64'>'
	with 106915959 stored elements in Compressed Sparse Row format>, array([    0,    59,   132,   191,   224,   293,   364,   417,   486,
         561,   612,   709,   782,   849,   910,   983,  1084,  1135,
        1208,  1297,  1370,  1433,  1510,  1581,  1656,  1723,  1816,
        1885,  1966,  2053,  2136,  2219,  2294,  2353,  2422,  2483,
        2548,  2615,  2646,  2743,  2818,  2889,  2952,  2999,  3056,
        3109,  3158,  3209,  3286,  3323,  3358,  3415,  3490,  3581,
        3648,  3711,  3782,  3839,  3892,  3957,  4010,  4075,  4140,
        4199,  4264,  4335,  4380,  4429,  4478,  4541,  4634,  4705,
        4784,  4857,  4942,  5025,  5074,  5125,  5212,  5291,  5370,
        5473,  5512,  5555,  5632,  5685,  5734,  5815,  5888,  5943,
        6018,  6079,  6148,  6255,  6302,  6383,  6480,  6541,  6618,
        6681,  6754,  6849,  6930,  6995,  7066,  7115,  7186,  7251,
        7344,  7413,  7478,  7559,