# Dataset Used
### Twitter-hate-speech
<a href='https://www.kaggle.com/datasets/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv' >Download from here</a>

In [33]:
import opendatasets as od
od.download('https://www.kaggle.com/datasets/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv')

Skipping, found downloaded files in ".\twitter-hate-speech" (use force=True to force download)


# Importing Required Libraries

In [34]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [35]:
# use nltk.download()
# nltk.download()
# download all or required ones

In [36]:
lemitizer = nltk.stem.WordNetLemmatizer()
stop_word = stopwords.words('english')

In [37]:
df = pd.read_csv('twitter-hate-speech/train_E6oV3lV.csv')

In [38]:
df = df.drop(columns=['id'],axis=-1)

In [39]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


# Cleaning Data

In [40]:
def clean_tweet(text):
    global lemitizer , stop_word
    text = re.sub(r'@\w*', ' ' , text)
    text = re.sub(r'#\w*', ' ' , text)
    text = re.sub(r"'s\b", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", ' ', text)
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [w  for w in text if w not in stop_word]
    text = [lemitizer.lemmatize(w) for w in text]
    return text

In [41]:
df['tweet'] = df['tweet'].apply(clean_tweet)

In [42]:
df.head()

Unnamed: 0,label,tweet
0,0,"[father, dysfunctional, selfish, drag, kid, dy..."
1,0,"[thanks, credit, use, cause, offer, wheelchair..."
2,0,"[bihday, majesty]"
3,0,"[love, u, take, u, time, ur]"
4,0,"[factsguide, society]"


In [43]:
df.isnull().sum()

label    0
tweet    0
dtype: int64

In [44]:
tweets = []
for i in df['tweet']:
    tweets.append(' '.join(i))
label = df['label']

In [45]:
tweets[:5]

['father dysfunctional selfish drag kid dysfunction',
 'thanks credit use cause offer wheelchair van pdx',
 'bihday majesty',
 'love u take u time ur',
 'factsguide society']

# text to vector    COUNT VECTORIZER (BAG OF WORD)

In [46]:
cv = CountVectorizer(max_features=2000)
bow_tweet_vectors = cv.fit_transform(tweets)

# text to vector    TF-IDF

In [47]:
tv = TfidfVectorizer(max_features=2000)
tfidf_tweet_vectors = tv.fit_transform(tweets)

#  text ro vector Word2Vec

In [48]:
model = Word2Vec(df['tweet'],min_count=2)


In [49]:
print(model.wv.index_to_key)



In [50]:
model.wv['day']

array([-0.7805923 ,  0.9131082 ,  0.5113532 ,  0.13570943, -0.12575847,
       -1.7939684 ,  0.45025775,  1.7801104 , -0.7941589 , -1.0572219 ,
        0.15440331, -0.78316766,  0.02230429,  0.36793575,  0.33924127,
       -0.5839232 ,  0.4436622 , -0.6482462 , -0.20646909, -1.6391269 ,
        0.687732  ,  0.61866784,  0.9586177 , -0.29618922,  0.18157165,
        0.2117336 ,  0.16031817, -0.08442115, -0.37667078, -0.01227106,
        0.6447824 , -0.08588237,  0.61494803, -0.5478383 , -0.29019877,
        1.4355892 ,  0.5992756 , -0.4168586 , -0.78226346, -1.2598016 ,
        0.5359572 , -0.5691479 , -0.487071  , -0.02487347,  0.7014963 ,
       -0.5321645 , -0.68495274, -0.04534176,  0.3314181 ,  0.69549024,
        0.41342777, -0.17199855, -0.481749  , -0.0684639 , -0.6801129 ,
        0.19623953,  0.8312727 , -0.2213892 , -1.0210879 ,  0.46699226,
        0.42328498,  0.1430854 ,  0.07069661,  0.13580942, -0.35876992,
        0.71472675,  0.15616572,  0.2422674 , -1.0867012 ,  0.77

### Average sentence vectors 
<a href='https://cs.stanford.edu/~quocle/paragraph_vector.pdf'>Know More About It</a>

In [51]:
def sent_vectorizer(sentence):
    vector_size = model.wv.vector_size
    wv_res = np.zeros(vector_size)
    ctr = 1
    for w in sentence:
        if w in model.wv:
            ctr+=1
            wv_res+=model.wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [52]:
sent_vectorizer('I am happy')

array([-0.07538381,  0.17625958,  0.16022864,  0.02902961,  0.11073684,
       -0.33069659,  0.0221958 ,  0.51074084, -0.17517995, -0.16041493,
       -0.14331925, -0.39766389, -0.06290065,  0.08625665,  0.00206879,
       -0.1972918 ,  0.02541585, -0.26063029, -0.0253872 , -0.50327716,
        0.01344187,  0.0629345 ,  0.10593651, -0.10128224, -0.05800567,
       -0.0111932 , -0.26898278, -0.19619687, -0.23841169, -0.05803348,
        0.31554294,  0.07734561,  0.11383935, -0.10793112, -0.17392306,
        0.22541393, -0.08812436, -0.2222034 , -0.11379677, -0.43612817,
        0.01796007, -0.19255789, -0.18526494,  0.0027922 ,  0.19736383,
       -0.05319177, -0.25030319, -0.0369911 ,  0.15559051,  0.19016616,
        0.07150945, -0.21702308, -0.09077368,  0.1063291 , -0.05925108,
        0.14854359,  0.1860938 , -0.10974165, -0.03228606,  0.04697086,
        0.05548295,  0.11086507, -0.01546208,  0.05479263, -0.26190745,
        0.23490331,  0.06773121,  0.09944506, -0.31649368,  0.32

In [53]:
w2v_tweet_vectors=[]
for sent in tqdm(tweets):
    w2v_tweet_vectors.append(sent_vectorizer(sent))

100%|██████████| 31962/31962 [00:02<00:00, 14601.61it/s]


# Comparing various algorithms

In [54]:
lr = LogisticRegression()
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()

models = [
    [lr,'LogisticRegression'],
    [dt,'DecisionTreeClassifier'],
    [knn,'KNeighborsClassifier'],
    [rf,'RandomForestClassifier']
]

In [55]:
def run(x,y,models):
    x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2)
    accuracy=[]
    for model in tqdm(models):
        model[0].fit(x_train,y_train)
        score = model[0].score(x_test,y_test)
        score = float("{:.4f}".format(score))
        accuracy.append([model[1],score])
    return accuracy


In [56]:
bow_accuracy = run(bow_tweet_vectors,label,models)
tfidf_accuracy = run(tfidf_tweet_vectors,label,models)
w2v_accuracy = run(w2v_tweet_vectors,label,models)

100%|██████████| 4/4 [00:31<00:00,  7.89s/it]
100%|██████████| 4/4 [00:30<00:00,  7.67s/it]
100%|██████████| 4/4 [00:42<00:00, 10.58s/it]


In [57]:
print(bow_accuracy)
print(tfidf_accuracy)
print(w2v_accuracy)

[['LogisticRegression', 0.9482], ['DecisionTreeClassifier', 0.9329], ['KNeighborsClassifier', 0.9459], ['RandomForestClassifier', 0.9454]]
[['LogisticRegression', 0.9434], ['DecisionTreeClassifier', 0.9366], ['KNeighborsClassifier', 0.9373], ['RandomForestClassifier', 0.9484]]
[['LogisticRegression', 0.9329], ['DecisionTreeClassifier', 0.8888], ['KNeighborsClassifier', 0.9302], ['RandomForestClassifier', 0.9481]]


In [58]:
accuracy = {'model_name':[i[1] for i in models],
            'bow_accuracy':[str(i[1]*100)+'%' for i in bow_accuracy],
            'tfidf_accuracy':[str(i[1]*100)+'%' for i in tfidf_accuracy],
            'w2v_accuracy':[str(i[1]*100)+'%' for i in w2v_accuracy]
            }

accuracy_data = pd.DataFrame.from_dict(accuracy)

In [59]:
accuracy_data.head()

Unnamed: 0,model_name,bow_accuracy,tfidf_accuracy,w2v_accuracy
0,LogisticRegression,94.82000000000001%,94.34%,93.28999999999999%
1,DecisionTreeClassifier,93.28999999999999%,93.66%,88.88000000000001%
2,KNeighborsClassifier,94.59%,93.73%,93.02%
3,RandomForestClassifier,94.54%,94.84%,94.81%
