# Dataset Used
### Twitter-hate-speech
<a href='https://www.kaggle.com/datasets/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv' >Download from here</a>

In [165]:
import opendatasets as od
od.download('https://www.kaggle.com/datasets/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv')

Skipping, found downloaded files in ".\twitter-hate-speech" (use force=True to force download)


# Importing Required Libraries

In [166]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [167]:
# use nltk.download()
# nltk.download()
# download all or required ones

In [168]:
lemitizer = nltk.stem.WordNetLemmatizer()
stop_word = stopwords.words('english')

In [169]:
df = pd.read_csv('twitter-hate-speech/train_E6oV3lV.csv')

In [170]:
df = df.drop(columns=['id'],axis=-1)

In [171]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


# Cleaning Data

In [172]:
def clean_tweet(text):
    global lemitizer , stop_word
    text = re.sub(r'@\w*', ' ' , text)
    text = re.sub(r'#\w*', ' ' , text)
    text = re.sub(r"'s\b", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", ' ', text)
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [w  for w in text if w not in stop_word]
    text = [lemitizer.lemmatize(w) for w in text]
    return text

In [173]:
df['tweet'] = df['tweet'].apply(clean_tweet)

In [174]:
df.head()

Unnamed: 0,label,tweet
0,0,"[father, dysfunctional, selfish, drag, kid, dy..."
1,0,"[thanks, credit, use, cause, offer, wheelchair..."
2,0,"[bihday, majesty]"
3,0,"[love, u, take, u, time, ur]"
4,0,"[factsguide, society]"


In [175]:
df.isnull().sum()

label    0
tweet    0
dtype: int64

In [176]:
tweets = []
for i in df['tweet']:
    tweets.append(' '.join(i))
label = df['label']

In [177]:
tweets[:5]

['father dysfunctional selfish drag kid dysfunction',
 'thanks credit use cause offer wheelchair van pdx',
 'bihday majesty',
 'love u take u time ur',
 'factsguide society']

# text to vector    COUNT VECTORIZER (BAG OF WORD)

In [178]:
cv = CountVectorizer(max_features=2000)
bow_tweet_vectors = cv.fit_transform(tweets)

# text to vector    TF-IDF

In [179]:
tv = TfidfVectorizer(max_features=2000)
tfidf_tweet_vectors = tv.fit_transform(tweets)

# Comparing various algorithms

In [180]:
lr = LogisticRegression()
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()

models = [
    [lr,'LogisticRegression'],
    [dt,'DecisionTreeClassifier'],
    [knn,'KNeighborsClassifier'],
    [rf,'RandomForestClassifier']
]

In [181]:
def run(x,y,models):
    x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2)
    accuracy=[]
    for model in tqdm(models):
        model[0].fit(x_train,y_train)
        score = model[0].score(x_test,y_test)
        score = float("{:.4f}".format(score))
        accuracy.append([model[1],score])
    return accuracy


In [182]:
bow_accuracy = run(bow_tweet_vectors,label,models)
tfidf_accuracy = run(tfidf_tweet_vectors,label,models)

100%|██████████| 4/4 [00:31<00:00,  7.93s/it]
100%|██████████| 4/4 [00:33<00:00,  8.46s/it]


In [183]:
bow_accuracy,tfidf_accuracy

([['LogisticRegression', 0.9395],
  ['DecisionTreeClassifier', 0.927],
  ['KNeighborsClassifier', 0.9357],
  ['RandomForestClassifier', 0.9393]],
 [['LogisticRegression', 0.9465],
  ['DecisionTreeClassifier', 0.9406],
  ['KNeighborsClassifier', 0.9435],
  ['RandomForestClassifier', 0.9523]])

In [184]:
accuracy = {'model_name':[i[1] for i in models],
            'bow_accuracy':[str(i[1]*100)+'%' for i in bow_accuracy],
            'tfidf_accuracy':[str(i[1]*100)+'%' for i in tfidf_accuracy]
            }

accuracy_data = pd.DataFrame.from_dict(accuracy)

In [185]:
accuracy_data.head()

Unnamed: 0,model_name,bow_accuracy,tfidf_accuracy
0,LogisticRegression,93.95%,94.65%
1,DecisionTreeClassifier,92.7%,94.06%
2,KNeighborsClassifier,93.57%,94.35%
3,RandomForestClassifier,93.93%,95.23%
