In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import string
import unicodedata
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [2]:
df = pd.read_csv('TwitterHate.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df.shape

(31962, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:
df.drop(columns='id',inplace=True)

In [6]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [7]:
df.isna().sum()

label    0
tweet    0
dtype: int64

In [8]:
df['label'].unique()

array([0, 1])

In [9]:
df.label.unique()

array([0, 1])

In [10]:
df['tweet']=df['tweet'].str.lower()
df['tweet'].head()

0     @user when a father is dysfunctional and is s...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model   i love u take with u all the time in ...
4               factsguide: society now    #motivation
Name: tweet, dtype: object

In [11]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
# STOPWORDS = set(stopwords.words('english'))
# def cleaningStopwords(text):
#     return " ".join([word for word in str(text).split() if word not in STOPWORDS])
# df['tweet'] = df['tweet'].apply(lambda text: cleaningStopwords(text))
# df['tweet'].sample(5)

In [13]:
STOPWORDS = set(stopwords.words('english'))
def cleaningStopwords(text):
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in STOPWORDS]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
df['tweet'] = df['tweet'].apply(lambda text: cleaningStopwords(text))
df['tweet'].sample(5)

2738     today beautiful day . time 2 remember beautifu...
29476    @ user good morning rj ji # jiyodilse # salaam...
31467                hi ! # paytoday snapchat : sandy-9791
9261     going miss cheeky birdie next 12 days ! ! ! # ...
3413     beautiful bobo ! love ! # friends # dinner # p...
Name: tweet, dtype: object

In [14]:

english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaningPunctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
df['tweet'] = df['tweet'].apply(lambda x: cleaningPunctuations(x))
df['tweet'].sample(5)

9218     happy weddingâ¤ï¸  çµå©å¼ wedding  ãå¹¸...
21341    hea  user  porsche  photo  photography  fun  l...
10017    really  dancing street  streetdance  tinydance...
10246                 thankful health   thankful  positive
19563    today notice sounds  health  user  user  user ...
Name: tweet, dtype: object

In [15]:
def cleaningRepeatingChar(text):
    return re.sub(r'(.)1+', r'1', text)
df['tweet'] = df['tweet'].apply(lambda x: cleaningRepeatingChar(x))
df['tweet'].sample(5)

5727      user today going good day ððð¡ð¤ð...
8758      user preparations underway   user  user  user...
26793     user  fridayfeeling tweeters  user  user  che...
18602    ve never seen bigger  happier  smile   bighapp...
25686     user let s vote  ë¹ ì¤  ëì¼  user  060521...
Name: tweet, dtype: object

In [16]:
def cleaningURLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
df['tweet'] = df['tweet'].apply(lambda x: cleaningURLs(x))
df['tweet'].sample(5)


24544        user thankful new places   thankful  positive
19001    playing  abandoned toys   spiraling sun  music...
19346    chill  ðð » ðð¹  thankful  blessed  ...
24351     marathon bull  dominate bull direct whatever ...
16565     guy ask change  gave pocket lit cigarette wal...
Name: tweet, dtype: object

In [17]:
def cleaningNumbers(data):
    return re.sub('[0-9]+', '', data)
df['tweet'] = df['tweet'].apply(lambda x: cleaningNumbers(x))
df['tweet'].sample(5)

14670       thankful family vacations   thankful  positive
4059     jessica dona asian beauty gives happy ending m...
9293     ca nt secular education haredi life  existed b...
30153     user make  quitting  cigarettes  gift loved o...
19237    little things make fianally got order  india  ...
Name: tweet, dtype: object

In [18]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [19]:
df['tweet'] = df['tweet'].apply(lambda x: remove_accented_chars(x))

In [20]:
df['tweet'].sample(5)

12363    bahoom   funny  humor  jokes  joke  wordplay  ...
10958     user s s happening twitter  m engaged amazing...
14355    handsome guy join anchor desk tonight   amp   ...
26394     user  user exactly  feel m butt end giant com...
11927     germanyhetalia bull  dominate bull direct wha...
Name: tweet, dtype: object

In [21]:
nlp=spacy.load('en_core_web_sm')
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [22]:
df['tweet'] = df['tweet'].apply(lambda x: lemmatize_text(x))

In [23]:
X = df['tweet']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, test_size=0.2, random_state=42)

In [24]:
X_train.shape

(25569,)

In [25]:
y_train.shape

(25569,)

In [26]:
vec = TfidfVectorizer(stop_words='english')

In [27]:
# Vectorize text reviews to numbers
X_train_vec = vec.fit_transform(X_train).toarray()
X_test_vec = vec.transform(X_test).toarray()

In [28]:
classifier = RandomForestClassifier(n_estimators = 10)
classifier.fit(X_train_vec, y_train)

RandomForestClassifier(n_estimators=10)

In [29]:
y_pred = classifier.predict(X_test_vec)

In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[5925   20]
 [ 244  204]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5945
           1       0.91      0.46      0.61       448

    accuracy                           0.96      6393
   macro avg       0.94      0.73      0.79      6393
weighted avg       0.96      0.96      0.95      6393

Accuracy: 0.9587048334115439


In [31]:
adaclassifier = AdaBoostClassifier(n_estimators = 50)
adaclassifier.fit(X_train_vec, y_train)
y_pred = adaclassifier.predict(X_test_vec)
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[5889   56]
 [ 302  146]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5945
           1       0.72      0.33      0.45       448

    accuracy                           0.94      6393
   macro avg       0.84      0.66      0.71      6393
weighted avg       0.94      0.94      0.93      6393

Accuracy: 0.9440012513686845
