In [2]:
import numpy as np 
import pandas as pd 

import os
from pathlib import Path

import string
import nltk
from nltk.corpus import stopwords

import scipy.io
import scipy.linalg
from scipy.sparse import csr_matrix, vstack, lil_matrix 
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

import plotly.express as px
import plotly.figure_factory as ff
from yellowbrick.text import TSNEVisualizer

In [3]:
# Location of the dataset CSV, given relative to the notebook's working directory.
# It is passed as `file_path` to the cleaning operator created below.
DATASET_PATH = './measuring_hate_speech.csv'

# Data Cleaning

In [6]:
# Project-local helper class that wraps the data-cleaning steps used in this notebook.
from cleaning.clean_dataset import CleanTextDatasetOperator

# A single operator instance, bound to the dataset CSV, is reused by the cleaning cells below.
clean_dataset_operator = CleanTextDatasetOperator(
    file_path=DATASET_PATH
)

# Print the class documentation to see which cleaning methods are available.
help(CleanTextDatasetOperator)

Help on class CleanTextDatasetOperator in module cleaning.clean_dataset:

class CleanTextDatasetOperator(builtins.object)
 |  CleanTextDatasetOperator(file_path='', *args, **kwargs)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, file_path='', *args, **kwargs)
 |      This class wraps all data cleaning logic under one shade.
 |      
 |      Args:   
 |          file_path (str) : Path to the text dataset file in csv format.
 |      Returns:
 |          cleaned_df (pandas.DataFrame) : processed cleaned data frame.
 |      
 |      Added By : Abbas Ismail
 |  
 |  expand_contractions(self, df: pandas.core.frame.DataFrame)
 |      Method for expanding compression for text column
 |      
 |      Added By : Simranjeet and Navneet kaur
 |  
 |  load_csv_to_dataset(self)
 |      This method loads CSV dataset to pandas Dataframe
 |      Returns:
 |          df : pandas.DataFrame - dataset dataframe
 |      
 |      Added By : Abbas Ismail
 |  
 |  lowercase_text(self, df: pandas.core.f

In [7]:
# Load the CSV at DATASET_PATH into a pandas DataFrame and preview the first 10 rows.
dataset_df = clean_dataset_operator.load_csv_to_dataset()
dataset_df.head(10)

Unnamed: 0,hatespeech,text,sentiment,respect,insult,humiliate,dehumanize,violence,genocide,attack_defend
0,0.0,Yes indeed. She sort of reminds me of the elde...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,The trans women reading this tweet right now i...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2.0,Question: These 4 broads who criticize America...,4.0,4.0,4.0,4.0,4.0,0.0,0.0,4.0
3,0.0,It is about time for all illegals to go back t...,2.0,3.0,2.0,1.0,0.0,0.0,0.0,3.0
4,2.0,For starters bend over the one in pink and kic...,4.0,4.0,4.0,4.0,4.0,4.0,1.0,3.0
5,0.0,Sounds like the kinda wholsesome life I'd die ...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
6,2.0,Who the fuck is this insignificant simple mind...,4.0,4.0,4.0,4.0,3.0,0.0,0.0,3.0
7,2.0,Fuck off you insufferable retarded faggot.,4.0,4.0,4.0,4.0,0.0,0.0,0.0,3.0
8,0.0,"OMG! The EGO's of these young, young, inexperi...",4.0,4.0,4.0,3.0,3.0,0.0,1.0,1.0
9,2.0,"Worthless whore, these tits with look nice wit...",4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0


In [8]:
# Lowercase the tweet text, then preview the first 10 rows to check the result.
# Note: `dataset_df` is rebound here, so the raw-cased frame is no longer available afterwards.
dataset_df = clean_dataset_operator.lowercase_text(df=dataset_df)
dataset_df.head(10)

Unnamed: 0,hatespeech,text,sentiment,respect,insult,humiliate,dehumanize,violence,genocide,attack_defend
0,0.0,yes indeed. she sort of reminds me of the elde...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,the trans women reading this tweet right now i...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2.0,question: these 4 broads who criticize america...,4.0,4.0,4.0,4.0,4.0,0.0,0.0,4.0
3,0.0,it is about time for all illegals to go back t...,2.0,3.0,2.0,1.0,0.0,0.0,0.0,3.0
4,2.0,for starters bend over the one in pink and kic...,4.0,4.0,4.0,4.0,4.0,4.0,1.0,3.0
5,0.0,sounds like the kinda wholsesome life i'd die ...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
6,2.0,who the fuck is this insignificant simple mind...,4.0,4.0,4.0,4.0,3.0,0.0,0.0,3.0
7,2.0,fuck off you insufferable retarded faggot.,4.0,4.0,4.0,4.0,0.0,0.0,0.0,3.0
8,0.0,"omg! the ego's of these young, young, inexperi...",4.0,4.0,4.0,3.0,3.0,0.0,1.0,1.0
9,2.0,"worthless whore, these tits with look nice wit...",4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0


In [9]:
# Drop rows whose text contains numbers.
# Judging by the warning this cell emits, remove_number replaces a text value that
# contains numbers with None (it does not strip the digits out of the text); the
# dropna() call then removes those rows, which is why index 2 and 10 are missing
# from the preview below.
# NOTE(review): dropna() is called without `subset`, so a row with a missing value in
# any other column is dropped as well — confirm that this is intended.
dataset_df['text'] = clean_dataset_operator.remove_number(df=dataset_df[['text']])
dataset_df.dropna(inplace=True)
dataset_df.head(10)

  df[columns_with_numbers] = df[columns_with_numbers].applymap(lambda x: None if contains_numbers(x) else x)


Unnamed: 0,hatespeech,text,sentiment,respect,insult,humiliate,dehumanize,violence,genocide,attack_defend
0,0.0,yes indeed. she sort of reminds me of the elde...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,the trans women reading this tweet right now i...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,it is about time for all illegals to go back t...,2.0,3.0,2.0,1.0,0.0,0.0,0.0,3.0
4,2.0,for starters bend over the one in pink and kic...,4.0,4.0,4.0,4.0,4.0,4.0,1.0,3.0
5,0.0,sounds like the kinda wholsesome life i'd die ...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
6,2.0,who the fuck is this insignificant simple mind...,4.0,4.0,4.0,4.0,3.0,0.0,0.0,3.0
7,2.0,fuck off you insufferable retarded faggot.,4.0,4.0,4.0,4.0,0.0,0.0,0.0,3.0
8,0.0,"omg! the ego's of these young, young, inexperi...",4.0,4.0,4.0,3.0,3.0,0.0,1.0,1.0
9,2.0,"worthless whore, these tits with look nice wit...",4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0
11,0.0,"instagram refugees lmao, let's build a wall in...",3.0,3.0,3.0,3.0,2.0,1.0,1.0,2.0


In [10]:
# Remove stopwords
# Make sure the NLTK stopword corpus is present locally
# (presumably required by remove_stopwords — verify in cleaning.clean_dataset).
nltk.download('stopwords')
# Replace the text column with its stopword-filtered version.
dataset_df['text'] = clean_dataset_operator.remove_stopwords(df=dataset_df[['text']])
# Drop any row that has a missing value after the step above.
dataset_df.dropna(inplace=True)
dataset_df.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saiku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,hatespeech,text,sentiment,respect,insult,humiliate,dehumanize,violence,genocide,attack_defend
0,0.0,yes indeed . sort reminds elder lady played pa...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,trans women reading tweet right beautiful,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,time illegals go back country origin keep free...,2.0,3.0,2.0,1.0,0.0,0.0,0.0,3.0
4,2.0,starters bend one pink kick ass pussy get tast...,4.0,4.0,4.0,4.0,4.0,4.0,1.0,3.0
5,0.0,sounds like kinda wholsesome life 'd die ❤️ ne...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
6,2.0,fuck insignificant simple minded redneck ? get...,4.0,4.0,4.0,4.0,3.0,0.0,0.0,3.0
7,2.0,fuck insufferable retarded faggot .,4.0,4.0,4.0,4.0,0.0,0.0,0.0,3.0
8,0.0,"omg ! ego 's young , young , inexperienced wom...",4.0,4.0,4.0,3.0,3.0,0.0,1.0,1.0
9,2.0,"worthless whore , tits look nice bite marks cum",4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0
11,0.0,"instagram refugees lmao , let 's build wall in...",3.0,3.0,3.0,3.0,2.0,1.0,1.0,2.0


In [11]:
# Download the NLTK 'punkt' package (presumably the tokenizer data that
# nltk.word_tokenize in get_summary relies on — see the NLTK documentation).
nltk.download('punkt')
def get_summary(df):
    """Compute corpus-level word statistics for the ``text`` column of ``df``.

    Args:
        df (pandas.DataFrame): Frame with a ``text`` column whose values are strings.

    Returns:
        tuple: ``(fact, description)`` where ``fact`` is a dict with the keys
        ``TotalCount`` (number of rows), ``TotalWords`` (number of tokens),
        ``TotalUniqueWords`` (number of distinct lowercased tokens) and
        ``MeanWordsPerTweet`` (tokens per row; ``0.0`` when ``df`` has no rows),
        and ``description`` is the result of ``df.describe()``.
    """
    texts = df["text"].values
    # Tokens are lowercased so the unique-word count is case-insensitive.
    tokens = [token.lower() for text in texts for token in nltk.word_tokenize(text)]
    n_texts = len(texts)

    fact = {
        "TotalCount": n_texts,
        "TotalWords": len(tokens),
        "TotalUniqueWords": len(set(tokens)),
        # Guard the division: an empty frame would otherwise raise ZeroDivisionError.
        "MeanWordsPerTweet": len(tokens) / n_texts if n_texts else 0.0,
    }

    return fact, df.describe()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saiku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# f: dict of word-count statistics for the cleaned text;
# s: dataset_df.describe() table. Both are displayed in the next two cells.
f, s = get_summary(dataset_df)

In [13]:
# Display the word-count statistics (row count, total/unique words, mean words per tweet).
f

{'TotalCount': 117920,
 'TotalWords': 2124197,
 'TotalUniqueWords': 40115,
 'MeanWordsPerTweet': 18.013882293080055}

In [14]:
# Display the descriptive statistics returned by DataFrame.describe().
s

Unnamed: 0,hatespeech,sentiment,respect,insult,humiliate,dehumanize,violence,genocide,attack_defend
count,117920.0,117920.0,117920.0,117920.0,117920.0,117920.0,117920.0,117920.0,117920.0
mean,0.757378,2.956852,2.839086,2.578333,2.296158,1.863526,1.04531,0.663891,2.634218
std,0.935394,1.242679,1.318359,1.394268,1.375813,1.410096,1.341694,1.165313,1.119751
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0
50%,0.0,3.0,3.0,3.0,3.0,2.0,0.0,0.0,3.0
75%,2.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0
max,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
