In [1]:
import os 
import pandas

In [2]:
DATASET_PATH = './dataset/measuring_hate_speech.csv'

Importing custom operators developed by the Group to Clean, Process and build hate detection model.

<h1> Data Cleaning <h1/>

In [3]:
from cleaning.clean_dataset import CleanTextDatasetOperator

clean_dataset_operator = CleanTextDatasetOperator(
    file_path=DATASET_PATH
)

help(CleanTextDatasetOperator)


Help on class CleanTextDatasetOperator in module cleaning.clean_dataset:

class CleanTextDatasetOperator(builtins.object)
 |  CleanTextDatasetOperator(file_path='', *args, **kwargs)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, file_path='', *args, **kwargs)
 |      This class wraps all data cleaning logic under one shade.
 |      
 |      Args:   
 |          file_path (str) : Path to the text dataset file in csv format.
 |      Returns:
 |          cleaned_df (pandas.DataFrame) : processed cleaned data frame.
 |      
 |      Added By : Abbas Ismail
 |  
 |  expand_contractions(self, df: pandas.core.frame.DataFrame)
 |      Method for expanding compression for text column
 |      
 |      Added By : Simranjeet and Navneet kaur
 |  
 |  load_csv_to_dataset(self)
 |      This method loads CSV dataset to pandas Dataframe
 |      Returns:
 |          df : pandas.DataFrame - dataset dataframe
 |      
 |      Added By : Abbas Ismail
 |  
 |  lowercase_text(self, df: pandas.core.f

In [4]:
#Loading the dataset
dataset_df = clean_dataset_operator.load_csv_to_dataset()
dataset_df.head()

Unnamed: 0,hatespeech,text
0,0.0,Yes indeed. She sort of reminds me of the elde...
1,0.0,The trans women reading this tweet right now i...
2,2.0,Question: These 4 broads who criticize America...
3,0.0,It is about time for all illegals to go back t...
4,2.0,For starters bend over the one in pink and kic...


In [5]:
#changing the text to lowercase 
dataset_df = clean_dataset_operator.lowercase_text(df=dataset_df)
dataset_df.head()

Unnamed: 0,hatespeech,text
0,0.0,yes indeed. she sort of reminds me of the elde...
1,0.0,the trans women reading this tweet right now i...
2,2.0,question: these 4 broads who criticize america...
3,0.0,it is about time for all illegals to go back t...
4,2.0,for starters bend over the one in pink and kic...


In [6]:
#Remove numbers 
dataset_df['text'] = clean_dataset_operator.remove_number(df=dataset_df[['text']])
dataset_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_with_numbers] = df[columns_with_numbers].applymap(lambda x: None if contains_numbers(x) else x)


Unnamed: 0,hatespeech,text
0,0.0,yes indeed. she sort of reminds me of the elde...
1,0.0,the trans women reading this tweet right now i...
2,2.0,
3,0.0,it is about time for all illegals to go back t...
4,2.0,for starters bend over the one in pink and kic...


In [7]:
dataset_df.dropna(inplace=True)

<h1> Feature Engineering <h1/>

In [8]:
from feature_selection.get_features import FeatureSelectDatasetOperator

feature_op = FeatureSelectDatasetOperator(
    cleaned_dataframe=dataset_df
)

help(FeatureSelectDatasetOperator)


Help on class FeatureSelectDatasetOperator in module feature_selection.get_features:

class FeatureSelectDatasetOperator(builtins.object)
 |  FeatureSelectDatasetOperator(cleaned_dataframe: pandas.core.frame.DataFrame, *args, **kwargs)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, cleaned_dataframe: pandas.core.frame.DataFrame, *args, **kwargs)
 |      This class wraps all feature selection logic under one shade.
 |      
 |      Args:   
 |          file_path (str) : Path to the text dataset file in csv format.
 |      Returns:
 |          feature_df (pandas.DataFrame) : gives data frame words to train the model.
 |      
 |      Added By : Sai Kumar Adulla
 |  
 |  create_bow_matrix(self, df: pandas.core.frame.DataFrame, input_col='text', output_col='bow_features', max_features=5000)
 |      The CountVectorizer is used to transform the input DataFrame and generate the BoW features
 |      
 |      Added By :  Christin Paul
 |  
 |  feature_creation(self, df: pandas.core.fram

In [9]:
dataset_df.dropna(inplace=True)

In [12]:
#Process of TF-IDF
feature_op.tfidf(df=dataset_df)

Unnamed: 0,abandon,abc,abeg,ability,able,abort,aborted,abortion,abortions,about,...,yta,yup,zany_face,zealand,zero,zindabad,zionist,zionists,zipper,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169751,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
feature_op.feature_creation(df=dataset_df)
dataset_df['hatespeech']

0         0.0
1         0.0
3         0.0
4         2.0
5         0.0
         ... 
135544    0.0
135547    1.0
135548    1.0
135549    0.0
135555    2.0
Name: hatespeech, Length: 117920, dtype: float64

<h1> Model Building and Training <h1/>

In [8]:
from models.model import TrainHateDetectionModel

model_op = TrainHateDetectionModel(
    dataset = dataset_df
)

help(model_op)

Help on TrainHateDetectionModel in module models.model object:

class TrainHateDetectionModel(builtins.object)
 |  TrainHateDetectionModel(dataset: pandas.core.frame.DataFrame, test_size=0.1)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, dataset: pandas.core.frame.DataFrame, test_size=0.1)
 |      This class trains multiple ML models for the 
 |      detecting hate speech and yields list of evalution metrics and prediction methods
 |      Args:
 |          dataset (pandas.DataFrame) - dataset having text and label fields.
 |      
 |      Added by : Abbas Ismail
 |  
 |  buildNavieBayesModel(self)
 |      Builds and Trains the Naive Bayes Model for HateDetection
 |      
 |      Added By :Abbas Ismail
 |  
 |  getTestPrecitions(self)
 |      Run the predcitions on the test sets
 |      
 |      Added By :Sai Kumar Adulla
 |  
 |  prepare_test_train_data(self)
 |      This method splits the dataset into train and test groups
 |      Returns:
 |          df : pandas.DataFrame - 

In [9]:
model_op.test_size

0.1

In [10]:
dataset_df['label'] = dataset_df['hatespeech']

model_op.prepare_test_train_data()

In [11]:
len(model_op.train_data), len(model_op.train_labels), 

(94336, 94336)

In [12]:
len(model_op.test_data), len(model_op.test_labels)

(23584, 23584)

In [13]:
#building the model
model_op.buildNavieBayesModel()

In [14]:
#Getting the precitions
model_op.getTestPrecitions()

In [15]:
#checking the accuracy meterics
model_op.yield_model_accuracy_metrics()

Accuracy: 0.73
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81     13791
         1.0       0.12      0.09      0.10      1558
         2.0       0.70      0.74      0.72      8235

    accuracy                           0.73     23584
   macro avg       0.54      0.54      0.54     23584
weighted avg       0.73      0.73      0.73     23584

Confusion Matrix:
 [[11124   662  2005]
 [  856   143   559]
 [ 1744   438  6053]]


From this metrics, we understand the model's performance and identify areas for improvement, such as addressing the low precision and recall for Class 1. 