In [1]:
import string
from collections import defaultdict
import operator
import gc
import pickle

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
stopword=set(stopwords.words('english'))
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

import xgboost as xgb

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfTransformer

import keras
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,SpatialDropout1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Sequential

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/javier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Index

1. [Importing Data](#Data)
2. [Data Analysis](../Analysis/train_data_analysis.ipynb)
3. [Embeddings and Text Cleaning](#Embeddings-and-Text-Cleaning)
4. [Preprocessing](#Preprocessing)
5. [Models](#Models)
6. [Real Test](#Real-Test)

# Data

All Kaggle dataframes are preprocessed and merged to create a single training set. This is done to improve the performance of the machine learning model by providing more data to train on.

## Table 1

In [14]:
def preprocess_hate1(df_hate1):
    # Rename the columns
    df_hate1 = df_hate1.rename(columns={'tweet': 'text', 'class': 'target'})
    df_hate1['id'] = df_hate1.index
    # Create the 'target' column
    df_hate1['target'] = df_hate1['hate_speech'].apply(lambda x: 1 if x >= 1 else 0)
    # Drop unnecessary columns
    df_hate1 = df_hate1.drop(['Unnamed: 0', 'offensive_language', 'hate_speech', 'count', 'neither'], axis=1)
    # Reorder the columns
    df_hate1 = df_hate1.reindex(columns=['id', 'text', 'target'])
    
    return df_hate1

In [15]:
df1 = pd.read_csv('../../kaggle/hate.csv')
df1

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24777,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24778,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24779,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24780,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [16]:
df1 = preprocess_hate1(df1)
df1

Unnamed: 0,id,text,target
0,0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0
2,2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0
3,3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0
4,4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0
...,...,...,...
24777,24777,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,0
24778,24778,"you've gone and broke the wrong heart baby, an...",0
24779,24779,young buck wanna eat!!.. dat nigguh like I ain...,0
24780,24780,youu got wild bitches tellin you lies,0


## Table 2

In [17]:
def preprocess_hate2(df_hate2):
    # Rename the columns
    df_hate2 = df_hate2.rename(columns={'comment': 'text', 'isHate': 'target'})
    df_hate2['id'] = df_hate2.index
    # Hate column to int
    df_hate2['target'] = df_hate2['target'].astype(int)
    # Reorder the columns
    df_hate2 = df_hate2.reindex(columns=['id', 'text', 'target'])
    
    return df_hate2

In [18]:
df2 = pd.read_csv('../../kaggle/hate2.csv', sep=';')
df2

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0
...,...,...
993,From the midnight sun where the hot springs blow,0.0
994,Don't say I'm not your type,0.0
995,And therefore never send to know for whom the...,0.0
996,And I can't stand another day,0.0


In [19]:
df2 = preprocess_hate2(df2)
df2

Unnamed: 0,id,text,target
0,0,You should know women's sports are a joke,1
1,1,You look like Sloth with deeper Down’s syndrome,1
2,2,You look like Russian and speak like Indian. B...,1
3,3,"Women deserve to be abused, I guess.",1
4,4,Women are made for making babies and cooking d...,1
...,...,...,...
993,993,From the midnight sun where the hot springs blow,0
994,994,Don't say I'm not your type,0
995,995,And therefore never send to know for whom the...,0
996,996,And I can't stand another day,0


## Tabla 3

In [20]:
def preprocess_hate3(df_hate3):
    columns = ['hatespeech', 'text']
    df_hate3 = df_hate3.loc[:, columns]
    df_hate3['id'] = df_hate3.index
    # Rename the columns
    df_hate3 = df_hate3.rename(columns={'hatespeech': 'target'})
    # Hate column to int
    df_hate3['target'] = df_hate3['target'].astype(int)
    # Format the 'target' column
    df_hate3['target'] = df_hate3['target'].apply(lambda x: 1 if x >= 1 else 0)
    # Reorder the columns
    df_hate3 = df_hate3.reindex(columns=['id', 'text', 'target'])
    
    return df_hate3

In [21]:
df3 = pd.read_csv('../../kaggle/hate3.csv')
df3

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,genocide,attack_defend,hatespeech,hate_speech_score,text,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,target_race_asian,target_race_black,target_race_latinx,target_race_middle_eastern,target_race_native_american,target_race_pacific_islander,target_race_white,target_race_other,target_race,target_religion_atheist,target_religion_buddhist,target_religion_christian,target_religion_hindu,target_religion_jewish,target_religion_mormon,target_religion_muslim,target_religion_other,target_religion,target_origin_immigrant,target_origin_migrant_worker,target_origin_specific_country,target_origin_undocumented,target_origin_other,target_origin,target_gender_men,target_gender_non_binary,target_gender_transgender_men,target_gender_transgender_unspecified,target_gender_transgender_women,target_gender_women,target_gender_other,target_gender,target_sexuality_bisexual,target_sexuality_gay,target_sexuality_lesbian,target_sexuality_straight,target_sexuality_other,target_sexuality,target_age_children,target_age_teenagers,target_age_young_adults,target_age_middle_aged,target_age_seniors,target_age_other,target_age,target_disability_physical,target_disability_cognitive,target_disability_neurological,target_disability_visually_impaired,target_disability_hearing_impaired,target_disability_unspecific,target_disability_other,target_disability,annotator_gender,annotator_trans,annotator_educ,annotator_income,annotator_ideology,annotator_gender_men,annotator_gender_women,annotator_gender_non_binary,annotator_gender_prefer_not_to_say,annotator_gender_self_describe,annotator_transgender,annotator_cisgender,annotator_transgender_prefer_not_to_say,annotator_education_some_high_school,annotator_education_high_school_grad,annotator_education_some_college,annotator_education_college_grad_aa,annotator_education_college_grad_ba,annotator_education_professional_degree,annotator_education_masters,annotator_education_phd,annotator_income_<10k,annotator_income_10k-50k,annotator_income_50k-100k,annotator_income_100k-200k,annotator_income_>200k,annotator_ideology_extremeley_conservative,annotator_ideology_conservative,annotator_ideology_slightly_conservative,annotator_ideology_neutral,annotator_ideology_slightly_liberal,annotator_ideology_liberal,annotator_ideology_extremeley_liberal,annotator_ideology_no_opinion,annotator_race_asian,annotator_race_black,annotator_race_latinx,annotator_race_middle_eastern,annotator_race_native_american,annotator_race_pacific_islander,annotator_race_white,annotator_race_other,annotator_age,annotator_religion_atheist,annotator_religion_buddhist,annotator_religion_christian,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,-3.90,Yes indeed. She sort of reminds me of the elde...,0.81,1.88,0.36,0.34,1.35,1.23,-1.130178,True,True,True,True,True,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,male,no,college_grad_ba,<10k,neutral,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,25.0,False,False,True,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,-6.52,The trans women reading this tweet right now i...,0.96,0.43,-0.35,1.00,0.57,0.42,-1.146973,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,female,no,some_college,<10k,neutral,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,30.0,False,False,True,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,0.0,4.0,2.0,0.36,Question: These 4 broads who criticize America...,0.90,0.74,-0.48,0.31,1.16,0.67,-0.207369,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,male,no,some_college,100k-200k,slightly_conservative,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,41.0,False,False,False,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,0.0,3.0,0.0,0.26,It is about time for all illegals to go back t...,0.81,0.68,0.86,0.29,1.18,0.87,-0.055536,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,male,no,high_school_grad,10k-50k,neutral,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,42.0,True,False,False,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,3.0,2.0,1.54,For starters bend over the one in pink and kic...,0.76,0.63,-0.54,0.51,0.95,1.00,0.104225,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,female,no,masters,10k-50k,neutral,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,27.0,False,False,True,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135551,37080,8590,2,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,-4.88,عاجل سماحة #السيد_عبدالملك_بدرالدين_الحوثي نص...,0.16,0.11,-1.07,0.69,0.98,0.57,-0.036250,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,female,no,college_grad_aa,10k-50k,no_opinion,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,56.0,False,False,True,False,False,False,False,False,False,False,False,True,False
135552,22986,8303,2,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,-4.40,Millions of #Yemen-is participated in mass ral...,1.30,0.43,-0.55,0.35,1.12,2.15,-0.974684,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,female,no,college_grad_ba,10k-50k,extremely_liberal,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,28.0,False,False,False,False,False,False,False,False,True,True,False,False,False
135553,21008,6207,2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-2.49,@AbeShinzo @realDonaldTrump @shinzoabe 独裁者は行きま...,1.11,1.29,0.16,0.37,1.46,1.05,-0.491960,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,female,no,some_college,10k-50k,liberal,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,47.0,False,False,True,False,False,False,False,False,False,False,False,True,False
135554,22986,7886,2,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,-4.40,Millions of #Yemen-is participated in mass ral...,1.30,0.43,-0.48,0.35,1.00,0.75,-0.974684,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,male,no,college_grad_aa,10k-50k,conservative,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,37.0,False,False,False,False,False,False,False,True,False,False,False,True,False


In [22]:
df3 = preprocess_hate3(df3)
df3

Unnamed: 0,id,text,target
0,0,Yes indeed. She sort of reminds me of the elde...,0
1,1,The trans women reading this tweet right now i...,0
2,2,Question: These 4 broads who criticize America...,1
3,3,It is about time for all illegals to go back t...,0
4,4,For starters bend over the one in pink and kic...,1
...,...,...,...
135551,135551,عاجل سماحة #السيد_عبدالملك_بدرالدين_الحوثي نص...,0
135552,135552,Millions of #Yemen-is participated in mass ral...,0
135553,135553,@AbeShinzo @realDonaldTrump @shinzoabe 独裁者は行きま...,0
135554,135554,Millions of #Yemen-is participated in mass ral...,0


## Table Join

In [23]:
data = pd.concat([df1, df2, df3])
data 

Unnamed: 0,id,text,target
0,0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0
2,2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0
3,3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0
4,4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0
...,...,...,...
135551,135551,عاجل سماحة #السيد_عبدالملك_بدرالدين_الحوثي نص...,0
135552,135552,Millions of #Yemen-is participated in mass ral...,0
135553,135553,@AbeShinzo @realDonaldTrump @shinzoabe 独裁者は行きま...,0
135554,135554,Millions of #Yemen-is participated in mass ral...,0


In [24]:
data.isnull().sum()

id        0
text      0
target    0
dtype: int64

# Embeddings and Text Cleaning
After the [analysis](./analisisDatosTrain.ipynb) has been performed, the next step is to start the embeddings and text cleaning stage.

## Embeddings

The embeddings stage will create vector representations of the words in the dataset. This will allow us to measure the similarity between words and phrases, which will be useful for tasks such as text classification and hate analysis.

In [32]:
%%time

glove_embeddings = np.load('../../embeddings/glove.840B.300d.pkl', allow_pickle=True)
fasttext_embeddings = np.load('../../embeddings/crawl-300d-2M.pkl', allow_pickle=True)

CPU times: user 5.14 s, sys: 3.1 s, total: 8.24 s
Wall time: 10.9 s


In [33]:
def build_vocab(X):
    
    tweets = X.apply(lambda s: s.split()).values      
    vocab = {}
    
    for tweet in tweets:
        for word in tweet:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1                
    return vocab


def check_embeddings_coverage(X, embeddings):
    
    vocab = build_vocab(X)    
    
    covered = {}
    oov = {}    
    n_covered = 0
    n_oov = 0
    
    for word in vocab:
        try:
            covered[word] = embeddings[word]
            n_covered += vocab[word]
        except:
            oov[word] = vocab[word]
            n_oov += vocab[word]
            
    vocab_coverage = len(covered) / len(vocab)
    text_coverage = (n_covered / (n_covered + n_oov))
    
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_oov, vocab_coverage, text_coverage

train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(data['text'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_glove_vocab_coverage, train_glove_text_coverage))

train_fasttext_oov, train_fasttext_vocab_coverage, train_fasttext_text_coverage = check_embeddings_coverage(data['text'], fasttext_embeddings)
print('FastText Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_fasttext_vocab_coverage, train_fasttext_text_coverage))

GloVe Embeddings cover 36.14% of vocabulary and 89.31% of text in Training Set
FastText Embeddings cover 38.46% of vocabulary and 90.51% of text in Training Set


## Text Cleaning

The `clean(text)` function takes a text as input and performs a series of transformations to clean and normalize the text. Here is a brief explanation of what each step does:

* Converts the text to lowercase.
* Removes special characters and replaces specific characters with others.
* Replaces common contractions with their full forms.
* Replaces character entity references such as ">" and "<".
* Corrects typos, slang, and informal abbreviations.
* Replaces hashtags and social media usernames with their full forms or descriptions.
* Cleans and normalizes words related to natural disasters, current events, and other specific words.
* Performs additional corrections and normalizations in the text.
* Removes URLs starting with "http://" or "https://" followed by alphanumeric characters.
* Replaces each punctuation and special character with a space and the punctuation/special character itself.
* Replaces specific acronyms with their expanded forms or related terms.
* Removes Tags, Links, and Punctuation.
* Does Stopword Removal and Stemming

In [2]:
def clean(text): 
    text = str(text).lower()

    # Special characters
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)
    text = re.sub(r"\x89ÛÏWhen", "When", text)
    text = re.sub(r"\x89ÛÏ", "", text)
    text = re.sub(r"China\x89Ûªs", "China's", text)
    text = re.sub(r"let\x89Ûªs", "let's", text)
    text = re.sub(r"\x89Û÷", "", text)
    text = re.sub(r"\x89Ûª", "", text)
    text = re.sub(r"\x89Û\x9d", "", text)
    text = re.sub(r"å_", "", text)
    text = re.sub(r"\x89Û¢", "", text)
    text = re.sub(r"\x89Û¢åÊ", "", text)
    text = re.sub(r"fromåÊwounds", "from wounds", text)
    text = re.sub(r"åÊ", "", text)
    text = re.sub(r"åÈ", "", text)
    text = re.sub(r"JapÌ_n", "Japan", text)    
    text = re.sub(r"Ì©", "e", text)
    text = re.sub(r"å¨", "", text)
    text = re.sub(r"SuruÌ¤", "Suruc", text)
    text = re.sub(r"åÇ", "", text)
    text = re.sub(r"å£3million", "3 million", text)
    text = re.sub(r"åÀ", "", text)
    
    # Contractions
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"We're", "We are", text)
    text = re.sub(r"That's", "That is", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"they're", "they are", text)
    text = re.sub(r"Can't", "Cannot", text)
    text = re.sub(r"wasn't", "was not", text)
    text = re.sub(r"don\x89Ûªt", "do not", text)
    text = re.sub(r"aren't", "are not", text)
    text = re.sub(r"isn't", "is not", text)
    text = re.sub(r"What's", "What is", text)
    text = re.sub(r"haven't", "have not", text)
    text = re.sub(r"hasn't", "has not", text)
    text = re.sub(r"There's", "There is", text)
    text = re.sub(r"He's", "He is", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"You're", "You are", text)
    text = re.sub(r"I'M", "I am", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"wouldn't", "would not", text)
    text = re.sub(r"i'm", "I am", text)
    text = re.sub(r"I\x89Ûªm", "I am", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"Isn't", "is not", text)
    text = re.sub(r"Here's", "Here is", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"you\x89Ûªve", "you have", text)
    text = re.sub(r"we're", "we are", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"we've", "we have", text)
    text = re.sub(r"it\x89Ûªs", "it is", text)
    text = re.sub(r"doesn\x89Ûªt", "does not", text)
    text = re.sub(r"It\x89Ûªs", "It is", text)
    text = re.sub(r"Here\x89Ûªs", "Here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"I\x89Ûªve", "I have", text)
    text = re.sub(r"y'all", "you all", text)
    text = re.sub(r"can\x89Ûªt", "cannot", text)
    text = re.sub(r"would've", "would have", text)
    text = re.sub(r"it'll", "it will", text)
    text = re.sub(r"we'll", "we will", text)
    text = re.sub(r"wouldn\x89Ûªt", "would not", text)
    text = re.sub(r"We've", "We have", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"Y'all", "You all", text)
    text = re.sub(r"Weren't", "Were not", text)
    text = re.sub(r"Didn't", "Did not", text)
    text = re.sub(r"they'll", "they will", text)
    text = re.sub(r"they'd", "they would", text)
    text = re.sub(r"DON'T", "DO NOT", text)
    text = re.sub(r"That\x89Ûªs", "That is", text)
    text = re.sub(r"they've", "they have", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"should've", "should have", text)
    text = re.sub(r"You\x89Ûªre", "You are", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"Don\x89Ûªt", "Do not", text)
    text = re.sub(r"we'd", "we would", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"They're", "They are", text)
    text = re.sub(r"Can\x89Ûªt", "Cannot", text)
    text = re.sub(r"you\x89Ûªll", "you will", text)
    text = re.sub(r"I\x89Ûªd", "I would", text)
    text = re.sub(r"let's", "let us", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"you're", "you are", text)
    text = re.sub(r"i've", "I have", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"didn't", "did not", text)
    text = re.sub(r"ain't", "am not", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"I've", "I have", text)
    text = re.sub(r"Don't", "do not", text)
    text = re.sub(r"I'll", "I will", text)
    text = re.sub(r"I'd", "I would", text)
    text = re.sub(r"Let's", "Let us", text)
    text = re.sub(r"you'd", "You would", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"Ain't", "am not", text)
    text = re.sub(r"Haven't", "Have not", text)
    text = re.sub(r"Could've", "Could have", text)
    text = re.sub(r"youve", "you have", text)  
    text = re.sub(r"donå«t", "do not", text)   
            
    # Character entity references
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&amp;", "&", text)
    
    # Typos, slang and informal abbreviations
    text = re.sub(r"w/e", "whatever", text)
    text = re.sub(r"w/", "with", text)
    text = re.sub(r"USAgov", "USA government", text)
    text = re.sub(r"recentlu", "recently", text)
    text = re.sub(r"Ph0tos", "Photos", text)
    text = re.sub(r"amirite", "am I right", text)
    text = re.sub(r"exp0sed", "exposed", text)
    text = re.sub(r"<3", "love", text)
    text = re.sub(r"amageddon", "armageddon", text)
    text = re.sub(r"Trfc", "Traffic", text)
    text = re.sub(r"8/5/2015", "2015-08-05", text)
    text = re.sub(r"WindStorm", "Wind Storm", text)
    text = re.sub(r"8/6/2015", "2015-08-06", text)
    text = re.sub(r"10:38PM", "10:38 PM", text)
    text = re.sub(r"10:30pm", "10:30 PM", text)
    text = re.sub(r"16yr", "16 year", text)
    text = re.sub(r"lmao", "laughing my ass off", text)   
    text = re.sub(r"TRAUMATISED", "traumatized", text)
    
    # Hashtags and usernames
    text = re.sub(r"IranDeal", "Iran Deal", text)
    text = re.sub(r"ArianaGrande", "Ariana Grande", text)
    text = re.sub(r"camilacabello97", "camila cabello", text) 
    text = re.sub(r"RondaRousey", "Ronda Rousey", text)     
    text = re.sub(r"MTVHottest", "MTV Hottest", text)
    text = re.sub(r"TrapMusic", "Trap Music", text)
    text = re.sub(r"ProphetMuhammad", "Prophet Muhammad", text)
    text = re.sub(r"PantherAttack", "Panther Attack", text)
    text = re.sub(r"StrategicPatience", "Strategic Patience", text)
    text = re.sub(r"socialnews", "social news", text)
    text = re.sub(r"NASAHurricane", "NASA Hurricane", text)
    text = re.sub(r"onlinecommunities", "online communities", text)
    text = re.sub(r"humanconsumption", "human consumption", text)
    text = re.sub(r"Typhoon-Devastated", "Typhoon Devastated", text)
    text = re.sub(r"Meat-Loving", "Meat Loving", text)
    text = re.sub(r"facialabuse", "facial abuse", text)
    text = re.sub(r"LakeCounty", "Lake County", text)
    text = re.sub(r"BeingAuthor", "Being Author", text)
    text = re.sub(r"withheavenly", "with heavenly", text)
    text = re.sub(r"thankU", "thank you", text)
    text = re.sub(r"iTunesMusic", "iTunes Music", text)
    text = re.sub(r"OffensiveContent", "Offensive Content", text)
    text = re.sub(r"WorstSummerJob", "Worst Summer Job", text)
    text = re.sub(r"HarryBeCareful", "Harry Be Careful", text)
    text = re.sub(r"NASASolarSystem", "NASA Solar System", text)
    text = re.sub(r"animalrescue", "animal rescue", text)
    text = re.sub(r"KurtSchlichter", "Kurt Schlichter", text)
    text = re.sub(r"aRmageddon", "armageddon", text)
    text = re.sub(r"Throwingknifes", "Throwing knives", text)
    text = re.sub(r"GodsLove", "God's Love", text)
    text = re.sub(r"bookboost", "book boost", text)
    text = re.sub(r"ibooklove", "I book love", text)
    text = re.sub(r"NestleIndia", "Nestle India", text)
    text = re.sub(r"realDonaldTrump", "Donald Trump", text)
    text = re.sub(r"DavidVonderhaar", "David Vonderhaar", text)
    text = re.sub(r"CecilTheLion", "Cecil The Lion", text)
    text = re.sub(r"weathernetwork", "weather network", text)
    text = re.sub(r"withBioterrorism&use", "with Bioterrorism & use", text)
    text = re.sub(r"Hostage&2", "Hostage & 2", text)
    text = re.sub(r"GOPDebate", "GOP Debate", text)
    text = re.sub(r"RickPerry", "Rick Perry", text)
    text = re.sub(r"frontpage", "front page", text)
    text = re.sub(r"NewsIntexts", "News In texts", text)
    text = re.sub(r"ViralSpell", "Viral Spell", text)
    text = re.sub(r"til_now", "until now", text)
    text = re.sub(r"volcanoinRussia", "volcano in Russia", text)
    text = re.sub(r"ZippedNews", "Zipped News", text)
    text = re.sub(r"MicheleBachman", "Michele Bachman", text)
    text = re.sub(r"53inch", "53 inch", text)
    text = re.sub(r"KerrickTrial", "Kerrick Trial", text)
    text = re.sub(r"abstorm", "Alberta Storm", text)
    text = re.sub(r"Beyhive", "Beyonce hive", text)
    text = re.sub(r"IDFire", "Idaho Fire", text)
    text = re.sub(r"DETECTADO", "Detected", text)
    text = re.sub(r"RockyFire", "Rocky Fire", text)
    text = re.sub(r"Listen/Buy", "Listen / Buy", text)
    text = re.sub(r"NickCannon", "Nick Cannon", text)
    text = re.sub(r"FaroeIslands", "Faroe Islands", text)
    text = re.sub(r"yycstorm", "Calgary Storm", text)
    text = re.sub(r"IDPs:", "Internally Displaced People :", text)
    text = re.sub(r"ArtistsUnited", "Artists United", text)
    text = re.sub(r"ClaytonBryant", "Clayton Bryant", text)
    text = re.sub(r"jimmyfallon", "jimmy fallon", text)
    text = re.sub(r"justinbieber", "justin bieber", text)  
    text = re.sub(r"UTC2015", "UTC 2015", text)
    text = re.sub(r"Time2015", "Time 2015", text)
    text = re.sub(r"djicemoon", "dj icemoon", text)
    text = re.sub(r"LivingSafely", "Living Safely", text)
    text = re.sub(r"FIFA16", "Fifa 2016", text)
    text = re.sub(r"thisiswhywecanthavenicethings", "this is why we cannot have nice things", text)
    text = re.sub(r"bbcnews", "bbc news", text)
    text = re.sub(r"UndergroundRailraod", "Underground Railraod", text)
    text = re.sub(r"c4news", "c4 news", text)
    text = re.sub(r"OBLITERATION", "obliteration", text)
    text = re.sub(r"MUDSLIDE", "mudslide", text)
    text = re.sub(r"NoSurrender", "No Surrender", text)
    text = re.sub(r"NotExplained", "Not Explained", text)
    text = re.sub(r"greatbritishbakeoff", "great british bake off", text)
    text = re.sub(r"LondonFire", "London Fire", text)
    text = re.sub(r"KOTAWeather", "KOTA Weather", text)
    text = re.sub(r"LuchaUnderground", "Lucha Underground", text)
    text = re.sub(r"KOIN6News", "KOIN 6 News", text)
    text = re.sub(r"LiveOnK2", "Live On K2", text)
    text = re.sub(r"9NewsGoldCoast", "9 News Gold Coast", text)
    text = re.sub(r"nikeplus", "nike plus", text)
    text = re.sub(r"david_cameron", "David Cameron", text)
    text = re.sub(r"peterjukes", "Peter Jukes", text)
    text = re.sub(r"JamesMelville", "James Melville", text)
    text = re.sub(r"megynkelly", "Megyn Kelly", text)
    text = re.sub(r"cnewslive", "C News Live", text)
    text = re.sub(r"JamaicaObserver", "Jamaica Observer", text)
    text = re.sub(r"textLikeItsSeptember11th2001", "text like it is september 11th 2001", text)
    text = re.sub(r"cbplawyers", "cbp lawyers", text)
    text = re.sub(r"fewmoretexts", "few more texts", text)
    text = re.sub(r"BlackLivesMatter", "Black Lives Matter", text)
    text = re.sub(r"cjoyner", "Chris Joyner", text)
    text = re.sub(r"ENGvAUS", "England vs Australia", text)
    text = re.sub(r"ScottWalker", "Scott Walker", text)
    text = re.sub(r"MikeParrActor", "Michael Parr", text)
    text = re.sub(r"4PlayThursdays", "Foreplay Thursdays", text)
    text = re.sub(r"TGF2015", "Tontitown Grape Festival", text)
    text = re.sub(r"realmandyrain", "Mandy Rain", text)
    text = re.sub(r"GraysonDolan", "Grayson Dolan", text)
    text = re.sub(r"ApolloBrown", "Apollo Brown", text)
    text = re.sub(r"saddlebrooke", "Saddlebrooke", text)
    text = re.sub(r"TontitownGrape", "Tontitown Grape", text)
    text = re.sub(r"AbbsWinston", "Abbs Winston", text)
    text = re.sub(r"ShaunKing", "Shaun King", text)
    text = re.sub(r"MeekMill", "Meek Mill", text)
    text = re.sub(r"TornadoGiveaway", "Tornado Giveaway", text)
    text = re.sub(r"GRupdates", "GR updates", text)
    text = re.sub(r"SouthDowns", "South Downs", text)
    text = re.sub(r"braininjury", "brain injury", text)
    text = re.sub(r"auspol", "Australian politics", text)
    text = re.sub(r"PlannedParenthood", "Planned Parenthood", text)
    text = re.sub(r"calgaryweather", "Calgary Weather", text)
    text = re.sub(r"weallheartonedirection", "we all heart one direction", text)
    text = re.sub(r"edsheeran", "Ed Sheeran", text)
    text = re.sub(r"TrueHeroes", "True Heroes", text)
    text = re.sub(r"S3XLEAK", "sex leak", text)
    text = re.sub(r"ComplexMag", "Complex Magazine", text)
    text = re.sub(r"TheAdvocateMag", "The Advocate Magazine", text)
    text = re.sub(r"CityofCalgary", "City of Calgary", text)
    text = re.sub(r"EbolaOutbreak", "Ebola Outbreak", text)
    text = re.sub(r"SummerFate", "Summer Fate", text)
    text = re.sub(r"RAmag", "Royal Academy Magazine", text)
    text = re.sub(r"offers2go", "offers to go", text)
    text = re.sub(r"foodscare", "food scare", text)
    text = re.sub(r"MNPDNashville", "Metropolitan Nashville Police Department", text)
    text = re.sub(r"TfLBusAlerts", "TfL Bus Alerts", text)
    text = re.sub(r"GamerGate", "Gamer Gate", text)
    text = re.sub(r"IHHen", "Humanitarian Relief", text)
    text = re.sub(r"spinningbot", "spinning bot", text)
    text = re.sub(r"ModiMinistry", "Modi Ministry", text)
    text = re.sub(r"TAXIWAYS", "taxi ways", text)
    text = re.sub(r"Calum5SOS", "Calum Hood", text)
    text = re.sub(r"po_st", "po.st", text)
    text = re.sub(r"scoopit", "scoop.it", text)
    text = re.sub(r"UltimaLucha", "Ultima Lucha", text)
    text = re.sub(r"JonathanFerrell", "Jonathan Ferrell", text)
    text = re.sub(r"aria_ahrary", "Aria Ahrary", text)
    text = re.sub(r"rapidcity", "Rapid City", text)
    text = re.sub(r"OutBid", "outbid", text)
    text = re.sub(r"lavenderpoetrycafe", "lavender poetry cafe", text)
    text = re.sub(r"EudryLantiqua", "Eudry Lantiqua", text)
    text = re.sub(r"15PM", "15 PM", text)
    text = re.sub(r"OriginalFunko", "Funko", text)
    text = re.sub(r"rightwaystan", "Richard Tan", text)
    text = re.sub(r"CindyNoonan", "Cindy Noonan", text)
    text = re.sub(r"RT_America", "RT America", text)
    text = re.sub(r"narendramodi", "Narendra Modi", text)
    text = re.sub(r"BakeOffFriends", "Bake Off Friends", text)
    text = re.sub(r"TeamHendrick", "Hendrick Motorsports", text)
    text = re.sub(r"alexbelloli", "Alex Belloli", text)
    text = re.sub(r"itsjustinstuart", "Justin Stuart", text)
    text = re.sub(r"gunsense", "gun sense", text)
    text = re.sub(r"DebateQuestionsWeWantToHear", "debate questions we want to hear", text)
    text = re.sub(r"RoyalCarribean", "Royal Carribean", text)
    text = re.sub(r"samanthaturne19", "Samantha Turner", text)
    text = re.sub(r"JonVoyage", "Jon Stewart", text)
    text = re.sub(r"renew911health", "renew 911 health", text)
    text = re.sub(r"SuryaRay", "Surya Ray", text)
    text = re.sub(r"pattonoswalt", "Patton Oswalt", text)
    text = re.sub(r"minhazmerchant", "Minhaz Merchant", text)
    text = re.sub(r"TLVFaces", "Israel Diaspora Coalition", text)
    text = re.sub(r"pmarca", "Marc Andreessen", text)
    text = re.sub(r"pdx911", "Portland Police", text)
    text = re.sub(r"jamaicaplain", "Jamaica Plain", text)
    text = re.sub(r"Japton", "Arkansas", text)
    text = re.sub(r"RouteComplex", "Route Complex", text)
    text = re.sub(r"INSubcontinent", "Indian Subcontinent", text)
    text = re.sub(r"NJTurnpike", "New Jersey Turnpike", text)
    text = re.sub(r"Politifiact", "PolitiFact", text)
    text = re.sub(r"Hiroshima70", "Hiroshima", text)
    text = re.sub(r"GMMBC", "Greater Mt Moriah Baptist Church", text)
    text = re.sub(r"versethe", "verse the", text)
    text = re.sub(r"TubeStrike", "Tube Strike", text)
    text = re.sub(r"MissionHills", "Mission Hills", text)
    text = re.sub(r"ProtectDenaliWolves", "Protect Denali Wolves", text)
    text = re.sub(r"NANKANA", "Nankana", text)
    text = re.sub(r"SAHIB", "Sahib", text)
    text = re.sub(r"PAKPATTAN", "Pakpattan", text)
    text = re.sub(r"Newz_Sacramento", "News Sacramento", text)
    text = re.sub(r"gofundme", "go fund me", text)
    text = re.sub(r"pmharper", "Stephen Harper", text)
    text = re.sub(r"IvanBerroa", "Ivan Berroa", text)
    text = re.sub(r"LosDelSonido", "Los Del Sonido", text)
    text = re.sub(r"bancodeseries", "banco de series", text)
    text = re.sub(r"timkaine", "Tim Kaine", text)
    text = re.sub(r"IdentityTheft", "Identity Theft", text)
    text = re.sub(r"AllLivesMatter", "All Lives Matter", text)
    text = re.sub(r"mishacollins", "Misha Collins", text)
    text = re.sub(r"BillNeelyNBC", "Bill Neely", text)
    text = re.sub(r"BeClearOnCancer", "be clear on cancer", text)
    text = re.sub(r"Kowing", "Knowing", text)
    text = re.sub(r"ScreamQueens", "Scream Queens", text)
    text = re.sub(r"AskCharley", "Ask Charley", text)
    
    # Urls
    text = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", text)
        
    # Words with punctuations and special characters
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    for p in punctuations:
        text = text.replace(p, f' {p} ')
        
    # ... and ..
    text = text.replace('...', ' ... ')
    if '...' not in text:
        text = text.replace('..', ' ... ')      
        
    # Acronyms
    text = re.sub(r"MH370", "Malaysia Airlines Flight 370", text)
    text = re.sub(r"mÌ¼sica", "music", text)
    text = re.sub(r"okwx", "Oklahoma City Weather", text)
    text = re.sub(r"arwx", "Arkansas Weather", text)    
    text = re.sub(r"gawx", "Georgia Weather", text)  
    text = re.sub(r"scwx", "South Carolina Weather", text)  
    text = re.sub(r"cawx", "California Weather", text)
    text = re.sub(r"tnwx", "Tennessee Weather", text)
    text = re.sub(r"azwx", "Arizona Weather", text)  
    text = re.sub(r"alwx", "Alabama Weather", text)
    text = re.sub(r"wordpressdotcom", "wordpress", text)    
    text = re.sub(r"usNWSgov", "United States National Weather Service", text)
    text = re.sub(r"Suruc", "Sanliurfa", text)   
    
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    
    return text

Measures the improvement in GloVe and FastText embeddings coverage after applying the clean function to the 'text' column of the 'data' DataFrame. 

In [35]:
%%time

data['text'] = data['text'].apply(clean)

train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(data['text'], glove_embeddings)
print('GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_glove_vocab_coverage, train_glove_text_coverage))

train_fasttext_oov, train_fasttext_vocab_coverage, train_fasttext_text_coverage = check_embeddings_coverage(data['text'], fasttext_embeddings)
print('FastText Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_fasttext_vocab_coverage, train_fasttext_text_coverage))

GloVe Embeddings cover 40.77% of vocabulary and 92.72% of text in Training Set
FastText Embeddings cover 40.72% of vocabulary and 92.72% of text in Training Set
CPU times: user 30.6 s, sys: 0 ns, total: 30.6 s
Wall time: 30.6 s


After cleaning the tweets, glove_embeddings and fasttext_embeddings are deleted and garbage collected because they consume too much memory

In [36]:
del glove_embeddings, fasttext_embeddings, train_glove_oov, train_fasttext_oov
gc.collect()

0

# Preprocessing

Assigns the 'text' column to the variable X_data and the 'target' column to the variable y_data.

In [37]:
data = data.reindex(columns=['id', 'text', 'target'])

X_data = data['text']
y_data = data['target']

X_data

0                rt   mayasolov   woman complain clean h...
1                    rt      boy dat cold      tyga dwn ...
2                        rt   urkindofbrand dawg        ...
3                            rt   cganderson     vivabas...
4                                    rt   shenikarobert ...
                                ...                        
135551    عاجل سماحة   السيدعبدالملكبدرالدينالحوثي  نصره...
135552    million   yemen  particip mass ralli  various ...
135553      abeshinzo   realdonaldtrump   shinzoab 独裁者は行...
135554    million   yemen  particip mass ralli  various ...
135555    لا تتشمت الرجال مسكين يعاني كس امه 😂   يقول يا...
Name: text, Length: 161336, dtype: object

Data is divided in train and test

In [38]:
x_train, x_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

121002 121002
40334 40334


## Count Vectorizer and Stop Words

The **CountVectorizer** operates on each individual text and performs the following steps:

* **Tokenization**: It splits each document into individual words or terms, which are also referred to as tokens. 
* **Counting**: It counts the occurrence of each token in each document and creates a matrix where columns represent unique tokens.
* **Vectorization**: It assigns a numerical value (count) to each token in each document, indicating how many times the token appears in that document.
* **Vocabulary** **Creation**: It builds a vocabulary of unique tokens based on the training data. Each token corresponds to a specific column in the matrix.
* **Transforming** **Test** **Data**: When applied to test data, the CountVectorizer uses the learned vocabulary from the training data and creates the matrix of token counts using the same columns as in the training matrix.

**Stop words** are a set of commonly used words in a language that are considered insignificant and are often removed during text preprocessing or natural language processing tasks. These words are filtered out because they typically do not carry much meaning or contribute significantly to the overall understanding of the text. Examples of stop words in English include "the," "is," "and," "a," "an," and so on.

In [39]:
count = CountVectorizer(stop_words='english', ngram_range=(1,5))
x_train_vectorizer=count.fit_transform(x_train)
x_test_vectorizer=count.transform(x_test)

In [40]:
count.vocabulary_

{'rt': 1229132,
 'ohuhwhoop': 1028306,
 'staci': 1369636,
 'mom': 949836,
 'shit': 1304053,
 'twitter': 1514573,
 'bitch': 143946,
 'rt ohuhwhoop': 1242531,
 'ohuhwhoop staci': 1028307,
 'staci mom': 1369641,
 'mom shit': 950773,
 'shit twitter': 1309645,
 'twitter bitch': 1514680,
 'rt ohuhwhoop staci': 1242532,
 'ohuhwhoop staci mom': 1028308,
 'staci mom shit': 1369642,
 'mom shit twitter': 950774,
 'shit twitter bitch': 1309646,
 'rt ohuhwhoop staci mom': 1242533,
 'ohuhwhoop staci mom shit': 1028309,
 'staci mom shit twitter': 1369643,
 'mom shit twitter bitch': 950775,
 'rt ohuhwhoop staci mom shit': 1242534,
 'ohuhwhoop staci mom shit twitter': 1028310,
 'staci mom shit twitter bitch': 1369644,
 'cool': 296359,
 'fact': 457551,
 'think': 1451489,
 'bi': 133632,
 'guy': 602031,
 'girl': 557399,
 'best': 126958,
 'cool fact': 296554,
 'fact think': 458906,
 'think bi': 1452152,
 'bi guy': 134025,
 'guy girl': 603404,
 'girl best': 557788,
 'cool fact think': 296555,
 'fact think b

# Models

* [**XGBoost**](##XGBoost)
* [**Keras**](##Keras)

## XGBoost

XGBoost (Extreme Gradient Boosting) is a powerful machine learning algorithm used for classification and regression tasks. It combines multiple weak models, typically decision trees, to create a stronger and more accurate model.

Despite the intention to employ a neural network for text classification, an XGBoost model has been trained to evaluate its performance and compare it with the results of the neural network.

### TF-IDF

TF-IDF is a commonly used technique in Natural Language Processing (NLP) to quantify the importance of words in a document within a collection of documents. <br>
* **Term Frequency (TF):** It calculates the frequency of each word in a document. This step gives a measure of how often a word occurs in a document.
Inverse Document Frequency (IDF): It calculates the importance of each word in the entire collection of documents. This step penalizes words that occur frequently across all documents and rewards words that are unique to a particular document.<br><br>
* **Inverse Document Frequency (IDF):** It calculates the importance of each word in the entire collection of documents. This step penalizes words that occur frequently across all documents and rewards words that are unique to a particular document.

In [32]:
tfidf = TfidfTransformer()

x_train_tfidf = tfidf.fit_transform(x_train_vectorizer)

x_test_tfidf = tfidf.transform(x_test_vectorizer)


XGBoost Training

In [33]:
xgb_model=xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc' )

### XGBoost with vectorizer data

In [34]:
xgb_model_vectorizer = xgb_model.fit(x_train_vectorizer, y_train)
xgb_predictions_vectorizer=xgb_model_vectorizer.predict(x_test_vectorizer)
print(confusion_matrix(y_test,xgb_predictions_vectorizer))
print (classification_report(y_test, xgb_predictions_vectorizer))

[[4814  138]
 [ 881  363]]
              precision    recall  f1-score   support

           0       0.85      0.97      0.90      4952
           1       0.72      0.29      0.42      1244

    accuracy                           0.84      6196
   macro avg       0.78      0.63      0.66      6196
weighted avg       0.82      0.84      0.81      6196



### XGBoost with tf-idf data

In [35]:
xgb_model = xgb_model.fit(x_train_tfidf, y_train)
xgb_predictions=xgb_model.predict(x_test_tfidf)
print(confusion_matrix(y_test,xgb_predictions))
print (classification_report(y_test, xgb_predictions))

[[4751  201]
 [ 839  405]]
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      4952
           1       0.67      0.33      0.44      1244

    accuracy                           0.83      6196
   macro avg       0.76      0.64      0.67      6196
weighted avg       0.81      0.83      0.81      6196



## Keras

Text data is being processed using the Keras library.

In [41]:
max_words = 50000
max_len = 300
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

## Arquitecture

* **Embedding Layer:** This layer converts input words into dense vectors of fixed length. The vocabulary size is set to 50,000 words, and each word is represented as a 100-dimensional vector. The input size of this layer is a maximum length of 300 words.

* **Spatial Dropout1D Layer:** This layer applies dropout to prevent overfitting. It randomly drops out entire channels (feature maps) instead of individual neurons. In this case, 20% of the outputs from the Embedding layer are randomly set to 0.

* **LSTM Layer:** This layer utilizes LSTM (Long Short-Term Memory) units to model the sequence of words in the text. Each LSTM unit has 100 memory cells and can capture long-term patterns in sequential data. The LSTM layer also applies dropout with a rate of 20% on the recurrent connections to prevent overfitting.

* **Dense Layer:** This is the output layer of the model, consisting of a single neuron with a sigmoid activation function. It produces an output between 0 and 1, representing the probability of the text instance belonging to a particular class (e.g., positive or negative).

In [42]:
model = Sequential()
model.add(Embedding(max_words, 100, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          5000000   
                                                                 
 spatial_dropout1d (Spatial  (None, 300, 100)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5080501 (19.38 MB)
Trainable params: 5080501 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Training

**EarlyStopping:**
* monitor='val_accuracy': It monitors the validation accuracy during training.
* mode='max': It maximizes the monitored metric (validation accuracy in this case).
* patience=5: It specifies the number of epochs to wait before stopping the training process if the monitored metric doesn't improve.

<br>

**ModelCheckpoint:**
* filepath='./keras': It specifies the path and filename to save the model weights.
* save_weights_only=True: It indicates that only the weights of the best model will be saved, not the entire model.
* monitor='val_accuracy': It monitors the validation accuracy during training.
* mode='max': It maximizes the monitored metric (validation accuracy in this case).
* save_best_only=True: It saves only the weights of the best model based on the monitored metric.


In [43]:
stop = EarlyStopping(
    monitor='val_accuracy', 
    mode='max',
    patience=5
)

checkpoint= ModelCheckpoint(
    filepath='./keras',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [44]:
history=model.fit(sequences_matrix,y_train,batch_size=1024,epochs=10,
            validation_split=0.2,callbacks=[stop,checkpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluation

In [45]:
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [46]:
accr = model.evaluate(test_sequences_matrix,y_test)



In [47]:
lstm_prediction=model.predict(test_sequences_matrix)



In [48]:
res=[]
for prediction in lstm_prediction:
    if prediction[0]<0.5:
        res.append(0)
    else:
        res.append(1)

In [49]:
print(confusion_matrix(y_test,res))

[[22267  2931]
 [ 4941 10195]]


In [50]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [54]:
model.save("hate_model.h5")

# Real Test

In [4]:
load_model=keras.models.load_model("./hate_model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    load_tokenizer = pickle.load(handle)

2023-07-18 16:58:57.968381: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-07-18 16:58:57.968407: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-07-18 16:58:57.968412: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-07-18 16:58:57.968445: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-07-18 16:58:57.968461: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)






In [5]:
test = 'I hate you idiot'

test=[clean(test)]
print(test)
seq = load_tokenizer.texts_to_sequences(test)
padded = sequence.pad_sequences(seq, maxlen=300)
print(seq)
pred = load_model.predict(padded)
print("pred", pred)
if pred<0.3:
    print("no hate")
else:
    print("hate")

['hate idiot']
[[12, 172]]


2023-07-18 16:59:00.346804: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


pred [[0.37440547]]
hate
