In [1]:
import pandas as pd
import numpy as np
import spacy
import string
import re
import nltk
from collections import Counter

In [2]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_lg')
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

In [3]:
df = pd.read_csv('TrainingDS.csv')
df_sample = pd.read_csv('Sample Submission.csv')
df_test = pd.read_csv('TestingDS.csv')
df.head()

Unnamed: 0,ID,Text,Class
0,1,Trump supporters needed to say the 4 Democrats...,0
1,2,Send them back!!Why the hell are they even her...,1
2,3,Yeah...Im wondering if send them back works fo...,1
3,4,I know you realize you cant pretend that you d...,1
4,5,"Donny, you owe all people an apology for appla...",0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1500 non-null   int64 
 1   Text    1500 non-null   object
 2   Class   1500 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 35.3+ KB


In [5]:
df.describe()

Unnamed: 0,ID,Class
count,1500.0,1500.0
mean,750.5,0.386667
std,433.157015,0.487149
min,1.0,0.0
25%,375.75,0.0
50%,750.5,0.0
75%,1125.25,1.0
max,1500.0,1.0


In [6]:
# string to lower
df["text_new"] = df["Text"].str.lower()
df.head()

Unnamed: 0,ID,Text,Class,text_new
0,1,Trump supporters needed to say the 4 Democrats...,0,trump supporters needed to say the 4 democrats...
1,2,Send them back!!Why the hell are they even her...,1,send them back!!why the hell are they even her...
2,3,Yeah...Im wondering if send them back works fo...,1,yeah...im wondering if send them back works fo...
3,4,I know you realize you cant pretend that you d...,1,i know you realize you cant pretend that you d...
4,5,"Donny, you owe all people an apology for appla...",0,"donny, you owe all people an apology for appla..."


In [7]:
# add spaces after symbols
def setSpace(val):
    return re.sub(r'(\.+|\,+)', r'\1 ', val)

df["text_new"] = df["text_new"].apply(setSpace)
df.head()

Unnamed: 0,ID,Text,Class,text_new
0,1,Trump supporters needed to say the 4 Democrats...,0,trump supporters needed to say the 4 de...
1,2,Send them back!!Why the hell are they even her...,1,send them back! ! why the hell are they ...
2,3,Yeah...Im wondering if send them back works fo...,1,yeah. . . im wondering if send them back ...
3,4,I know you realize you cant pretend that you d...,1,i know you realize you cant pretend tha...
4,5,"Donny, you owe all people an apology for appla...",0,"donny, you owe all people an apology f..."


In [8]:
# remove punct
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_new"] = df["text_new"].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,ID,Text,Class,text_new
0,1,Trump supporters needed to say the 4 Democrats...,0,trump supporters needed to say the 4 de...
1,2,Send them back!!Why the hell are they even her...,1,send them back why the hell are they e...
2,3,Yeah...Im wondering if send them back works fo...,1,yeah im wondering if send them back wo...
3,4,I know you realize you cant pretend that you d...,1,i know you realize you cant pretend tha...
4,5,"Donny, you owe all people an apology for appla...",0,donny you owe all people an apology fo...


In [9]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["text_new"] = df["text_new"].apply(remove_stopwords)
df.head()

Unnamed: 0,ID,Text,Class,text_new
0,1,Trump supporters needed to say the 4 Democrats...,0,trump supporters needed 4 democrats socialist ...
1,2,Send them back!!Why the hell are they even her...,1,send hell im sick hearing greatest country gre...
2,3,Yeah...Im wondering if send them back works fo...,1,yeah im wondering send works wife 1 amp 3 ykno...
3,4,I know you realize you cant pretend that you d...,1,know realize cant pretend didnt know obama beg...
4,5,"Donny, you owe all people an apology for appla...",0,donny owe people apology applauding horrible c...


In [10]:
# Freq words
cnt = Counter()
for text in df["text_new"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('send', 1191),
 ('trump', 554),
 ('chant', 336),
 ('s', 306),
 ('racist', 241),
 ('t', 217),
 ('chants', 191),
 ('illegal', 183),
 ('rally', 177),
 ('people', 168)]

In [14]:
# Removal of Frequent words
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["text_new"] = df["text_new"].apply(remove_freqwords)
df.head()

Unnamed: 0,ID,Text,Class,text_new
0,1,Trump supporters needed to say the 4 Democrats...,0,supporters needed 4 democrats socialist squad ...
1,2,Send them back!!Why the hell are they even her...,1,hell im sick hearing greatest country greatest...
2,3,Yeah...Im wondering if send them back works fo...,1,yeah im wondering works wife 1 amp 3 yknow anc...
3,4,I know you realize you cant pretend that you d...,1,know realize cant pretend didnt know obama beg...
4,5,"Donny, you owe all people an apology for appla...",0,donny owe apology applauding horrible clap han...


In [15]:
# Removal of Rare words
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["text_new"] = df["text_new"].apply(remove_rarewords)
df.head()

Unnamed: 0,ID,Text,Class,text_new
0,1,Trump supporters needed to say the 4 Democrats...,0,supporters needed 4 democrats socialist squad ...
1,2,Send them back!!Why the hell are they even her...,1,hell im sick hearing greatest country greatest...
2,3,Yeah...Im wondering if send them back works fo...,1,yeah im wondering works wife 1 amp 3 yknow anc...
3,4,I know you realize you cant pretend that you d...,1,know realize cant pretend didnt know obama beg...
4,5,"Donny, you owe all people an apology for appla...",0,donny owe apology applauding horrible clap han...


In [21]:
# Lemmatization
def get_lemma(text):
    doc = nlp(text)
    return ' '.join([tok.lemma_ for tok in doc])
df["text_new"] = df["text_new"].apply(get_lemma)
df.head()

In [None]:
# Removal of Emojis
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)