In [1]:
import pandas as pd
import numpy as np
import spacy
import string
import re
import nltk
from collections import Counter
from spellchecker import SpellChecker
from defines import *

In [2]:
# ASCII punctuation characters taken from the standard `string` module.
# NOTE(review): `punctuations` is not referenced by any other cell visible here.
punctuations = string.punctuation

# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is called later to lemmatize text.
nlp = spacy.load('en_core_web_lg')
# spaCy's English stop-word collection, consulted by the stop-word removal step.
# NOTE(review): there is no explicit `import spacy.lang.en.stop_words`; presumably
# the submodule is already loaded as a side effect of spacy.load() — confirm.
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

In [3]:
# Switches for the preprocessing steps below; each step runs only when its flag is True.
TO_LOWER = True  # lowercase the Text column
REMOVE_EMOJI = True  # delete emoji characters
REMOVE_EMOTICONS = True  # delete matches of the EMOTICONS patterns
CHAT_WORDS_CONVERSION = True  # expand chat abbreviations via the chat-word table
SPELL_CORRECT = True  # replace words the spell checker reports as unknown
ADD_SPACE = True  # insert a space after runs of dots / commas
REMOVE_PUNCT = True  # delete string.punctuation characters
REMOVE_STOPWORDS = True  # drop words found in STOPWORDS
REMOVE_FREQ = False  # drop the 10 most frequent words
REMOVE_RARES = False  # drop the 10 least frequent words
REMOVE_URL = False  # delete http(s):// and www. links
GET_LEMMA = True  # replace each token by its spaCy lemma

In [4]:
# Read the three CSV files; paths are relative to the notebook's working directory.
# NOTE(review): only `df` is transformed by the cells visible here; `df_sample`
# and `df_test` are loaded but not used in this part of the notebook.
df = pd.read_csv('TrainingDS.csv')
df_sample = pd.read_csv('Sample Submission.csv')
df_test = pd.read_csv('TestingDS.csv')
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,ID,Text,Class
0,1,Trump supporters needed to say the 4 Democrats...,0
1,2,Send them back!!Why the hell are they even her...,1
2,3,Yeah...Im wondering if send them back works fo...,1
3,4,I know you realize you cant pretend that you d...,1
4,5,"Donny, you owe all people an apology for appla...",0


In [5]:
# Show column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1500 non-null   int64 
 1   Text    1500 non-null   object
 2   Class   1500 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 35.3+ KB


In [6]:
# Summary statistics of the training frame (the output covers the ID and Class columns).
df.describe()

Unnamed: 0,ID,Class
count,1500.0,1500.0
mean,750.5,0.386667
std,433.157015,0.487149
min,1.0,0.0
25%,375.75,0.0
50%,750.5,0.0
75%,1125.25,1.0
max,1500.0,1.0


In [7]:
# Lowercase the Text column when TO_LOWER is set, then preview the result.
if TO_LOWER:
    df["Text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,ID,Text,Class
0,1,trump supporters needed to say the 4 democrats...,0
1,2,send them back!!why the hell are they even her...,1
2,3,yeah...im wondering if send them back works fo...,1
3,4,i know you realize you cant pretend that you d...,1
4,5,"donny, you owe all people an apology for appla...",0


In [8]:
# Removal of Emojis
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# Built once at definition time instead of on every call.
# The ranges are listed explicitly: a single span from U+24C2 to U+1F251 would
# also cover CJK, Hangul and most other Basic Multilingual Plane scripts and
# silently delete ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji characters deleted.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are removed;
    letters of any script (Latin, CJK, Hangul, ...) are left untouched.

    Args:
        string: Text to clean. The parameter name shadows the ``string``
            module inside this function; it is kept for backward
            compatibility with existing callers.

    Returns:
        The text with each run of emoji characters replaced by nothing.
    """
    return _EMOJI_PATTERN.sub("", string)

# Strip emoji from every row of the Text column when REMOVE_EMOJI is set.
if REMOVE_EMOJI:
    df["Text"] = df["Text"].apply(remove_emoji)

In [9]:
def remove_emoticons(text):
    """Return ``text`` with every match of an ``EMOTICONS`` entry deleted.

    The entries obtained by iterating ``EMOTICONS`` are OR-ed together inside
    one capturing group and used as a regular expression as-is; no escaping
    is applied here.
    """
    alternatives = u'|'.join(EMOTICONS)
    matcher = re.compile(u'(' + alternatives + u')')
    return matcher.sub(r'', text)

if REMOVE_EMOTICONS:
    df["Text"] = df["Text"].apply(remove_emoticons)

In [10]:
# Chat Words Conversion
# Build the abbreviation -> expansion lookup from `chat_words_str`, which is
# parsed as one "ABBR=expansion" entry per line.
chat_words_map_dict = {}
for line in chat_words_str.split("\n"):
    # Split on the first "=" only, so an expansion that itself contains "="
    # is kept whole. Lines without "=" (blank, whitespace-only or malformed)
    # are skipped instead of raising IndexError.
    cw, sep, cw_expanded = line.partition("=")
    if not sep:
        continue
    chat_words_map_dict[cw] = cw_expanded
# Set of the known abbreviations, used for membership tests.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Expand chat abbreviations found in ``text``.

    ``text`` is split on whitespace; a token whose upper-cased form is in
    ``chat_words_list`` is replaced by its ``chat_words_map_dict`` entry,
    any other token is kept unchanged. Tokens are re-joined with single
    spaces.
    """
    def expand(token):
        key = token.upper()
        return chat_words_map_dict[key] if key in chat_words_list else token

    return " ".join(expand(token) for token in text.split())

# Expand chat abbreviations in every row when CHAT_WORDS_CONVERSION is set.
if CHAT_WORDS_CONVERSION:
    df["Text"] = df["Text"].apply(chat_words_conversion)


In [11]:
# Spelling Correction
spell = SpellChecker()


def correct_spellings(text):
    """Replace words the spell checker does not know with its best correction.

    ``text`` is split on whitespace; each word reported by ``spell.unknown``
    is replaced by ``spell.correction(word)``, all other words are kept.
    Words are re-joined with single spaces.

    Args:
        text: The string to correct.

    Returns:
        The corrected string.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it finds no candidate; fall
            # back to the original word so that " ".join() does not raise
            # TypeError on a None entry.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
# Spell-correct every row of the Text column when SPELL_CORRECT is set.
if SPELL_CORRECT:
    df["Text"] = df["Text"].apply(correct_spellings)

In [12]:
# add a space after runs of dots / commas that are glued to the next word
def addSpace(val):
    """Insert a space after each run of '.' / ',' that directly precedes text.

    A space is added only when the run is immediately followed by a character
    that is neither whitespace nor another dot/comma, so input that is already
    spaced ("a, b") or ends with punctuation ("end.") is returned without
    extra double or trailing spaces.

    Digits are treated like any other character, so "1,000" still becomes
    "1, 000".

    Args:
        val: The string to process.

    Returns:
        The string with the separating spaces inserted.
    """
    # The lookahead excludes '.' and ',' so the engine cannot backtrack into
    # the middle of a run and split it (e.g. "... x" must stay unchanged).
    return re.sub(r'([.,]+)(?=[^\s.,])', r'\1 ', val)

# Insert spaces after dots / commas in every row when ADD_SPACE is set.
if ADD_SPACE:
    df["Text"] = df["Text"].apply(addSpace)

In [13]:
# Punctuation stripping
PUNCT_TO_REMOVE = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted.

    Characters are removed outright, not replaced by a space, so words
    separated only by punctuation end up joined ("a!b" -> "ab").
    """
    # A translation table that maps a character to None deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

# Delete punctuation characters from every row when REMOVE_PUNCT is set.
if REMOVE_PUNCT:
    df["Text"] = df["Text"].apply(remove_punctuation)

In [14]:
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``STOPWORDS``.

    ``text`` is converted with ``str()`` first; the remaining tokens are
    joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept)

# Drop stop words from every row when REMOVE_STOPWORDS is set.
if REMOVE_STOPWORDS:
    df["Text"] = df["Text"].apply(remove_stopwords)

In [15]:
# Word frequencies over the whole Text column
cnt = Counter()
for text in df["Text"].values:
    # Counter.update() adds one count per whitespace-separated token.
    cnt.update(text.split())

# Show the ten most frequent words with their counts.
cnt.most_common(10)

[('send', 1187),
 ('trump', 490),
 ('chant', 334),
 ('racist', 241),
 ('chants', 190),
 ('illegal', 184),
 ('rally', 179),
 ('people', 168),
 ('dont', 135),
 ('camp', 131)]

In [16]:
# Frequent-word removal: the ten highest-count words in `cnt`
FREQWORDS = {word for word, _count in cnt.most_common(10)}


def remove_freqwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``FREQWORDS``.

    ``text`` is converted with ``str()`` first; the remaining tokens are
    joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in FREQWORDS)
    return " ".join(kept)

# Drop the most frequent words from every row when REMOVE_FREQ is set.
if REMOVE_FREQ:
    df["Text"] = df["Text"].apply(remove_freqwords)

In [17]:
# Rare-word removal
n_rare_words = 10
# most_common() lists words by descending count; the reversed slice walks it
# from the end and takes the last `n_rare_words` entries.
RAREWORDS = {word for word, _count in cnt.most_common()[:-n_rare_words-1:-1]}


def remove_rarewords(text):
    """Drop every whitespace-separated token of ``text`` found in ``RAREWORDS``.

    ``text`` is converted with ``str()`` first; the remaining tokens are
    joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in RAREWORDS)
    return " ".join(kept)

# Drop the least frequent words from every row when REMOVE_RARES is set.
if REMOVE_RARES:
    df["Text"] = df["Text"].apply(remove_rarewords)

In [18]:
# URL stripping
def remove_urls(text):
    """Return ``text`` with links deleted.

    A link is anything starting with ``http://`` or ``https://``, or with
    ``www.``, up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Delete links from every row when REMOVE_URL is set.
# NOTE(review): this cell runs after the add-space and punctuation-removal
# cells, which insert spaces after '.' and delete ':', '/' and '.'; by this
# point URLs may no longer match the pattern — consider moving URL removal
# ahead of those steps.
if REMOVE_URL:
    df["Text"] = df["Text"].apply(remove_urls)

In [19]:
# Lemmatization
def get_lemma(text):
    """Run ``text`` through the ``nlp`` pipeline and return its lemmas.

    The ``lemma_`` of every token in the resulting document is joined with
    single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)

# Lemmatize every row of the Text column when GET_LEMMA is set.
if GET_LEMMA:
    df["Text"] = df["Text"].apply(get_lemma)

In [20]:
# Write the preprocessed frame to 'ready_df.csv' using ';' as the field
# separator and without the index column.
df.to_csv('ready_df.csv', sep=';', index=False)