## Data and Imports

In [18]:
import pandas as pd

In [19]:
# Load the raw training and validation splits.
# Paths are relative to the notebook's working directory.
train_df = pd.read_csv("../Data/raw_data/Train.csv")
val_df = pd.read_csv("../Data/raw_data/Valid.csv")

In [20]:
# Preview the training split (bare last-line expression renders the frame).
train_df

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [21]:
# Preview the validation split (bare last-line expression renders the frame).
val_df

Unnamed: 0,text,label
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0
2,The Guidelines state that a comment must conta...,0
3,This movie is a muddled mish-mash of clichés f...,0
4,Before Stan Laurel became the smaller half of ...,0
...,...,...
4995,"Man, I loved this movie! This really takes me ...",1
4996,Recovery is an incredibly moving piece of work...,1
4997,"You can take the crook out of the joint, but i...",1
4998,FUTZ is the only show preserved from the exper...,1


In [22]:
# Target ("label") distribution: distinct values, then per-class counts,
# first for the training split and then for the validation split.
display(train_df['label'].unique(), train_df['label'].value_counts())
display(val_df['label'].unique(), val_df['label'].value_counts())

array([0, 1], dtype=int64)

0    20019
1    19981
Name: label, dtype: int64

array([0, 1], dtype=int64)

1    2514
0    2486
Name: label, dtype: int64

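**NOTE:** The labels 0 and 1 are almost evenly distributed in both the training and validation sets, which is good for model building (no class-imbalance handling is needed).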

## Preprocessing

In [47]:
# Libraries for Text Cleaning 
import re
from collections import Counter
from pathlib import Path

import nltk
from bs4 import BeautifulSoup
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob, Word

STOPWORDS = set(stopwords.words('english'))

In [48]:
# Functions Defined 
# 2
# "@" followed by one or more word characters, i.e. a user mention.
# Raw string: "\w" inside a plain literal is an invalid escape sequence.
mention_re = re.compile(r"@\w+")


def remove_mention(text):
    """Replace every ``@mention`` in ``text`` with a single space.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each match of ``mention_re`` replaced by ``" "``.
        Surrounding whitespace is left as is, so runs of spaces may remain.
    """
    return mention_re.sub(repl=" ", string=text)

# 3
# Converting emojis to words
def convert_emojis(text):
    """Replace each emoji found in ``text`` with an underscore-joined label.

    For every key of ``UNICODE_EMO`` that occurs in ``text``, the emoji is
    replaced by its mapped description with commas and colons removed and
    whitespace-separated words joined by ``_``.

    Args:
        text: String to convert.

    Returns:
        ``text`` after all entries of ``UNICODE_EMO`` have been applied.
    """
    for emot, description in UNICODE_EMO.items():
        if emot in text:
            label = "_".join(description.replace(",", "").replace(":", "").split())
            text = text.replace(emot, label)
    # Return only after the whole table has been processed; returning inside
    # the loop would apply just the first entry (and yield None for an empty
    # table).
    return text
# Converting emoticons to words    
def convert_emoticons(text):
    """Replace each emoticon found in ``text`` with an underscore-joined label.

    Every key of ``EMOTICONS`` is used as a regular-expression pattern and
    each match is replaced by the mapped description with commas removed and
    whitespace-separated words joined by ``_``.

    Args:
        text: String to convert.

    Returns:
        ``text`` after all entries of ``EMOTICONS`` have been applied.
    """
    # NOTE(review): keys are passed to re.sub unescaped, so this assumes the
    # EMOTICONS keys are already valid (escaped) regex patterns -- confirm
    # against the installed emot version.
    for emot, description in EMOTICONS.items():
        label = "_".join(description.replace(",", "").split())
        text = re.sub(u'(' + emot + ')', label, text)
    # Return only after the whole table has been processed; returning inside
    # the loop would apply just the first entry.
    return text

# 4
def remove_urls(text):
    """Replace URLs in ``text`` with a single space.

    A URL is anything starting with ``www.`` or ``http://`` / ``https://``
    and running up to the next whitespace character.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each URL replaced by ``" "``.
    """
    # Raw string so that "\." and "\s" reach the regex engine untouched
    # instead of being treated as (invalid) string escape sequences.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)

# 5
def remove_html(text):
    """Strip HTML markup from ``text`` and return the visible text.

    Text nodes are joined with a single space so that words separated only
    by a tag (e.g. ``"great.<br /><br />The"``) are not glued together.
    The extra spaces this may introduce around inline tags can be collapsed
    afterwards with a whitespace-normalising step.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The text content of ``text`` as parsed by BeautifulSoup's
        ``html.parser``.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

# 6
# removing symbols
# Character class listing the punctuation / symbol characters to delete.
symb_re = re.compile(r"""[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~،؟…«“\":\"…”]""")


def remove_symbols(text: str) -> str:
    """Delete every character matched by ``symb_re`` from ``text``.

    Matched characters are removed outright (not replaced by a space), so
    tokens joined only by a symbol are merged.
    """
    stripped = symb_re.sub("", text)
    return stripped
# removing numbers
# One or more consecutive digits. Raw string: "\d" inside a plain literal
# is an invalid escape sequence.
numbers_re = re.compile(r"\d+")


def remove_numbers(text):
    """Delete every run of digits from ``text``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all matches of ``numbers_re`` removed; surrounding
        characters (including whitespace) are left untouched.
    """
    return numbers_re.sub(repl="", string=text)

# 7
def remove_stopwords(text):
    """Drop whitespace-separated tokens of ``text`` that are in ``STOPWORDS``.

    ``text`` is coerced with ``str()`` and split on whitespace; tokens are
    compared to ``STOPWORDS`` exactly as written (no case folding or
    punctuation stripping). The surviving tokens are re-joined with single
    spaces.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

# 8 
# How many of the most common training tokens count as "frequent words".
N_FREQ_WORDS = 150

# Token frequencies over the raw training text (whitespace split only, so
# tokens are case-sensitive and keep any attached punctuation).
cnt = Counter()
for text in train_df['text'].values:
    cnt.update(text.split())

# The N_FREQ_WORDS most common tokens, as a set for fast membership tests.
freq = {word for word, _ in cnt.most_common(N_FREQ_WORDS)}


def remove_freqwords(text):
    """Drop whitespace-separated tokens of ``text`` that are in ``freq``.

    ``text`` is coerced with ``str()`` and split on whitespace; tokens are
    compared to ``freq`` exactly as written and the survivors are re-joined
    with single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in freq)

# 9 
# A letter (word character that is neither digit nor underscore) repeated
# three or more times in a row.
rx = re.compile(r'([^\W\d_])\1{2,}')


def remove_repeating_characters(text: str):
    """Shorten elongated words and run them through ``Word.correct()``.

    Each run of letters in ``text`` is inspected. If it contains a letter
    repeated three or more times, every such run is cut down to two
    occurrences and the result is passed to textblob's ``Word(...).correct()``
    (presumably a spelling correction -- see the textblob docs). Runs of
    letters without such repetition are left unchanged.

    Args:
        text: String to clean.

    Returns:
        ``text`` with elongated words replaced as described above.
    """
    def _shorten(match):
        token = match.group()
        if not rx.search(token):
            return token
        # `Word` comes from textblob (imported in the imports cell).
        return Word(rx.sub(r'\1\1', token)).correct()

    return re.sub(r'[^\W\d_]+', _shorten, text)
# 10
# Two or more consecutive whitespace characters. Raw string: "\s" inside a
# plain literal is an invalid escape sequence.
multiple_space_re = re.compile(r"\s{2,}")


def remove_multiple_whitespace(text):
    """Collapse each run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    as is.

    Args:
        text: String to normalise.

    Returns:
        ``text`` with every match of ``multiple_space_re`` replaced by ``" "``.
    """
    return multiple_space_re.sub(repl=" ", string=text)

In [49]:
def text_cleaning(txt):
    """Run the text-cleaning pipeline on a single string.

    Steps, in order: lowercase, remove @mentions, remove URLs, strip HTML,
    remove stop words, remove symbols, remove digits, drop any remaining
    character that is not an ASCII letter, digit or whitespace, strip the
    ends, and collapse repeated whitespace.

    Optional steps that are currently disabled: ``convert_emojis``,
    ``convert_emoticons``, ``remove_freqwords``, TextBlob spelling
    correction and ``remove_repeating_characters``.

    Args:
        txt: Raw text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    # Lowercase first: stop words are removed further down by exact match
    # against NLTK's lowercase list, so capitalised ones ("The", "This")
    # would otherwise survive the pipeline.
    txt = txt.lower()
    txt = remove_mention(txt)
    txt = remove_urls(txt)
    txt = remove_html(txt)
    txt = remove_stopwords(txt)
    txt = remove_symbols(txt)
    # Might be dropped later: removing numbers can lose information.
    txt = remove_numbers(txt)
    # Raw string so "\s" is a regex class, not an invalid string escape.
    txt = re.sub(r"[^a-zA-Z0-9\s]+", "", txt)
    txt = txt.strip()
    txt = remove_multiple_whitespace(txt)

    return txt

In [50]:
# Apply the cleaning pipeline to every training review; the result goes into
# a new 'text_cleaned' column and the raw 'text' column is left unchanged.
train_df['text_cleaned'] = train_df['text'].apply(text_cleaning)

In [51]:
# Same cleaning pipeline for the validation reviews (new 'text_cleaned' column).
val_df['text_cleaned'] = val_df['text'].apply(text_cleaning)

In [52]:
# Persist the cleaned splits. The output folder is created first so the
# export does not fail on a checkout where "../Data/processed" is missing.
Path("../Data/processed").mkdir(parents=True, exist_ok=True)
train_df.to_csv("../Data/processed/Train_clean.csv", index=False)
val_df.to_csv("../Data/processed/Valid_clean.csv", index=False)