# Preprocessing

Summary
------
- Encode the sentiment label as a number: {"negative": 0, "positive": 1}
- Remove stop words and punctuation, lowercase, and lemmatize the text

In [1]:
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
from utils.TextProcessor import TextProcessor
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the raw IMDB reviews (parquet file) into a DataFrame.
raw_reviews_path = '../data/raw/IMDB Dataset.parquet'
df = pd.read_parquet(raw_reviews_path)

In [3]:
# Peek at the first rows to confirm the two expected columns (review, sentiment) loaded.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Convert sentiment label into numeric

In [4]:
# Encode the label as an integer: negative -> 0, positive -> 1.
# The dtype guard makes the cell safe to re-run: once the column is already
# numeric it is left alone instead of being compared to 'positive' again
# (which would turn every label into 0).
label_to_int = {'negative': 0, 'positive': 1}
if not pd.api.types.is_integer_dtype(df['sentiment']):
    # Any label outside the mapping becomes NaN, so astype(int) raises
    # instead of silently encoding an unexpected value as 0.
    df['sentiment'] = df['sentiment'].map(label_to_int).astype(int)

## Look at default preprocessing

In [5]:
# Baseline: run TextProcessor with its default settings on the 'review' column.
# The output below shows it reports "using wordnet stop list"; the text comes back
# lowercased, but punctuation and the `< br / >` HTML tag tokens are still present.
tp = TextProcessor()
df_default = tp.fit_transform(df=df, col='review')
df_default.head()

using wordnet stop list


Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching 1 oz episod...,1
1,wonderful little production . < br / > < br / ...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically 's family little boy ( jake ) think ...,0
4,petter mattei 's `` love time money '' visuall...,1


## Improve the stop word list and remove punctuation

In [6]:
# Build a custom stop list: NLTK English stop words + every punctuation character
# + a few leftover tokens seen in the default output ('br' from the <br /> tags,
# the `` quote token and the apostrophe).
new_stoplist = stopwords.words('english')
ponc_list = list(string.punctuation)

# Words deliberately kept OUT of the stop list (presumably because negations and
# intensifiers matter for sentiment - confirm with the author).
keep_words = {'not', 'very', 'few', 'wasn', 'wouldn'}

# Filter instead of calling list.remove() inside a throwaway comprehension:
# no side-effect-only list is built and every occurrence of a kept word is dropped.
new_stopwords = [
    w for w in new_stoplist + ponc_list + ['br', '``', "'"]
    if w not in keep_words
]

tp = TextProcessor(stop_list=new_stopwords)


In [7]:
# Re-run the preprocessing with the custom stop list. In the preview below the
# punctuation and `br` tokens seen in the default run no longer appear.
df_improved = tp.fit_transform(df=df, col='review')
df_improved.head()

using custom stop list


Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching 1 oz episod...,1
1,wonderful little production filming technique ...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically 's family little boy jake think 's z...,0
4,petter mattei 's love time money visually stun...,1


## Create a vocabulary of the words

In [8]:
# Bag-of-words vectorizer with default settings, used here to inspect the vocabulary.
cv = CountVectorizer()

In [9]:
# Fit the vectorizer on the cleaned reviews. The resulting document-term matrix gets
# its own name so `df_improved` stays a DataFrame and this cell can be re-run safely
# (rebinding `df_improved` to the matrix made a second run fail on ['review']).
bow_improved = cv.fit_transform(df_improved['review'])

In [10]:
# Lay the token -> column-index mapping out as a one-column table (tokens on the
# index), order it by index value and show the 20 entries with the highest indices.
vocab_table = pd.DataFrame([cv.vocabulary_]).T
vocab_table.sort_values(0).tail(20)

Unnamed: 0,0
óli,94374
önsjön,94375
örnek,94376
özdemir,94377
østbye,94378
úber,94379
über,94380
übermensch,94381
übermenschlich,94382
überwoman,94383


There are definitely other languages in here; I also saw some French words. Using Google Translate, I identified some:
- Icelandic
- Hebrew
- Hungarian
- Catalan   
- ...   

and more

In [11]:
# One of the most frequent French words in the reviews, 'était', is in the French stop word
# list. Being French myself, I can confirm that this verb isn't meaningful for sentiment
# classification. Check whether the current stop list already covers it.
'était' in new_stopwords

False

In [12]:
# I can't clean every non-English review right now, so let's just add the stop words of two languages (French and Turkish)

In [13]:
# let's add turkish stopwords and french
fr_stopwords = stopwords.words('french')
tk_stopwords = stopwords.words('turkish')
new_stoplist = stopwords.words('english') 
ponc_list = [p for p in string.punctuation]
new_stopwords = new_stoplist + ponc_list + ['br', '``', "'", '"'] + fr_stopwords + tk_stopwords
_ = [new_stopwords.remove(w) for w in ['not','very','few','wasn','wouldn']]

tp = TextProcessor(stop_list=new_stopwords)
df_final = tp.fit_transform(df=df, col='review')
df_final.head()

using custom stop list


Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching 1 oz episod...,1
1,wonderful little production filming technique ...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically 's family little boy jake think 's z...,0
4,petter mattei 's love time money visually stun...,1


# <span style="color:#FD6556">Shuffle and Split</span>

In [15]:
# Fixed seed so the shuffle and the train/test assignment are reproducible
# under Restart & Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = .8
rng = np.random.default_rng(SPLIT_SEED)

# Shuffle all rows and discard the old index.
df_final = df_final.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Draw one uniform number per row; the draw count follows the frame length rather
# than a hard-coded 50000. Rows below TRAIN_FRACTION are tagged 'train', the rest
# 'test', so the split is approximately (not exactly) 80/20.
df_final['role'] = np.where(
    rng.uniform(size=len(df_final)) < TRAIN_FRACTION, 'train', 'test'
)

df_final.to_parquet('../data/processed/imdb_dataset.parquet', compression='gzip')