In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from multiprocessing import Pool

In [2]:
reviews_df = pd.read_csv("../data/raw/final_project_train_dataset/train.csv", sep=',')
reviews_df.head(5)

Unnamed: 0,review,sentiment
0,I caught this little gem totally by accident b...,positive
1,I can't believe that I let myself into this mo...,negative
2,*spoiler alert!* it just gets to me the nerve ...,negative
3,If there's one thing I've learnt from watching...,negative
4,"I remember when this was in theaters, reviews ...",negative


# Dataset preprocessing.

## Main features extraction.

In [3]:
reviews_df['number_of_chars'] = reviews_df['review'].apply(len)
reviews_df['percentage_of_signs'] = reviews_df['review'].apply(lambda x: sum([1 for c in x if not c.isalpha()]) / len(x) * 100)
reviews_df['number_of_excl_marks'] = reviews_df['review'].apply(lambda x: x.count('!'))
reviews_df['number_of_question_marks'] = reviews_df['review'].apply(lambda x: x.count('?'))
reviews_df['number_of_ellipses'] = reviews_df['review'].apply(lambda x: x.count('...'))
reviews_df['number_of_uppercase_words'] = reviews_df['review'].apply(lambda x: sum([1 for w in x.split() if re.sub(r'[^a-zA-Z]', '', w).isupper()]))

numerical_review_features = [
    'number_of_chars',
    'percentage_of_signs',
    'number_of_excl_marks',
    'number_of_question_marks',
    'number_of_ellipses',
    'number_of_uppercase_words'
]

## Duplicates removal.

In [4]:
len(reviews_df)

40000

In [5]:
reviews_df.drop_duplicates(inplace=True)
len(reviews_df)

39728

## Outliers removal.

In [6]:
reviews_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_chars,39728.0,1311.359469,988.79897,41.0,699.0,971.5,1596.0,13704.0
percentage_of_signs,39728.0,21.976804,1.827637,11.764706,20.802836,21.829396,22.939068,87.311178
number_of_excl_marks,39728.0,0.972563,2.964011,0.0,0.0,0.0,1.0,282.0
number_of_question_marks,39728.0,0.646018,1.497642,0.0,0.0,0.0,1.0,35.0
number_of_ellipses,39728.0,0.499522,1.58329,0.0,0.0,0.0,0.0,48.0
number_of_uppercase_words,39728.0,4.877014,5.592917,0.0,1.0,3.0,6.0,151.0


In [7]:
# calculate IQR for column 'number_of_chars'
Q1 = reviews_df['number_of_chars'].quantile(0.25)
Q3 = reviews_df['number_of_chars'].quantile(0.75)
IQR = Q3 - Q1

# identify outliers
threshold = 1.5
outliers = reviews_df[(reviews_df['number_of_chars'] < Q1 - threshold * IQR) | (reviews_df['number_of_chars'] > Q3 + threshold * IQR)]

In [8]:
len(outliers)

2958

In [9]:
reviews_df.drop(outliers.index, inplace=True)
len(reviews_df)

36770

In [10]:
reviews_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_chars,36770.0,1094.826435,595.737973,41.0,685.0,918.0,1393.0,2941.0
percentage_of_signs,36770.0,22.011077,1.863173,11.764706,20.813033,21.868365,23.006231,87.311178
number_of_excl_marks,36770.0,0.909872,2.880267,0.0,0.0,0.0,1.0,282.0
number_of_question_marks,36770.0,0.548545,1.281312,0.0,0.0,0.0,1.0,25.0
number_of_ellipses,36770.0,0.463204,1.469358,0.0,0.0,0.0,0.0,48.0
number_of_uppercase_words,36770.0,4.372042,4.524514,0.0,1.0,3.0,6.0,122.0


# Text preprocessing.

## Punctuation and numbers processing.

In [11]:
PUNCT_TO_REMOVE = string.punctuation

reviews_df['cleaned_review'] = reviews_df['review'].apply(lambda x: x.replace('<br />', ' ')) \
                                           .apply(lambda x: x.translate(str.maketrans('', '', PUNCT_TO_REMOVE))) \
                                           .apply(lambda x: re.sub(r'[0-9]+', '', x)) 

## Stop-words removal.

In [12]:
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))
STOPWORDS = STOPWORDS.union(set([w.title() for w in STOPWORDS]))

STOPWORDS

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ekaterina_Dul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'A',
 'About',
 'Above',
 'After',
 'Again',
 'Against',
 'Ain',
 'All',
 'Am',
 'An',
 'And',
 'Any',
 'Are',
 'Aren',
 "Aren'T",
 'As',
 'At',
 'Be',
 'Because',
 'Been',
 'Before',
 'Being',
 'Below',
 'Between',
 'Both',
 'But',
 'By',
 'Can',
 'Couldn',
 "Couldn'T",
 'D',
 'Did',
 'Didn',
 "Didn'T",
 'Do',
 'Does',
 'Doesn',
 "Doesn'T",
 'Doing',
 'Don',
 "Don'T",
 'Down',
 'During',
 'Each',
 'Few',
 'For',
 'From',
 'Further',
 'Had',
 'Hadn',
 "Hadn'T",
 'Has',
 'Hasn',
 "Hasn'T",
 'Have',
 'Haven',
 "Haven'T",
 'Having',
 'He',
 'Her',
 'Here',
 'Hers',
 'Herself',
 'Him',
 'Himself',
 'His',
 'How',
 'I',
 'If',
 'In',
 'Into',
 'Is',
 'Isn',
 "Isn'T",
 'It',
 "It'S",
 'Its',
 'Itself',
 'Just',
 'Ll',
 'M',
 'Ma',
 'Me',
 'Mightn',
 "Mightn'T",
 'More',
 'Most',
 'Mustn',
 "Mustn'T",
 'My',
 'Myself',
 'Needn',
 "Needn'T",
 'No',
 'Nor',
 'Not',
 'Now',
 'O',
 'Of',
 'Off',
 'On',
 'Once',
 'Only',
 'Or',
 'Other',
 'Our',
 'Ours',
 'Ourselves',
 'Out',
 'Over',
 'Own',
 'R

In [13]:
reviews_df['cleaned_review'] = reviews_df['cleaned_review'].apply(lambda x: " ".join([word for word in x.split() if word not in STOPWORDS]))

In [14]:
reviews_df['cleaned_review'].head(10)

0     caught little gem totally accident back reviva...
1     cant believe let movie accomplish favor friend...
2     spoiler alert gets nerve people remake use ter...
3     theres one thing Ive learnt watching George Ro...
4     remember theaters reviews said horrible Well d...
5     Opera US title terror opera somewhat letdown D...
6     Heard film long ago finally found ebay five bu...
8     worth mentioning omitted reviews read subtext ...
9     Darling Lili fantastic far one favorite films ...
10    Twentieth CenturyFox made ton Mr Moto films Ho...
Name: cleaned_review, dtype: object

## Tokenization.

In [15]:
# Function to be applied to each element
def tokenize_text(text):
    return nltk.word_tokenize(text)

# Function to handle multiprocessing
def parallelize_dataframe(df, func, n_cores=8):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# Helper function to apply to each split
def apply_tokenize(df):
    df['tokenized_review'] = df['cleaned_review'].apply(tokenize_text)
    return df

In [16]:
# Apply the function in parallel
# reviews_df = parallelize_dataframe(reviews_df, apply_tokenize)

# reviews_df['tokenized_review'].head(5)