In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from multiprocessing import Pool
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ekaterina_Dul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ekaterina_Dul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Ekaterina_Dul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ekaterina_Dul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
reviews_df = pd.read_csv("../data/raw/final_project_train_dataset/train.csv", sep=',')
reviews_df.head(5)

Unnamed: 0,review,sentiment
0,I caught this little gem totally by accident b...,positive
1,I can't believe that I let myself into this mo...,negative
2,*spoiler alert!* it just gets to me the nerve ...,negative
3,If there's one thing I've learnt from watching...,negative
4,"I remember when this was in theaters, reviews ...",negative


# Dataset preprocessing.

In the following section we will focus on general train dataset cleansing. 

## Main features extraction.

Extraction of significant numerical features from `review` column, based on EDA summary.

In [3]:
reviews_df['number_of_chars'] = reviews_df['review'].apply(len)
reviews_df['percentage_of_signs'] = reviews_df['review'].apply(lambda x: sum([1 for c in x if not c.isalpha()]) / len(x) * 100)
reviews_df['number_of_excl_marks'] = reviews_df['review'].apply(lambda x: x.count('!'))
reviews_df['number_of_question_marks'] = reviews_df['review'].apply(lambda x: x.count('?'))
reviews_df['number_of_ellipses'] = reviews_df['review'].apply(lambda x: x.count('...'))
reviews_df['number_of_uppercase_words'] = reviews_df['review'].apply(lambda x: sum([1 for w in x.split() if re.sub(r'[^a-zA-Z]', '', w).isupper()]))

numerical_review_features = [
    'number_of_chars',
    'percentage_of_signs',
    'number_of_excl_marks',
    'number_of_question_marks',
    'number_of_ellipses',
    'number_of_uppercase_words'
]

## Duplicates removal.

Removal of duplicated rows. As was discussed in EDA, in train dataset we don't have similar reviews with different sentiments, therefore no additional quality check is required.

In [4]:
len(reviews_df)

40000

In [5]:
reviews_df.drop_duplicates(inplace=True)
len(reviews_df)

39728

## Outliers removal.

In [6]:
reviews_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_chars,39728.0,1311.359469,988.79897,41.0,699.0,971.5,1596.0,13704.0
percentage_of_signs,39728.0,21.976804,1.827637,11.764706,20.802836,21.829396,22.939068,87.311178
number_of_excl_marks,39728.0,0.972563,2.964011,0.0,0.0,0.0,1.0,282.0
number_of_question_marks,39728.0,0.646018,1.497642,0.0,0.0,0.0,1.0,35.0
number_of_ellipses,39728.0,0.499522,1.58329,0.0,0.0,0.0,0.0,48.0
number_of_uppercase_words,39728.0,4.877014,5.592917,0.0,1.0,3.0,6.0,151.0


In [7]:
# calculate IQR for column 'number_of_chars'
Q1 = reviews_df['number_of_chars'].quantile(0.25)
Q3 = reviews_df['number_of_chars'].quantile(0.75)
IQR = Q3 - Q1

# identify outliers
threshold = 1.5
outliers = reviews_df[(reviews_df['number_of_chars'] < Q1 - threshold * IQR) | (reviews_df['number_of_chars'] > Q3 + threshold * IQR)]

In [8]:
len(outliers)

2958

In [9]:
reviews_df.drop(outliers.index, inplace=True)
len(reviews_df)

36770

In [10]:
reviews_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_chars,36770.0,1094.826435,595.737973,41.0,685.0,918.0,1393.0,2941.0
percentage_of_signs,36770.0,22.011077,1.863173,11.764706,20.813033,21.868365,23.006231,87.311178
number_of_excl_marks,36770.0,0.909872,2.880267,0.0,0.0,0.0,1.0,282.0
number_of_question_marks,36770.0,0.548545,1.281312,0.0,0.0,0.0,1.0,25.0
number_of_ellipses,36770.0,0.463204,1.469358,0.0,0.0,0.0,0.0,48.0
number_of_uppercase_words,36770.0,4.372042,4.524514,0.0,1.0,3.0,6.0,122.0


# Text preprocessing.

Here basic text preprocessing was preformed, and two columns with stemmed tokens and lemmatized tokens were produced.

## Punctuation and numbers processing.

As it was discussed in EDA, after extracting number of main punctuation marks (ellipses, exclamations, questions), we are ready to remove all punctuation, and not to include it in futher tokenization. Just a reminder, that it was done due to non-standart signs costrustion in initial reviews.

In [11]:
PUNCT_TO_REMOVE = string.punctuation

reviews_df['cleaned_review'] = reviews_df['review'].apply(lambda x: x.replace('<br />', ' ')) \
                                           .apply(lambda x: x.translate(str.maketrans('', '', PUNCT_TO_REMOVE))) \
                                           .apply(lambda x: re.sub(r'[0-9]+', '', x)) \
                                           .apply(lambda x: ''.join(filter(lambda y: y in string.printable, x))) 

## Stop-words removal.

Removal of all possible standard stop-words variations.

In [12]:
STOPWORDS = set(stopwords.words('english'))
STOPWORDS = STOPWORDS.union(set([w.title() for w in STOPWORDS]))
STOPWORDS = STOPWORDS.union(set([w.translate(str.maketrans('', '', PUNCT_TO_REMOVE)) for w in STOPWORDS]))

STOPWORDS

{'A',
 'About',
 'Above',
 'After',
 'Again',
 'Against',
 'Ain',
 'All',
 'Am',
 'An',
 'And',
 'Any',
 'Are',
 'Aren',
 "Aren'T",
 'ArenT',
 'As',
 'At',
 'Be',
 'Because',
 'Been',
 'Before',
 'Being',
 'Below',
 'Between',
 'Both',
 'But',
 'By',
 'Can',
 'Couldn',
 "Couldn'T",
 'CouldnT',
 'D',
 'Did',
 'Didn',
 "Didn'T",
 'DidnT',
 'Do',
 'Does',
 'Doesn',
 "Doesn'T",
 'DoesnT',
 'Doing',
 'Don',
 "Don'T",
 'DonT',
 'Down',
 'During',
 'Each',
 'Few',
 'For',
 'From',
 'Further',
 'Had',
 'Hadn',
 "Hadn'T",
 'HadnT',
 'Has',
 'Hasn',
 "Hasn'T",
 'HasnT',
 'Have',
 'Haven',
 "Haven'T",
 'HavenT',
 'Having',
 'He',
 'Her',
 'Here',
 'Hers',
 'Herself',
 'Him',
 'Himself',
 'His',
 'How',
 'I',
 'If',
 'In',
 'Into',
 'Is',
 'Isn',
 "Isn'T",
 'IsnT',
 'It',
 "It'S",
 'ItS',
 'Its',
 'Itself',
 'Just',
 'Ll',
 'M',
 'Ma',
 'Me',
 'Mightn',
 "Mightn'T",
 'MightnT',
 'More',
 'Most',
 'Mustn',
 "Mustn'T",
 'MustnT',
 'My',
 'Myself',
 'Needn',
 "Needn'T",
 'NeednT',
 'No',
 'Nor',
 'No

In [13]:
reviews_df['cleaned_review'] = reviews_df['cleaned_review'].apply(lambda x: " ".join([word for word in x.split() if word not in STOPWORDS]))

In [14]:
reviews_df['cleaned_review'].head(10)

0     caught little gem totally accident back reviva...
1     cant believe let movie accomplish favor friend...
2     spoiler alert gets nerve people remake use ter...
3     theres one thing Ive learnt watching George Ro...
4     remember theaters reviews said horrible Well t...
5     Opera US title terror opera somewhat letdown D...
6     Heard film long ago finally found ebay five bu...
8     worth mentioning omitted reviews read subtext ...
9     Darling Lili fantastic far one favorite films ...
10    Twentieth CenturyFox made ton Mr Moto films Ho...
Name: cleaned_review, dtype: object

## Tokenization.

We are using standard NLTK words tokenization algorithm.

In [15]:
def tokenize_words(
    text: str
):
    import nltk
    nltk.download('punkt_tab')

    return nltk.tokenize.word_tokenize(text)

reviews_df['tokenized_review'] = reviews_df['cleaned_review'].parallel_apply(tokenize_words)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9193), Label(value='0 / 9193'))), …

## Lemmatization.

In computational linguistics, lemmatization is the algorithmic process of determining the lemma of a word based on its intended meaning. Unlike stemming, lemmatization depends on correctly identifying the intended part of speech and meaning of a word in a sentence, as well as within the larger context surrounding that sentence, such as neighbouring sentences or even an entire document.

In [16]:
def lemmatize_words(
    text: list
):
    import nltk
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger_eng')
    
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {
        "N": wordnet.NOUN, 
        "V": wordnet.VERB, 
        "J": wordnet.ADJ, 
        "R": wordnet.ADV
    }
    pos_tagged_text = nltk.pos_tag(text)
    return [lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text]

In [17]:
reviews_df['lemmatized_review'] = reviews_df['tokenized_review'].parallel_apply(lemmatize_words)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9193), Label(value='0 / 9193'))), …

In [18]:
reviews_df['lemmatized_review'].head(5)

0    [catch, little, gem, totally, accident, back, ...
1    [cant, believe, let, movie, accomplish, favor,...
2    [spoiler, alert, get, nerve, people, remake, u...
3    [there, one, thing, Ive, learnt, watch, George...
4    [remember, theater, review, say, horrible, Wel...
Name: lemmatized_review, dtype: object

## Stemming.

Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.

In [19]:
def stem_words(
    text
):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    
    return [stemmer.stem(word) for word in text]

In [20]:
reviews_df['stemmed_review'] = reviews_df['tokenized_review'].parallel_apply(stem_words)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9193), Label(value='0 / 9193'))), …

In [21]:
reviews_df['stemmed_review'].head(5)

0    [caught, littl, gem, total, accid, back, reviv...
1    [cant, believ, let, movi, accomplish, favor, f...
2    [spoiler, alert, get, nerv, peopl, remak, use,...
3    [there, one, thing, ive, learnt, watch, georg,...
4    [rememb, theater, review, said, horribl, well,...
Name: stemmed_review, dtype: object

## Stemming and Lemmatization comparison.

In the following section we will analyze results from both stemming and lemmatization, based on the following criterias:
1. Number of unique tokens produced by stemming and lemmatization.
2. Difference between number of shorted tokens (with length < 2) produced by two approaches.
3. Modifications of initial tokens after operation preforming.

In [22]:
from collections import Counter

def count_words(
    df: pd.DataFrame,
    col_name: str
): 
    cntr = Counter()
    for processed_tokens in df[col_name].values:
        for token in processed_tokens:
            cntr[token] += 1
    return cntr

In [23]:
stemm_counter = count_words(reviews_df, 'stemmed_review')
lemm_counter = count_words(reviews_df, 'lemmatized_review')

In [24]:
print("Number of unique stemmed words:", len(stemm_counter))
stemm_counter.most_common(10)

Number of unique stemmed words: 93786


[('movi', 70630),
 ('film', 59314),
 ('one', 33908),
 ('like', 28273),
 ('good', 19876),
 ('time', 19668),
 ('watch', 19329),
 ('see', 18310),
 ('make', 18160),
 ('get', 17379)]

In [25]:
print("Number of unique lemmatized words:", len(lemm_counter))
lemm_counter.most_common(10)

Number of unique lemmatized words: 138953


[('movie', 69502),
 ('film', 58016),
 ('one', 30981),
 ('make', 27532),
 ('see', 26538),
 ('like', 26363),
 ('get', 21789),
 ('good', 21553),
 ('time', 19026),
 ('watch', 17412)]

In [26]:
def calculate_number_of_short_words(
    cntr
):
    data = []
    for k, v in cntr.items():
        if len(k) < 3:
            data.append((v, k))
    data.sort(reverse=True)
    return data

In [27]:
stemm_short_words = calculate_number_of_short_words(stemm_counter)
lemm_short_words = calculate_number_of_short_words(lemm_counter)

In [28]:
print("Number of stemmed words with length less than two:", len(stemm_short_words))
stemm_short_words[:20]

Number of stemmed words with length less than two: 534


[(11162, 'go'),
 (6299, 'im'),
 (4487, 'us'),
 (3739, 'tv'),
 (3235, 'he'),
 (2411, 'aw'),
 (1979, 'mr'),
 (1804, 'ye'),
 (1794, 'id'),
 (1746, 'oh'),
 (1372, 'ok'),
 (983, 'ad'),
 (915, 'th'),
 (857, 'dr'),
 (813, 'b'),
 (740, 'la'),
 (737, 'de'),
 (537, 'na'),
 (513, 'of'),
 (505, 'ed')]

In [29]:
print("Number of lemmatized words with length less than two:", len(lemm_short_words))
lemm_short_words[:20]

Number of lemmatized words with length less than two: 931


[(16095, 'go'),
 (5953, 'Im'),
 (3878, 'do'),
 (3816, 'u'),
 (3494, 'TV'),
 (2174, 'he'),
 (1671, 'Id'),
 (1631, 'Mr'),
 (1200, 'Oh'),
 (1166, 'OK'),
 (895, 'th'),
 (851, 'US'),
 (824, 'Dr'),
 (678, 'B'),
 (522, 'na'),
 (492, 'oh'),
 (488, 'OF'),
 (464, 'Ed'),
 (427, 'Ms'),
 (424, 'II')]

# Vectorization.

## Count Vectorizer.

The count vectorizer is a customizable SciKit Learn preprocessor method. It works with any text out of the box, and applies preprocessing, tokenization and stop words removal on its own. These tasks can be customized, for example by providing a different tokenization method or stop word list. (This applies to all other preprocessors as well.) Applying the count vectorizer to raw text creates a matrix in the form of (document_id, tokens) in which the values are the token count.

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
count_x_train = count_vectorizer.fit_transform(reviews_df['lemmatized_review'].apply(lambda x: " ".join(x)))

In [43]:
count_x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
count_vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaaaaaaargh', ..., 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz',
       'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz',
       'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'], dtype=object)

## TF-IDF Vectorizer.

The Term Frequency/Inverse Document Frequency is a well-known metric in information retrieval. It encodes word frequencies in such a way as to put equal weight to common terms that occur in many documents, as well as uncommon terms only present in a few documents. This metric generalizes well over large corpora and improves finding relevant topics.

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_x_train = tfidf_vectorizer.fit_transform(reviews_df['lemmatized_review'].apply(lambda x: " ".join(x)))

In [46]:
tfidf_vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaaaaaaargh', ..., 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz',
       'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz',
       'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'], dtype=object)

## TF-IDF and Count Vectorizers comparison.

We will look through the following aspects:
1. Number of features recognized by each approach.
2. Shapes of processed datasets.
3. Time required for each approach.

In [40]:
count_x_train.shape

(36770, 117172)

In [41]:
tfidf_x_train.shape

(36770, 117172)

In [47]:
len(tfidf_vectorizer.get_feature_names_out()), len(count_vectorizer.get_feature_names_out())

(117172, 117172)