In [142]:
import os
import re
import nltk
import string
import polars as pl
import transformers

from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from transformers import (TextClassificationPipeline)

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import MinMaxScaler

# Variable definition for NLP

In [143]:
tokenizer_name = "distilbert-base-uncased"
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(tokenizer_name)
trn = transformers.DistilBertForSequenceClassification.from_pretrained(model_name).cpu()
pipe = TextClassificationPipeline(model=trn, tokenizer=tokenizer, return_all_scores=True)

stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
only_english = set(nltk.corpus.words.words())



# List all reviews parquet files to be loaded

In [144]:
path = '../data/raw/reviews'
with os.scandir(path) as reviewFiles:
    reviewFiles = [file.name for file in reviewFiles if file.is_file()]
print(reviewFiles)

['reviews1.parquet', 'reviews10.parquet', 'reviews11.parquet', 'reviews12.parquet', 'reviews13.parquet', 'reviews14.parquet', 'reviews15.parquet', 'reviews16.parquet', 'reviews2.parquet']


# Functions

In [145]:
def correct_text(text, stem=False, lemma=False, spell=False):
    if lemma and stem:
        raise Exception('Either stem or lemma can be true, not both!')
        return text
    
    sample = text
    
    #removing stopwords
    sample = sample.lower()
    sample = [word for word in sample.split() if not word in stops]
    sample = ' '.join(sample)
    
    if lemma:
        sample = sample.split()
        sample = [lemmatizer.lemmatize(word) for word in sample]
        sample = ' '.join(sample)
        
    if stem:
        sample = sample.split()
        sample = [ps.stem(word) for word in sample]
        sample = ' '.join(sample)
    
    if spell:
        sample = str(TextBlob(text).correct())
    
    return sample

In [146]:
def clean_text(text):
    
    sample = text
    sample = " ".join([x.lower() for x in sample.split()])
    sample = re.sub(r"\S*https?:\S*", '', sample) #links and urls
    sample = re.sub('\[.*?\]', '', sample) #text between [square brackets]
    sample = re.sub('\(.*?\)', '', sample) #text between (parenthesis)
    sample = re.sub('[%s]' % re.escape(string.punctuation), '', sample) #punctuations
    sample = re.sub('\w*\d\w', '', sample) #digits with trailing or preceeding text
    sample = re.sub(r'\n', ' ', sample) #new line character
    sample = re.sub(r'\\n', ' ', sample) #new line character
    sample = re.sub("[''""...“”‘’…]", '', sample) #list of quotation marks
    sample = re.sub(r', /<[^>]+>/', '', sample)    #HTML attributes
    
    sample = ' '.join([w for w in nltk.wordpunct_tokenize(sample) if w.lower() in only_english or not w.isalpha()]) #doesn't remove indian languages
    sample = ' '.join(list(filter(lambda ele: re.search("[a-zA-Z\s]+", ele) is not None, sample.split()))) #languages other than english
    
    sample = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE).sub(r'', sample) #emojis and symbols
    sample = sample.strip()
    sample = " ".join([x.strip() for x in sample.split()])
    
    return sample

In [147]:
def get_sentiment(r):
    sentiment = pipe(r)
    neg = sentiment[0][0]
    pos = sentiment[0][1]
    return neg['score'], pos['score']


In [148]:
def min_max_scaling(x, col):
    return (x - X[col].min() ) / ( X[col].max() - X[col].min()) 

# Load all parquet files in the directory

In [149]:
df = pl.DataFrame()
for f in reviewFiles:
    n = str.format("../data/raw/reviews/{}", f)
    print("reading " + n)

    df_aux = pl.read_parquet(n, n_rows=3000).drop(["asin", "reviewerID", "reviewerName", "unixReviewTime", "style", "vote", "image"])

    print("Adding..." + n)
    df = df.vstack(df_aux)
    n = ""


reading ../data/raw/reviews/reviews1.parquet
Adding...../data/raw/reviews/reviews1.parquet
reading ../data/raw/reviews/reviews10.parquet
Adding...../data/raw/reviews/reviews10.parquet
reading ../data/raw/reviews/reviews11.parquet
Adding...../data/raw/reviews/reviews11.parquet
reading ../data/raw/reviews/reviews12.parquet
Adding...../data/raw/reviews/reviews12.parquet
reading ../data/raw/reviews/reviews13.parquet
Adding...../data/raw/reviews/reviews13.parquet
reading ../data/raw/reviews/reviews14.parquet
Adding...../data/raw/reviews/reviews14.parquet
reading ../data/raw/reviews/reviews15.parquet
Adding...../data/raw/reviews/reviews15.parquet
reading ../data/raw/reviews/reviews16.parquet
Adding...../data/raw/reviews/reviews16.parquet
reading ../data/raw/reviews/reviews2.parquet
Adding...../data/raw/reviews/reviews2.parquet


In [150]:
df.head()

overall,reviewText,summary,verified
str,str,str,str
"""5.0""","""Crazy Taxi is …","""It's Party Tim…","""false"""
"""4.0""","""I love these p…","""Durable, good …","""true"""
"""2.0""","""Anyone who has…","""A fun game tha…","""false"""
"""5.0""","""Great pants an…","""Great Product""","""true"""
"""4.0""","""In this game y…","""Fun and Entert…","""false"""


In [151]:
print(type(df))

<class 'polars.dataframe.frame.DataFrame'>


In [152]:
df.shape

(27000, 4)

In [153]:
df.null_count()

overall,reviewText,summary,verified
u32,u32,u32,u32
0,18,6,0


In [154]:
df = df.drop_nulls()

In [155]:
df.shape

(26977, 4)

# Preprocessing

In [156]:
# nuevas columnas numericas
df = df.with_columns(( pl.col("reviewText").apply(lambda x: len(str(x).split(" ")) ) ).alias("cant_words_in_review"))
df = df.with_columns(( pl.col("summary").apply(lambda x: len(str(x).split(" ")) ) ).alias("cant_words_in_sumary"))
df.sample(1)


overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary
str,str,str,str,i64,i64
"""4.0""","""The sex parts …","""Four Stars""","""true""",14,2


In [157]:
df.describe()

describe,overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary
str,str,str,str,str,f64,f64
"""count""","""26977""","""26977""","""26977""","""26977""",26977.0,26977.0
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0
"""mean""",,,,,54.014494,4.37139
"""std""",,,,,101.310756,3.677341
"""min""","""1.0""",""" 5 STAR REVIEW…",""" Yummy""","""false""",1.0,1.0
"""max""","""5.0""","""~I received a …","""~~~GameCube is…","""true""",3251.0,31.0
"""median""",,,,,24.0,3.0
"""25%""",,,,,8.0,2.0
"""75%""",,,,,56.0,6.0


In [158]:
df = df.filter(pl.any(pl.col('cant_words_in_review') < 512))

  df = df.filter(pl.any(pl.col('cant_words_in_review') < 512))


In [159]:
df.describe()

describe,overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary
str,str,str,str,str,f64,f64
"""count""","""26753""","""26753""","""26753""","""26753""",26753.0,26753.0
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0
"""mean""",,,,,47.941165,4.344522
"""std""",,,,,69.541504,3.659594
"""min""","""1.0""",""" 5 STAR REVIEW…",""" Yummy""","""false""",1.0,1.0
"""max""","""5.0""","""~I received a …","""~~~GameCube is…","""true""",511.0,31.0
"""median""",,,,,23.0,3.0
"""25%""",,,,,8.0,2.0
"""75%""",,,,,55.0,6.0


# Prepare review column to determinate the sentiment

In [160]:
df = df.with_columns(( pl.col("reviewText").apply(lambda x: clean_text(x)) ).alias("cleaned_text"))
df = df.with_columns(( pl.col("cleaned_text").apply(lambda x: correct_text(x)) ).alias("correct_text"))
df.sample(1)

overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary,cleaned_text,correct_text
str,str,str,str,i64,i64,str,str
"""5.0""","""Loved this boo…","""Read this book…","""true""",133,3,"""this book its …","""book nonstop a…"


# Determinate sentiment of the review

In [161]:
df = df.with_columns(( pl.col("correct_text").apply(lambda x: get_sentiment(x)) ).alias("sentiment"))
df.sample(1)

overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary,cleaned_text,correct_text,sentiment
str,str,str,str,i64,i64,str,str,list[f64]
"""4.0""","""I like gun fir…","""Four Stars""","""true""",7,2,"""i like gun fir…","""like gun fire …","[0.889135, 0.110865]"


In [162]:
df = df.with_columns(( pl.col("sentiment").apply(lambda x: x[0]) ).alias("sentiment_negative"))
df = df.with_columns(( pl.col("sentiment").apply(lambda x: x[1]) ).alias("sentiment_positive"))
df.sample(1)

overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary,cleaned_text,correct_text,sentiment,sentiment_negative,sentiment_positive
str,str,str,str,i64,i64,str,str,list[f64],f64,f64
"""5.0""","""Morgan Llywely…","""Another brilli…","""false""",67,5,"""morgan done it…","""morgan done wo…","[0.000158, 0.999842]",0.000158,0.999842


# Encode verified column

In [163]:
df = df.to_dummies("verified")

# Delete no numerical columns

In [164]:
df = df.drop(["reviewText", "summary", "cleaned_text", "correct_text", "sentiment"] )
df.sample(1)

overall,verified_false,verified_true,cant_words_in_review,cant_words_in_sumary,sentiment_negative,sentiment_positive
str,u8,u8,i64,i64,f64,f64
"""5.0""",0,1,6,2,0.000234,0.999766


In [165]:
# se cambia el tipo de datos de la columna overall
df = df.with_columns([pl.col("overall").cast(pl.Float64)])
df.sample(1)

overall,verified_false,verified_true,cant_words_in_review,cant_words_in_sumary,sentiment_negative,sentiment_positive
f64,u8,u8,i64,i64,f64,f64
5.0,0,1,26,12,0.005367,0.994633


# Create X y 

In [166]:
y = df["overall"]
X= df.drop("overall")

In [167]:
X.describe()

describe,verified_false,verified_true,cant_words_in_review,cant_words_in_sumary,sentiment_negative,sentiment_positive
str,f64,f64,f64,f64,f64,f64
"""count""",26753.0,26753.0,26753.0,26753.0,26753.0,26753.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",0.20764,0.79236,47.941165,4.344522,0.292045,0.707955
"""std""",0.405625,0.405625,69.541504,3.659594,0.425459,0.425459
"""min""",0.0,0.0,1.0,1.0,0.000108,0.000182
"""max""",1.0,1.0,511.0,31.0,0.999818,0.999892
"""median""",0.0,1.0,23.0,3.0,0.004247,0.995753
"""25%""",0.0,1.0,8.0,2.0,0.000258,0.121261
"""75%""",0.0,1.0,55.0,6.0,0.878739,0.999742


# Scaling the cant_words_in_review and cant_words_in_sumary columns

In [168]:
X = X.with_columns(( pl.col("cant_words_in_review")
                    .apply(lambda x: min_max_scaling(x, "cant_words_in_review") ) )
                    .alias("cant_words_in_review_scaled"))

X = X.with_columns(( pl.col("cant_words_in_sumary")
                    .apply(lambda x: min_max_scaling(x, "cant_words_in_sumary") ) )
                    .alias("cant_words_in_sumary_scaled"))
X.sample(1)


verified_false,verified_true,cant_words_in_review,cant_words_in_sumary,sentiment_negative,sentiment_positive,cant_words_in_review_scaled,cant_words_in_sumary_scaled
u8,u8,i64,i64,f64,f64,f64,f64
0,1,13,3,0.000455,0.999545,0.023529,0.066667


In [169]:
X = X.drop(["cant_words_in_review", "cant_words_in_sumary", "verified_true"])

In [170]:
# chequeando skew
for i in X.columns:
    print(X.select(pl.col(i).skew()))

shape: (1, 1)
┌────────────────┐
│ verified_false │
│ ---            │
│ f64            │
╞════════════════╡
│ 1.441553       │
└────────────────┘
shape: (1, 1)
┌────────────────────┐
│ sentiment_negative │
│ ---                │
│ f64                │
╞════════════════════╡
│ 0.914655           │
└────────────────────┘
shape: (1, 1)
┌────────────────────┐
│ sentiment_positive │
│ ---                │
│ f64                │
╞════════════════════╡
│ -0.914655          │
└────────────────────┘
shape: (1, 1)
┌─────────────────────────────┐
│ cant_words_in_review_scaled │
│ ---                         │
│ f64                         │
╞═════════════════════════════╡
│ 3.041991                    │
└─────────────────────────────┘
shape: (1, 1)
┌─────────────────────────────┐
│ cant_words_in_sumary_scaled │
│ ---                         │
│ f64                         │
╞═════════════════════════════╡
│ 2.001908                    │
└─────────────────────────────┘


In [171]:
# chequeando multicolinearity
import pandas as pd
import pyarrow as pa

x_pd = pd.DataFrame()
vif = pd.DataFrame()

x_pd = X.to_pandas()
vif["variables"] = x_pd.columns
vif["VIF"] = [variance_inflation_factor(x_pd.values, i) for i in range(x_pd.shape[1])]
vif


Unnamed: 0,variables,VIF
0,verified_false,1.564802
1,sentiment_negative,1.545145
2,sentiment_positive,1.596611
3,cant_words_in_review_scaled,1.96916
4,cant_words_in_sumary_scaled,1.996974


# Saving processed files

In [172]:
X.write_csv("../data/processed/X.csv")
y.to_frame().write_csv("../data/processed/y.csv")