In [374]:
import os
import re
import nltk
import string
import polars as pl
import transformers

from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from transformers import (TextClassificationPipeline)

# Variable definition for NLP

In [375]:
tokenizer_name = "distilbert-base-uncased"
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
only_english = set(nltk.corpus.words.words())

# List all reviews parquet files to be loaded

In [376]:
path = '../data/raw/reviews'
with os.scandir(path) as reviewFiles:
    reviewFiles = [file.name for file in reviewFiles if file.is_file()]
print(reviewFiles)

['reviews1.parquet', 'reviews2.parquet']


# Functions

In [377]:
def correct_text(text, stem=False, lemma=False, spell=False):
    if lemma and stem:
        raise Exception('Either stem or lemma can be true, not both!')
        return text
    
    sample = text
    
    #removing stopwords
    sample = sample.lower()
    sample = [word for word in sample.split() if not word in stops]
    sample = ' '.join(sample)
    
    if lemma:
        sample = sample.split()
        sample = [lemmatizer.lemmatize(word) for word in sample]
        sample = ' '.join(sample)
        
    if stem:
        sample = sample.split()
        sample = [ps.stem(word) for word in sample]
        sample = ' '.join(sample)
    
    if spell:
        sample = str(TextBlob(text).correct())
    
    return sample

In [378]:
def clean_text(text):
    
    sample = text
    sample = " ".join([x.lower() for x in sample.split()])
    sample = re.sub(r"\S*https?:\S*", '', sample) #links and urls
    sample = re.sub('\[.*?\]', '', sample) #text between [square brackets]
    sample = re.sub('\(.*?\)', '', sample) #text between (parenthesis)
    sample = re.sub('[%s]' % re.escape(string.punctuation), '', sample) #punctuations
    sample = re.sub('\w*\d\w', '', sample) #digits with trailing or preceeding text
    sample = re.sub(r'\n', ' ', sample) #new line character
    sample = re.sub(r'\\n', ' ', sample) #new line character
    sample = re.sub("[''""...“”‘’…]", '', sample) #list of quotation marks
    sample = re.sub(r', /<[^>]+>/', '', sample)    #HTML attributes
    
    sample = ' '.join([w for w in nltk.wordpunct_tokenize(sample) if w.lower() in only_english or not w.isalpha()]) #doesn't remove indian languages
    sample = ' '.join(list(filter(lambda ele: re.search("[a-zA-Z\s]+", ele) is not None, sample.split()))) #languages other than english
    
    sample = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE).sub(r'', sample) #emojis and symbols
    sample = sample.strip()
    sample = " ".join([x.strip() for x in sample.split()])
    
    return sample

In [379]:
def get_sentiment(r):
    tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(tokenizer_name)
    trn = transformers.DistilBertForSequenceClassification.from_pretrained(model_name).cpu()
    pipe = TextClassificationPipeline(model=trn, tokenizer=tokenizer, return_all_scores=True)
    sentiment = pipe(r)
    neg = sentiment[0][0]
    pos = sentiment[0][1]
    return neg['score'], pos['score']


# Load all parquet files in the directory

In [380]:
df = pl.DataFrame()
for f in reviewFiles:
    n = str.format("../data/raw/reviews/{}", f)
    print("reading " + n)

    df_aux = pl.read_parquet(n, n_rows=5).drop(["asin", "reviewerID", "reviewerName", "unixReviewTime", "style", "vote", "image"])

    print("Adding...")
    df = df.vstack(df_aux)
    n = ""


reading ../data/raw/reviews/reviews1.parquet
Adding...
reading ../data/raw/reviews/reviews2.parquet
Adding...


In [381]:
df.head()

overall,reviewText,summary,verified
str,str,str,str
"""5.0""","""Crazy Taxi is …","""It's Party Tim…","""false"""
"""4.0""","""I love these p…","""Durable, good …","""true"""
"""2.0""","""Anyone who has…","""A fun game tha…","""false"""
"""5.0""","""Great pants an…","""Great Product""","""true"""
"""4.0""","""In this game y…","""Fun and Entert…","""false"""


# Preprocessing

In [382]:
# se cambia el tipo de datos de la colu,na overall
df.with_columns([pl.col("overall").cast(pl.Float64)])
df.sample(1)

overall,reviewText,summary,verified
str,str,str,str
"""5.0""","""This is a most…","""Very unique id…","""false"""


In [383]:
# nuevas columnas numericas
df = df.with_columns(( pl.col("reviewText").apply(lambda x: len(str(x).split(" ")) ) ).alias("cant_words_in_review"))
df = df.with_columns(( pl.col("summary").apply(lambda x: len(str(x).split(" ")) ) ).alias("cant_words_in_sumary"))
df.sample(1)


overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary
str,str,str,str,i64,i64
"""5.0""","""This is a most…","""Very unique id…","""false""",56,3


# Prepare review column to determinate the sentiment

In [384]:
df = df.with_columns(( pl.col("reviewText").apply(lambda x: clean_text(x)) ).alias("cleaned_text"))
df = df.with_columns(( pl.col("cleaned_text").apply(lambda x: correct_text(x)) ).alias("correct_text"))
df.sample(1)

overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary,cleaned_text,correct_text
str,str,str,str,i64,i64,str,str
"""4.0""","""In this game y…","""Fun and Entert…","""false""",99,3,"""in this game y…","""game choose dr…"


# Determinate sentiment of the review

In [385]:
df = df.with_columns(( pl.col("correct_text").apply(lambda x: get_sentiment(x)) ).alias("sentiment"))
df.sample(1)



overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary,cleaned_text,correct_text,sentiment
str,str,str,str,i64,i64,str,str,list[f64]
"""1.0""","""This was a Chr…","""I would recomm…","""false""",54,6,"""this was a gif…","""gift year old …","[0.927894, 0.072106]"


In [386]:
df = df.with_columns(( pl.col("sentiment").apply(lambda x: x[0]) ).alias("sentiment_negative"))
df = df.with_columns(( pl.col("sentiment").apply(lambda x: x[1]) ).alias("sentiment_positive"))
df.sample(2)

overall,reviewText,summary,verified,cant_words_in_review,cant_words_in_sumary,cleaned_text,correct_text,sentiment,sentiment_negative,sentiment_positive
str,str,str,str,i64,i64,str,str,list[f64],f64,f64
"""5.0""","""Great pants an…","""Great Product""","""true""",5,2,"""great pants an…","""great pants ni…","[0.000467, 0.999533]",0.000467,0.999533
"""4.0""","""I love these p…","""Durable, good …","""true""",87,8,"""i love these p…","""love pants us …","[0.00074, 0.99926]",0.00074,0.99926


In [393]:
#usar to_dummies de polars

<class 'polars.series.series.Series'>


verified
str
"""false"""
"""true"""
"""false"""
"""true"""
"""false"""
"""true"""
"""true"""
"""false"""
"""false"""
"""true"""


# Create X y 

In [None]:
# X = df.drop_in_place(["overall", ])
# y = df["overall"]