In [96]:
import pandas as pd
import re
from langdetect import detect, DetectorFactory
import spacy
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import  resample
from sklearn.metrics import classification_report, confusion_matrix
from keras.models import  Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [61]:
def load_dataframe(filename):
    # Initialize DataFrame
    df = pd.read_csv(filename)
    
    return df

df = load_dataframe("amazon_uk_shoes_products_dataset_2021.csv")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25
2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5.0,True,Reviewed in the United States on 20 January 2021,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/2021 02:26:25
3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5.0,True,Reviewed in the United States on 22 April 2021,,75a42851-6462-54b5-988a-27d336221943,24/12/2021 02:26:25
4,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Graziella,PERFETTE!!,Ho scelto il modello bianco con rifinitura die...,5.0,True,Reviewed in Italy on 2 April 2021,2 people found this helpful,232dee43-849e-5d06-ba05-efb3f4814714,24/12/2021 02:26:25


In [62]:
df = df.dropna()  # Get ride of missing reviews

df.isna().sum().sum()  # Print missing values (0)


0

In [63]:
def get_sentiment(review):
    # Classifies the sentiment of the text into positive, negative or neutral using the review ratings
    if review >= 2.5:
        return 1
    else:
        return 0

def classify_sentiment(dataframe, review_col):
    """
    Takes the review and classifies a positive or negative sentiment.
    Inserts the sentiment in a new column.
    Returns the dataframe
    """
    dataframe["sentiment"] = dataframe[review_col].apply(get_sentiment) # Classify sentiment of tokens and insert into column
    
    return dataframe

df = classify_sentiment(df, "review_rating")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25,1
4,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Graziella,PERFETTE!!,Ho scelto il modello bianco con rifinitura die...,5.0,True,Reviewed in Italy on 2 April 2021,2 people found this helpful,232dee43-849e-5d06-ba05-efb3f4814714,24/12/2021 02:26:25,1
6,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Cliente,Molto belle,"Le scarpe sono molto belle, calzano perfettamente",5.0,True,Reviewed in Italy on 8 April 2021,2 people found this helpful,deb5e278-70b5-5e2c-9ad7-93bf5c26a41d,24/12/2021 02:26:25,1
14,https://www.amazon.co.uk/dp/B07S1XM3L7,"adidas Women's Retrorun Shoes Running, Core Bl...",Lindsay,Perfect right outta the box,True to size. If between I'd probably go with ...,5.0,True,Reviewed in Canada on 20 October 2021,One person found this helpful,b64632c5-6f24-51eb-9275-6614fed29f1a,24/12/2021 02:26:25,1
17,https://www.amazon.co.uk/dp/B0125TMZGK,"Aravon Women's Betty-AR Oxfords, Stone, 5.5 UK",Amazon Customer,Comfortable and attractive,I have hard to fit feet and often a wide fitti...,5.0,True,Reviewed in Canada on 8 October 2018,2 people found this helpful,bce0114a-c0fe-5472-bbb8-377cb21dc853,24/12/2021 02:26:25,1


In [None]:
print(f"Target value count: {df["sentiment"].value_counts()}")  # Check balance of target values

def random_oversampling(df:pd.DataFrame, target:str):
    # Reduces the number of samples from the majority class
    
    df_majority = df[df[target] == 0]  # Majority class (low risk)
    df_minority = df[df[target] == 1]  # Minority class (high risk)
    
    # Downsample majority class
    df_majority_downsampled = resample(df_majority, 
        n_samples=len(df_minority),  # Match minority class size
        random_state=42)

    # Combine downsampled majority class with minority class
    df_balanced = pd.concat([df_majority_downsampled, df_minority])

    # Shuffle dataset
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Balanced target ratio: {df_balanced[target].value_counts()}")
    
    return df_balanced

df = random_oversampling(df, "sentiment")

_, df = train_test_split(df, stratify=df["sentiment"], random_state=42, test_size=200)  # Sample 200 records

Target value count: sentiment
1    1523
0     427
Name: count, dtype: int64
Balanced target ratio: sentiment
0    1523
1    1523
Name: count, dtype: int64


In [66]:
DetectorFactory.seed = 0 # Ensure consistent results for langauge classification

def is_english(text):
    # Checks if a text is English or not. Returns True if it is English, False if otherwise.
    try:
        return detect(text) == "en" # Checks if text is English
    except:
        return False # In case of error such as empty string

def filter_english(dataframe, text_col):
    """
    Filters out any rows that contain Non-English language.
    """
    dataframe['is_english'] = dataframe[text_col].apply(is_english) # Create new boolean column to classify if the text is english
    english_df = dataframe[dataframe['is_english']] # New DataFrame that only has rows with english values
    
    english_df = english_df.drop(columns=["is_english"]) # Drop is_english column
    
    return english_df

df = filter_english(df, "review_text")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable. I have wide feet and fl...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,"I,purchased these primarily because of the new...",1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0


In [67]:
# Text Cleaning and Regular Expression
def regex(text):
    """
    Applies regular expression to a text to remove punctuation marks
    """
    text = re.sub(r'[^\w\s]', "", text) # Replace punctuation marks with empty string
    text = re.sub(r'[\s+]', " ", text) # Replace multiple spaces with one space
    
    return text.strip()
    
def clean_text(dataframe, text_col):
    """
    Ensures data is consistent and removes punctuation for better model performance.
    """
    dataframe = dataframe.dropna(subset=[text_col]) # Remove rows with missing values in text column
    
    dataframe[text_col] = dataframe[text_col].apply(regex) # Remove punctuation marks
    
    return dataframe

df = clean_text(df, "review_text")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable I have wide feet and fla...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,Ipurchased these primarily because of the new ...,1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0


In [68]:
# Tokenizaton
def tokenize(text):
    # Tokenizes a text and returns the tokens
    
    nlp = spacy.load("en_core_web_sm") # Create NLP Pipeline
    
    doc = nlp(text) # Process the text
    tokens = [token.text for token in doc] # Stores the tokens
    
    return tokens

def tokenize_words(dataframe, text_col):
    """
    Tokenizes every row in the text column down. Creates a new column containing tokenized words.
    Returns the new DataFrame.
    """
    dataframe["tokenized_words"] = dataframe[text_col].apply(tokenize) # Tokenize words and add to new column
    
    return dataframe

df = tokenize_words(df, "review_text")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment,tokenized_words
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable I have wide feet and fla...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1,"[Extremely, comfortable, I, have, wide, feet, ..."
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0,"[size, indicators, very, misleading, this, sho..."
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0,"[I, was, hoping, for, good, arch, support, and..."
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0,"[I, got, these, to, replace, my, Gel, Excite, ..."
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,Ipurchased these primarily because of the new ...,1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0,"[Ipurchased, these, primarily, because, of, th..."


In [69]:
# Stop Word Removal
def stop_word_filter(tokens):
    # Removes stop words from an array of tokens and returns the filtered tokens
    nlp = spacy.load("en_core_web_sm") # Create NLP Pipeline
    stop_words = nlp.Defaults.stop_words # Create a list of stop words
    
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words] # Remove stop words
    
    return filtered_tokens

def remove_stop_words(dataframe, token_col):
    """
    Removes stop words from tokens array and returns the DataFrame.
    """
    dataframe[token_col] = dataframe[token_col].apply(stop_word_filter)
    
    return dataframe

df = remove_stop_words(df, "tokenized_words")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment,tokenized_words
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable I have wide feet and fla...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1,"[Extremely, comfortable, wide, feet, flat, fee..."
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0,"[size, indicators, misleading, shoe, approxima..."
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0,"[hoping, good, arch, support, feet, hurting, c..."
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0,"[got, replace, Gel, Excite, 7s, quiz, Asics, w..."
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,Ipurchased these primarily because of the new ...,1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0,"[Ipurchased, primarily, new, balance, , GREAT..."


In [70]:
# Lemmatization
def lemmatize(tokens):
    """
    Lemmatizes the text and returns the lemmatized words.
    """
    nlp = spacy.load("en_core_web_sm") # Create NLP Pipeline
    
    text = " ".join(tokens) # Create a text version of the tokens
    doc = nlp(text) # Process the text
    lemmatized_tokens = [token.lemma_ for token in doc]
    
    return lemmatized_tokens

def lemmatize_tokens(dataframe, token_col):
    """
    Lemmatizes the tokens in the token column and returns the DataFrame.
    """
    dataframe[token_col] = dataframe[token_col].apply(lemmatize)
    
    return dataframe

df = lemmatize_tokens(df, "tokenized_words")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment,tokenized_words
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable I have wide feet and fla...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1,"[extremely, comfortable, wide, foot, flat, foo..."
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0,"[size, indicator, mislead, shoe, approximately..."
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0,"[hope, good, arch, support, foot, hurt, couple..."
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0,"[get, replace, Gel, Excite, 7s, quiz, Asics, w..."
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,Ipurchased these primarily because of the new ...,1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0,"[ipurchase, primarily, new, balance, , GREAT..."


In [71]:
# Vectorization
def vectorize(tokens):
    # Vectorizes the array of tokens and returns the array of vectors.
    nlp = spacy.load("en_core_web_lg") # Create NLP Pipeline
    
    text = " ".join(tokens) # Create a text version of the tokens
    doc = nlp(text) # Process the text
    
    vectorized_tokens = [doc.vector] # Vectorize tokens and insert into a list
    
    return vectorized_tokens
    
def vectorize_tokens(dataframe, token_col):
    """
    Takes tokens and provides a new column containing their vectors. 
    Returns the DataFrame.
    """
    nlp = spacy.load("en_core_web_lg") # Create NLP Pipeline
    
    dataframe["Vectors"] = dataframe[token_col].apply(vectorize) # Vectorize tokens and add to Vectors column
    
    return dataframe

df = vectorize_tokens(df, "tokenized_words")

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment,tokenized_words,Vectors
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable I have wide feet and fla...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1,"[extremely, comfortable, wide, foot, flat, foo...","[[0.4017035, 2.2142365, -3.7027583, 1.3324287,..."
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0,"[size, indicator, mislead, shoe, approximately...","[[-1.135092, 0.15613683, -3.836133, 3.7433767,..."
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0,"[hope, good, arch, support, foot, hurt, couple...","[[-0.25005546, 1.89189, -3.4961765, 0.05091668..."
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0,"[get, replace, Gel, Excite, 7s, quiz, Asics, w...","[[0.095304824, 2.366934, -2.627392, 0.4291453,..."
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,Ipurchased these primarily because of the new ...,1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0,"[ipurchase, primarily, new, balance, , GREAT...","[[-0.6260551, 0.7344375, -1.2596749, 1.1397091..."


In [72]:
def get_model_data(dataframe, cols):
    """
    Drops all columns that are not specified in the cols array and returns the DataFrame.
    """
    new_dataframe = dataframe[cols]
    
    return new_dataframe

cols = ["Vectors", "sentiment"]  # Define X and y features to isolate
model_df = get_model_data(df, cols)

model_df.head()

Unnamed: 0,Vectors,sentiment
1192,"[[0.4017035, 2.2142365, -3.7027583, 1.3324287,...",1
338,"[[-1.135092, 0.15613683, -3.836133, 3.7433767,...",0
1088,"[[-0.25005546, 1.89189, -3.4961765, 0.05091668...",0
866,"[[0.095304824, 2.366934, -2.627392, 0.4291453,...",0
1512,"[[-0.6260551, 0.7344375, -1.2596749, 1.1397091...",0


## Sentiment Analysis

In [73]:
def split_data(X, y):
    """
    Takes X and y variables and splits into train and test data.
    Returns X and y train and test data.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.75, test_size=0.25) # Split data into train and test
    
    return X_train, X_test, y_train, y_test

# Define X and y
X = model_df["Vectors"]
y = model_df["sentiment"]

X_train, X_test, y_train, y_test = split_data(X, y)  # Split data into train and test

In [93]:
def lstm_model(X_train, X_test, y_train, y_test):
    """
    Creates an LSTM model and returns the predicted y values.
    Makes use of padding to ensure that the sequences are of equal length.
    Returns the predicted classifications of the values.
    """
    # Ensure train and test data are numpy arrays
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    
    max_sequence_length = 20 # Define max length for padding
    # Pad the sequences of train and test data
    X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length, padding="post", truncating="post", value=0.0)
    X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length, padding="post", truncating="post", value=0.0)
    print(f"X_train_padded: {X_train_padded}")
    
    model = Sequential() # Initialize model
    # Add LSTM model
    model.add(LSTM(128, input_shape=(X_train_padded.shape[1], X_train_padded.shape[2]), return_sequences=False))
    
    model.add(Dropout(0.5)) # Add dropout layer to prevent overfitting
    model.add(Dense(1, activation="sigmoid")) # Output layer
    
    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy']) # Compile model
    model.fit(X_train_padded, y_train, epochs=200, batch_size=64, validation_data=(X_test_padded, y_test)) # Fit the model
    
    y_pred_prob = model.predict(X_test_padded) # Predict probability of positive classification
    y_pred = [int(pred >= 0.5) for pred in y_pred_prob] # Get a list of the classified values
    
    return y_pred

lstm_y_pred = lstm_model(X_train, X_test, y_train, y_test)  # Get LSTM Model predictions

X_train_padded: [[[-2  1 -3 ...  0 -2  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[ 0  0 -1 ... -1 -1  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[ 0  2 -2 ...  0 -3  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 ...

 [[ 0  0 -2 ...  0 -1  1]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[ 0  1  0 ...  0 -1  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[ 0  1 -2 ...  0 -1  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0 

  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 189ms/step - accuracy: 0.4290 - loss: 0.6935 - val_accuracy: 0.4737 - val_loss: 0.6924
Epoch 2/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.5231 - loss: 0.6914 - val_accuracy: 0.5526 - val_loss: 0.6908
Epoch 3/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.6206 - loss: 0.6877 - val_accuracy: 0.7368 - val_loss: 0.6874
Epoch 4/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.7168 - loss: 0.6831 - val_accuracy: 0.6579 - val_loss: 0.6794
Epoch 5/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.7782 - loss: 0.6717 - val_accuracy: 0.6842 - val_loss: 0.6609
Epoch 6/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.8535 - loss: 0.6371 - val_accuracy: 0.6842 - val_loss: 0.6226
Epoch 7/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[

  y_pred = [int(pred >= 0.5) for pred in y_pred_prob] # Get a list of the classified values


## Analyse Results

In [94]:
clf_report = classification_report(y_test, lstm_y_pred)

print(f"Classificaton Report:\n\n{clf_report}")

Classificaton Report:

              precision    recall  f1-score   support

           0       0.89      0.80      0.84        20
           1       0.80      0.89      0.84        18

    accuracy                           0.84        38
   macro avg       0.84      0.84      0.84        38
weighted avg       0.85      0.84      0.84        38



## Aspect Based Sentiment Analysis


In [99]:
nlp = spacy.load("en_core_web_lg") # Load NLP Pipeline

def absa_model(text):
    """
    Takes in text and determines the text's aspects and the opinions associated with the aspects.
    Uses a list of words and sentiment polarity to classify the sentiment of the opinion.
    Aspects are determined as noun chunks.
    Opinions are determined as adjectives or adverbs.
    Returns a list with a tuple that contains the aspect, opinion and sentiment of the opinion.
    """
    # Define positive and negative words
    positive_words = ["amazing", "good", "great", "excellent", "fantastic", "positive", "happy"]
    negative_words = ["poor", "bad", "terrible", "horrible", "negative", "unhappy", "disappointing"]
    
    doc = nlp(text) # Process the text
    # Initialize lists for results
    aspects, opinions, sentiments = [], [], []
    
    # Get aspects by taking the noun chunks
    for chunk in doc.noun_chunks:
        aspect = chunk
        aspects.append(aspect)
        
    # Generate opinions by getting adjectives or adverbs
    opinions = [token.text for token in doc if token.pos_ in ["ADJ", "ADV"]]
    
    # Get Sentiment of each opinion
    for opinion in opinions:
        if opinion in positive_words:
            sentiment = "positive"
        elif opinion in negative_words:
            sentiment = "negative"
        else: # Use textblob for polarity
            sentiment = "positive" if TextBlob(opinion).sentiment.polarity >= 0 else "negative"
        
        sentiments.append(sentiment) # Add sentiment to the list
    
    return list(zip(aspects, opinions, sentiments))

df["ABSA"] = df["review_text"].apply(absa_model)

df.head()

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at,sentiment,tokenized_words,Vectors,ABSA
1192,https://www.amazon.co.uk/dp/B01MG4H26L,"SAS Men's, Journey Mesh Walking Sneakers Gray ...",Nathan B. Starr,Most comfy walking shoes I've ever owned.,Extremely comfortable I have wide feet and fla...,5.0,True,Reviewed in the United States on 13 August 2019,One person found this helpful,e745dfd8-2d13-5217-9309-7c7fe4079bb9,24/12/2021 02:26:59,1,"[extremely, comfortable, wide, foot, flat, foo...","[[0.4017035, 2.2142365, -3.7027583, 1.3324287,...","[((I), Extremely, negative), ((wide, feet), co..."
338,https://www.amazon.co.uk/dp/B01MYWLW8X,PUMA Women's Sky II Hi Explosive Gray Violet/G...,zucow,Two Stars,size indicators very misleading this shoe appr...,2.0,True,Reviewed in the United States on 8 October 2017,One person found this helpful,8be4f767-699a-51d8-8166-fe118e8df32b,24/12/2021 02:28:10,0,"[size, indicator, mislead, shoe, approximately...","[[-1.135092, 0.15613683, -3.836133, 3.7433767,...","[((size, indicators), very, positive), ((this,..."
1088,https://www.amazon.co.uk/dp/B07VCTMM58,"Merrell Women's J033324 Hiking Shoe, Aluminum,...",musical_one,Not enough arch support if you need that,I was hoping for good arch support and these d...,2.0,True,Reviewed in the United States on 28 September ...,10 people found this helpful,ec6c1714-00b6-595f-b880-b56c0c690459,24/12/2021 02:28:46,0,"[hope, good, arch, support, foot, hurt, couple...","[[-0.25005546, 1.89189, -3.4961765, 0.05091668...","[((I), good, positive), ((good, arch, support)..."
866,https://www.amazon.co.uk/dp/B082Q375G1,"ASICS - Womens Roadblast Shoes, 7.5 UK, Black/...",Sarah Weaver,Worst Asics Ever,I got these to replace my Gel Excite 7s after ...,1.0,True,Reviewed in the United States on 3 August 2021,One person found this helpful,efe73d5f-84c9-5626-a683-b354b4e761cc,24/12/2021 02:27:15,0,"[get, replace, Gel, Excite, 7s, quiz, Asics, w...","[[0.095304824, 2.366934, -2.627392, 0.4291453,...","[((I), best, positive), ((these), more, positi..."
1512,https://www.amazon.co.uk/dp/B019EEE84Q,"New Balance Mens MW1400 V1 Shoes, 8 UK - Width...",Jim Sliwa,I am GREATLY disappointed. The soles of the bo...,Ipurchased these primarily because of the new ...,1.0,True,Reviewed in the United States on 27 June 2018,2 people found this helpful,a470c63a-aabd-54b0-a2e1-6491335d7d5e,24/12/2021 02:27:02,0,"[ipurchase, primarily, new, balance, , GREAT...","[[-0.6260551, 0.7344375, -1.2596749, 1.1397091...","[((Ipurchased), primarily, positive), ((these)..."
