In [None]:
import pandas as pd
import re
from langdetect import detect, DetectorFactory
import spacy
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from keras.models import  Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [3]:
def load_dataframe(filename):
    """
    Reads a CSV file into a DataFrame.

    Parameters:
        filename: Path (or file-like object) handed straight to pandas.read_csv.

    Returns:
        The DataFrame produced by pandas.read_csv with its default settings.
    """
    return pd.read_csv(filename)

# Load the review dataset; the relative path is resolved against the notebook's working directory.
df = load_dataframe("amazon_uk_shoes_products_dataset_2021.csv")

df.head()  # Preview the first rows

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2.0,True,Reviewed in the United States on 28 October 2021,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/2021 02:26:25
2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5.0,True,Reviewed in the United States on 20 January 2021,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/2021 02:26:25
3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5.0,True,Reviewed in the United States on 22 April 2021,,75a42851-6462-54b5-988a-27d336221943,24/12/2021 02:26:25
4,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Graziella,PERFETTE!!,Ho scelto il modello bianco con rifinitura die...,5.0,True,Reviewed in Italy on 2 April 2021,2 people found this helpful,232dee43-849e-5d06-ba05-efb3f4814714,24/12/2021 02:26:25


In [None]:
# NOTE(review): dropna() with no arguments removes a row when ANY column is missing,
# not only when the review itself is missing -- e.g. rows with an empty helpful_count
# are dropped as well. Consider dropna(subset=[...]) if only the review columns matter.
df = df.dropna()  # Get rid of rows with missing values

df.isna().sum().sum()  # Total number of missing values left (expected 0)


In [8]:
DetectorFactory.seed = 0 # Ensure consistent results for language classification

def is_english(text):
    """
    Checks if a text is classified as English by langdetect.

    Parameters:
        text: The value to classify; expected to be a string.

    Returns:
        True only when detection succeeds and yields the code "en".
        False for non-string values, blank strings and any text the detector
        cannot classify.
    """
    if not isinstance(text, str) or not text.strip():
        return False # Nothing to detect, skip the detector call

    try:
        return detect(text) == "en" # Checks if text is English
    except Exception:
        # A failed detection counts as "not English". Catching Exception instead of
        # using a bare except lets KeyboardInterrupt/SystemExit propagate, so a long
        # .apply() over the reviews can still be interrupted.
        return False

def filter_english(dataframe, text_col):
    """
    Filters out any rows that contain Non-English language.

    Parameters:
        dataframe: DataFrame holding the text column.
        text_col: Name of the column whose values are passed one by one to is_english.

    Returns:
        A new DataFrame containing only the rows for which is_english returned True.
        Columns and index labels of the kept rows are unchanged, and the DataFrame
        that was passed in is not modified.
    """
    # Build the mask as a standalone Series rather than a temporary "is_english" column,
    # so nothing is ever written to the caller's DataFrame. astype(bool) keeps the mask
    # boolean even for an empty frame, where apply() may return an object Series.
    english_mask = dataframe[text_col].apply(is_english).astype(bool)

    return dataframe[english_mask].copy()

# Keep only the rows whose review text is detected as English.
df = filter_english(df, "review_text")

df.head()  # Preview the filtered rows

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25
14,https://www.amazon.co.uk/dp/B07S1XM3L7,"adidas Women's Retrorun Shoes Running, Core Bl...",Lindsay,Perfect right outta the box,True to size. If between I'd probably go with ...,5.0,True,Reviewed in Canada on 20 October 2021,One person found this helpful,b64632c5-6f24-51eb-9275-6614fed29f1a,24/12/2021 02:26:25
17,https://www.amazon.co.uk/dp/B0125TMZGK,"Aravon Women's Betty-AR Oxfords, Stone, 5.5 UK",Amazon Customer,Comfortable and attractive,I have hard to fit feet and often a wide fitti...,5.0,True,Reviewed in Canada on 8 October 2018,2 people found this helpful,bce0114a-c0fe-5472-bbb8-377cb21dc853,24/12/2021 02:26:25
21,https://www.amazon.co.uk/dp/B0125TMZGK,"Aravon Women's Betty-AR Oxfords, Stone, 5.5 UK",Burger Lover - Dalton,Great quality and comfort shoes,Great quality and comfort shoesI was so thrill...,5.0,True,Reviewed in the United States on 27 August 2020,2 people found this helpful,f1bafbd5-ff6a-52c7-8929-68bf9da9eda2,24/12/2021 02:26:25
22,https://www.amazon.co.uk/dp/B07S1XM3L7,"adidas Women's Retrorun Shoes Running, Core Bl...",B. Slate,NO SUPPORT! NOT FOR RUNNING!,I would NOT recommend these for running. They ...,1.0,True,Reviewed in the United States on 12 July 2020,19 people found this helpful,1bd3f6f9-6e70-50a8-a913-6c9af4f8c7c7,24/12/2021 02:26:25


In [9]:
# Text Cleaning and Regular Expression
def regex(text):
    """
    Applies regular expressions to a text to remove punctuation marks and tidy whitespace.

    Every character that is neither a word character (letter, digit, underscore) nor
    whitespace is deleted. Each run of whitespace is then collapsed into one space and
    leading/trailing whitespace is trimmed.

    Parameters:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'[^\w\s]', "", text) # Replace punctuation marks with empty string
    # r'\s+' matches a whole run of whitespace. The earlier r'[\s+]' was a character class
    # that replaced characters one at a time, so repeated spaces were never collapsed.
    text = re.sub(r'\s+', " ", text) # Replace multiple spaces with one space

    return text.strip()
    
def clean_text(dataframe, text_col):
    """
    Prepares the text column for modelling.

    Rows with a missing value in the text column are discarded, then regex() is
    applied to every remaining entry of that column. Returns the resulting DataFrame.
    """
    present = dataframe.dropna(subset=[text_col]) # Only rows that actually have text
    stripped = present[text_col].apply(regex) # Cleaned text, one entry per remaining row
    present[text_col] = stripped

    return present

# Strip punctuation from the review text (rows with a missing review are dropped).
df = clean_text(df, "review_text")

df.head()  # Preview the cleaned text

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these Was looking for converses and these...,5.0,True,Reviewed in the United States on 2 June 2020,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/2021 02:26:25
14,https://www.amazon.co.uk/dp/B07S1XM3L7,"adidas Women's Retrorun Shoes Running, Core Bl...",Lindsay,Perfect right outta the box,True to size If between Id probably go with yo...,5.0,True,Reviewed in Canada on 20 October 2021,One person found this helpful,b64632c5-6f24-51eb-9275-6614fed29f1a,24/12/2021 02:26:25
17,https://www.amazon.co.uk/dp/B0125TMZGK,"Aravon Women's Betty-AR Oxfords, Stone, 5.5 UK",Amazon Customer,Comfortable and attractive,I have hard to fit feet and often a wide fitti...,5.0,True,Reviewed in Canada on 8 October 2018,2 people found this helpful,bce0114a-c0fe-5472-bbb8-377cb21dc853,24/12/2021 02:26:25
21,https://www.amazon.co.uk/dp/B0125TMZGK,"Aravon Women's Betty-AR Oxfords, Stone, 5.5 UK",Burger Lover - Dalton,Great quality and comfort shoes,Great quality and comfort shoesI was so thrill...,5.0,True,Reviewed in the United States on 27 August 2020,2 people found this helpful,f1bafbd5-ff6a-52c7-8929-68bf9da9eda2,24/12/2021 02:26:25
22,https://www.amazon.co.uk/dp/B07S1XM3L7,"adidas Women's Retrorun Shoes Running, Core Bl...",B. Slate,NO SUPPORT! NOT FOR RUNNING!,I would NOT recommend these for running They h...,1.0,True,Reviewed in the United States on 12 July 2020,19 people found this helpful,1bd3f6f9-6e70-50a8-a913-6c9af4f8c7c7,24/12/2021 02:26:25


In [None]:
# Tokenization
def tokenize(text):
    """
    Tokenizes a text with spaCy's "en_core_web_sm" pipeline.

    Parameters:
        text: The string to tokenize.

    Returns:
        A list with the text of every token, in document order.
    """
    # Loading a spaCy model is expensive and this function runs once per row, so the
    # pipeline is loaded on first use and then kept on the function object.
    nlp = getattr(tokenize, "_nlp", None)
    if nlp is None:
        nlp = tokenize._nlp = spacy.load("en_core_web_sm") # Create NLP Pipeline

    doc = nlp(text) # Process the text

    return [token.text for token in doc]

def tokenize_words(dataframe, text_col):
    """
    Adds a "tokenized_words" column holding the tokens of each row's text.

    tokenize() is run on every value of the text column. The result is stored on the
    DataFrame that was passed in, and that same DataFrame is returned.
    """
    token_lists = dataframe[text_col].apply(tokenize) # One token list per row
    dataframe["tokenized_words"] = token_lists

    return dataframe

# Add a "tokenized_words" column built from the review text.
df = tokenize_words(df, "review_text")

df.head()  # Preview the new column

In [None]:
# Stop Word Removal
def stop_word_filter(tokens):
    """
    Removes stop words from a list of tokens.

    Parameters:
        tokens: Iterable of token strings.

    Returns:
        A list of the tokens whose lowercase form is not in the stop word set of
        spaCy's "en_core_web_sm" pipeline, in their original order.
    """
    # The stop word set is the same for every call, so the pipeline is loaded once and
    # only the set is kept; previously the whole model was reloaded for each row.
    stop_words = getattr(stop_word_filter, "_stop_words", None)
    if stop_words is None:
        nlp = spacy.load("en_core_web_sm") # Create NLP Pipeline
        stop_words = stop_word_filter._stop_words = nlp.Defaults.stop_words

    return [token for token in tokens if token.lower() not in stop_words] # Remove stop words

def remove_stop_words(dataframe, token_col):
    """
    Filters the stop words out of every token list in the token column.

    The column is overwritten on the DataFrame that was passed in with the output of
    stop_word_filter(), and that same DataFrame is returned.
    """
    filtered = dataframe[token_col].apply(stop_word_filter) # Filtered token list per row
    dataframe[token_col] = filtered

    return dataframe

# Replace each token list with a version that has the stop words removed.
df = remove_stop_words(df, "tokenized_words")

df.head()  # Preview the filtered tokens

In [None]:
# Lemmatization
def lemmatize(tokens):
    """
    Lemmatizes a list of tokens with spaCy's "en_core_web_sm" pipeline.

    The tokens are joined with single spaces, the resulting text is processed again,
    and the lemma of every token spaCy finds in that text is collected. Because the
    text is re-tokenized, the output may not line up one-to-one with the input tokens.

    Parameters:
        tokens: Iterable of token strings.

    Returns:
        A list of lemma strings.
    """
    # Loading a spaCy model is expensive and this function runs once per row, so the
    # pipeline is loaded on first use and then kept on the function object.
    nlp = getattr(lemmatize, "_nlp", None)
    if nlp is None:
        nlp = lemmatize._nlp = spacy.load("en_core_web_sm") # Create NLP Pipeline

    text = " ".join(tokens) # Create a text version of the tokens
    doc = nlp(text) # Process the text

    return [token.lemma_ for token in doc]

def lemmatize_tokens(dataframe, token_col):
    """
    Replaces every token list in the token column with its lemmatized form.

    lemmatize() is applied row by row; the column is overwritten on the DataFrame
    that was passed in, and that same DataFrame is returned.
    """
    lemma_lists = dataframe[token_col].apply(lemmatize) # Lemmatized tokens per row
    dataframe[token_col] = lemma_lists

    return dataframe

# Replace each token list with its lemmatized form.
df = lemmatize_tokens(df, "tokenized_words")

df.head()  # Preview the lemmatized tokens

In [None]:
# Vectorization
def vectorize(tokens):
    """
    Turns a list of tokens into a single document vector.

    The tokens are joined with single spaces and processed with spaCy's
    "en_core_web_lg" pipeline; the vector of the resulting document is used.

    Parameters:
        tokens: Iterable of token strings.

    Returns:
        A one-element list whose only item is the document's vector (doc.vector).
    """
    # Loading a spaCy model is expensive and this function runs once per row, so the
    # pipeline is loaded on first use and then kept on the function object.
    nlp = getattr(vectorize, "_nlp", None)
    if nlp is None:
        nlp = vectorize._nlp = spacy.load("en_core_web_lg") # Create NLP Pipeline

    text = " ".join(tokens) # Create a text version of the tokens
    doc = nlp(text) # Process the text

    return [doc.vector] # Document vector wrapped in a list
    
def vectorize_tokens(dataframe, token_col):
    """
    Takes tokens and provides a new column containing their vectors.

    vectorize() is applied to every value of the token column and the results are
    stored in a "Vectors" column on the DataFrame that was passed in.

    Parameters:
        dataframe: DataFrame holding the token column.
        token_col: Name of the column with the token lists.

    Returns:
        The same DataFrame, now carrying the "Vectors" column.
    """
    # No pipeline is loaded here: vectorize() is the only place that needs one, and the
    # extra spacy.load() call this function used to make was never used.
    dataframe["Vectors"] = dataframe[token_col].apply(vectorize) # Vectorize tokens and add to Vectors column

    return dataframe

# Add a "Vectors" column computed from the processed tokens.
df = vectorize_tokens(df, "tokenized_words")

df.head()  # Preview the new column

In [None]:
def get_sentiment(review):
    """
    Maps a numeric review rating to a binary sentiment label.

    Returns 1 (positive) when the rating is 2.5 or higher, otherwise 0 (negative).
    """
    return 1 if review >= 2.5 else 0

def classify_sentiment(dataframe, review_col):
    """
    Labels every row with a sentiment derived from its review rating.

    get_sentiment() is applied to each value of the rating column and the labels are
    stored in a "sentiment" column on the DataFrame that was passed in.
    Returns that same DataFrame.
    """
    labels = dataframe[review_col].apply(get_sentiment) # One label per row
    dataframe["sentiment"] = labels

    return dataframe

# Derive the binary "sentiment" label from the star rating.
df = classify_sentiment(df, "review_rating")

df.head()  # Preview the new column

In [None]:
def get_model_data(dataframe, cols):
    """
    Narrows a DataFrame down to the columns needed for modelling.

    Returns the result of selecting ``cols`` from the DataFrame; the DataFrame that
    was passed in keeps all of its columns.
    """
    return dataframe[cols]

# "Vectors" is the feature column and "sentiment" is the label column.
cols = ["Vectors", "sentiment"]  # Define X and y features to isolate
model_df = get_model_data(df, cols)

model_df.head()  # Preview the modelling frame

## Sentiment Analysis

In [None]:
def split_data(X, y):
    """
    Splits features and labels into a 75% training part and a 25% test part.

    The split is seeded with random_state=42 so it is repeatable.
    Returns X_train, X_test, y_train, y_test in that order.
    """
    parts = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=42)
    features_train, features_test, labels_train, labels_test = parts

    return features_train, features_test, labels_train, labels_test

# Define X and y: X is the "Vectors" column, y is the binary "sentiment" label
X = model_df["Vectors"]
y = model_df["sentiment"]

X_train, X_test, y_train, y_test = split_data(X, y)  # Split data into train (75%) and test (25%)

In [None]:
def _stack_feature_vectors(samples):
    # Turns a collection of per-sample vectors into one 2-D (n_samples, n_features) float array.
    if getattr(samples, "ndim", 1) == 2:
        return np.asarray(samples, dtype=float) # Already a matrix (2-D array or DataFrame)

    # Each sample may be a vector or a one-element list wrapping a vector; flatten it to one row.
    return np.vstack([np.asarray(sample, dtype=float).reshape(-1) for sample in samples])

def naive_bayes_model(X_train, X_test, y_train):
    """
    Fits a Multinomial Naive Bayes classifier and predicts labels for the test data.

    Parameters:
        X_train: Training features. Either a 2-D array/DataFrame, or a sequence
            (e.g. a pandas Series) with one vector per sample; vectors wrapped in a
            one-element list are accepted as well.
        X_test: Test features in the same layout as X_train.
        y_train: Training labels.

    Returns:
        The labels predicted by the model for X_test.
    """
    # The scaler and the model need a 2-D numeric matrix, while the feature column holds
    # one object per row, so the per-sample vectors are stacked first.
    X_train = _stack_feature_vectors(X_train)
    X_test = _stack_feature_vectors(X_test)

    scaler = MinMaxScaler() # Initialize scaler
    # Normalize the train and test data; the scaler is fitted on the training data only.
    # MultinomialNB expects non-negative features, which the scaling provides for the training set.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = MultinomialNB() # Initialize model
    model.fit(X_train, y_train) # Fit train data into model

    y_pred = model.predict(X_test) # Classify text

    return y_pred

# Predict sentiment with Naive Bayes model (y_test is not needed for prediction)
nb_y_pred = naive_bayes_model(X_train, X_test, y_train)

In [None]:
def lstm_model(X_train, X_test, y_train, y_test):
    """
    Creates an LSTM model and returns the predicted y values.

    Every sample is treated as a sequence of feature vectors. Sequences are padded
    (or truncated) at the end to 20 steps so that they are of equal length, then a
    single LSTM layer with dropout and a sigmoid output is trained for 2 epochs.

    Parameters:
        X_train: Iterable of training samples, each a sequence of equally sized vectors.
        X_test: Iterable of test samples in the same layout.
        y_train: Binary training labels.
        y_test: Binary test labels; only used as validation data while fitting.

    Returns:
        A list of ints (0 or 1), one per test sample, obtained by thresholding the
        predicted probability at 0.5.
    """
    # Convert each sample to a float array so every sequence has a uniform numeric layout
    train_sequences = [np.asarray(sequence, dtype="float32") for sequence in X_train]
    test_sequences = [np.asarray(sequence, dtype="float32") for sequence in X_test]
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    max_sequence_length = 20 # Define max length for padding
    # Pad the sequences of train and test data. dtype must be given explicitly:
    # pad_sequences defaults to int32, which would truncate the float vectors.
    X_train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, dtype="float32", padding="post", truncating="post", value=0.0)
    X_test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, dtype="float32", padding="post", truncating="post", value=0.0)
    print(f"X_train_padded shape: {X_train_padded.shape}") # Shape only; the full array is too large to print

    model = Sequential() # Initialize model
    # Add LSTM layer; input is (time steps, features per step)
    model.add(LSTM(128, input_shape=(X_train_padded.shape[1], X_train_padded.shape[2]), return_sequences=False))

    model.add(Dropout(0.5)) # Add dropout layer to prevent overfitting
    model.add(Dense(1, activation="sigmoid")) # Output layer

    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy']) # Compile model
    # NOTE(review): the test split doubles as validation data here, so the reported
    # validation metrics are not independent of the final evaluation.
    model.fit(X_train_padded, y_train, epochs=2, batch_size=64, validation_data=(X_test_padded, y_test)) # Fit the model

    y_pred_prob = model.predict(X_test_padded) # Predict probability of positive classification
    # Flatten the (n, 1) probabilities and threshold them in one step; converting
    # one-element arrays with int() one by one is deprecated in recent NumPy versions.
    y_pred = (np.asarray(y_pred_prob).reshape(-1) >= 0.5).astype(int).tolist()

    return y_pred

lstm_y_pred = lstm_model(X_train, X_test, y_train, y_test)  # Get LSTM Model predictions (y_test is passed as validation data)

## Analyse Results

## Aspect Based Sentiment Analysis


In [None]:
nlp = spacy.load("en_core_web_lg") # Load NLP Pipeline once at module level; absa_model reads this global

def absa_model(text):
    """
    Takes in text and determines the text's aspects and the opinions associated with the aspects.

    Aspects are the noun chunks of the processed text.
    Opinions are the tokens tagged as adjectives or adverbs.
    An opinion's sentiment comes from a small word list (matched case-insensitively);
    words not on the list fall back to TextBlob polarity, where a polarity of 0 or
    more counts as "positive" and anything below 0 as "negative".

    NOTE(review): aspects and opinions are paired purely by position (first noun chunk
    with first adjective/adverb, and so on) and the result stops at the shorter of the
    two lists, so a pair does not necessarily belong together grammatically.

    Parameters:
        text: The review text to analyse.

    Returns:
        A list of (aspect, opinion, sentiment) tuples. The aspect is the noun chunk
        object produced by the pipeline, the opinion is the token text and the
        sentiment is "positive" or "negative".
    """
    # Define positive and negative words (all lowercase)
    positive_words = {"amazing", "good", "great", "excellent", "fantastic", "positive", "happy"}
    negative_words = {"poor", "bad", "terrible", "horrible", "negative", "unhappy", "disappointing"}

    doc = nlp(text) # Process the text

    # Get aspects by taking the noun chunks
    aspects = list(doc.noun_chunks)

    # Generate opinions by getting adjectives or adverbs
    opinions = [token.text for token in doc if token.pos_ in ("ADJ", "ADV")]

    # Get Sentiment of each opinion
    sentiments = []
    for opinion in opinions:
        word = opinion.lower() # The word lists are lowercase, so "Bad" must match "bad"
        if word in positive_words:
            sentiment = "positive"
        elif word in negative_words:
            sentiment = "negative"
        else: # Use textblob for polarity
            sentiment = "positive" if TextBlob(opinion).sentiment.polarity >= 0 else "negative"

        sentiments.append(sentiment) # Add sentiment to the list

    return list(zip(aspects, opinions, sentiments))