In [2]:
'''Retrieve all inputs executed in the current kernel section'''
%history

'''Retrieve all inputs executed in the current kernel section'''
%history
'''Retrieve all inputs executed in the current kernel section'''
%history


In [3]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import warnings

In [4]:
print('Pandas version -> %s' % pd.__version__)
print('Numpy version -> %s' % np.__version__)
print('Seaborn version -> %s' % sns.__version__)

Pandas version -> 2.0.3
Numpy version -> 1.25.2
Seaborn version -> 0.13.1


In [5]:
df = pd.read_json('amazon-music-reviews.json', lines=True)

In [6]:
# Drop unnecessary columns from the DataFrame
df_reviews = df.drop(['reviewerName', 'helpful', 'unixReviewTime', 'reviewTime'],axis=1)

# Rename the remaining columns for better readability and consistency
df_reviews = df_reviews.rename(columns={'reviewerID': 'UserID',
                                        'asin': 'ProductID',
                                        'reviewText': 'Review',
                                        'summary': 'Summary',
                                        'overall': 'Rating'})

df_reviews.head(3)

Unnamed: 0,UserID,ProductID,Review,Rating,Summary
0,A2IBPI20UZIR0U,1384719342,"Not much to write about here, but it does exac...",5,good
1,A14VAT5EAX3D9S,1384719342,The product does exactly as it should and is q...,5,Jake
2,A195EZSQDW3E21,1384719342,The primary job of this device is to block the...,5,It Does The Job Well


In [7]:
# Display information about the DataFrame
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   UserID     10261 non-null  object
 1   ProductID  10261 non-null  object
 2   Review     10261 non-null  object
 3   Rating     10261 non-null  int64 
 4   Summary    10261 non-null  object
dtypes: int64(1), object(4)
memory usage: 400.9+ KB


In [8]:
# Display descriptive statistics of the DataFrame
df_reviews.describe()

Unnamed: 0,Rating
count,10261.0
mean,4.488744
std,0.894642
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


Singular Value Decomposition

In [9]:
# Drop the 'Review' and 'Summary' columns from the DataFrame
df_reviews_svd = df_reviews.drop(['Review', 'Summary'],axis=1)

df_reviews_svd.head(3)

Unnamed: 0,UserID,ProductID,Rating
0,A2IBPI20UZIR0U,1384719342,5
1,A14VAT5EAX3D9S,1384719342,5
2,A195EZSQDW3E21,1384719342,5


In [12]:
# Import necessary classes from the Surprise library
#!pip install surprise
from surprise import Dataset,Reader, SVD, accuracy
from surprise.model_selection import train_test_split

# Initialize a Reader object
reader = Reader()

# Load the dataset from the DataFrame for use with Surprise
data = Dataset.load_from_df(df_reviews_svd[['UserID','ProductID','Rating']],reader)

# Initialize the SVD (Singular Value Decomposition) algorithm
svd = SVD()

In [13]:
# Split the data into a training and a test set
train_set, test_set = train_test_split(data, test_size=0.25)

# Train the SVD algorithm on the training set and make predictions
svd.fit(train_set)
predictions = svd.test(test_set)

# Calculate and print the RMSE and the MAE
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.8712
MAE:  0.6397


In [14]:
# Create a DataFrame with unique ProductIDs from the df_reviews DataFrame
products = pd.DataFrame({'ProductID': np.unique(df_reviews['ProductID'])}).reset_index().drop('index',axis=1)

In [15]:
# Select the UserID of the first customer in the df_reviews DataFrame
customer = df_reviews.UserID[0]

In [16]:
# Predict ratings for all products for a specific customer
products['Prediction'] = df_reviews['ProductID'].apply(lambda x: svd.predict(customer,x).est)

In [17]:
# Sort the products DataFrame by the 'Prediction' column in descending order
products.sort_values(by='Prediction',ascending=False)

Unnamed: 0,ProductID,Prediction
134,B0002E3FCO,4.980265
133,B0002E3DNK,4.980265
136,B0002E4Z8M,4.980265
137,B0002E51ZS,4.980265
132,B0002E3DLM,4.980265
...,...,...
884,B00C5B20QE,4.298063
882,B00BTGMI5O,4.298063
881,B00BLQTZDA,4.298063
880,B00BLQ7M4E,4.298063


In [18]:
products.describe()

Unnamed: 0,Prediction
count,900.0
mean,4.703365
std,0.187578
min,4.298063
25%,4.559466
50%,4.73949
75%,4.851198
max,4.980265


In [19]:
# Extract user factors (latent features) from the trained SVD model
user_factors = svd.pu

# Extract item factors (latent features) from the trained SVD model
item_factors = svd.qi

# Convert the factors numpy array to a pandas DataFrame
user_factors_df = pd.DataFrame(user_factors)
item_factors_df = pd.DataFrame(item_factors)

'''
User factors and item factors are essential components of the SVD model,
they enable the system to capture and represent user preferences and item characteristics
they represent a virtual relationship, not necessarily a physical one.
'''

'\nUser factors and item factors are essential components of the SVD model,\nthey enable the system to capture and represent user preferences and item characteristics\nthey represent a virtual relationship, not necessarily a physical one.\n'

In [20]:
user_factors_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.097108,0.066014,0.082023,0.074626,-0.051028,-0.096115,0.032278,-0.008107,-0.171527,-0.124719,...,0.022924,-0.076619,0.040123,0.017372,-0.142249,-0.088705,0.015926,0.040342,0.013752,0.064677
1,-0.06472,-0.050788,-0.097921,0.164877,0.13198,-0.070397,-0.028543,0.144525,0.109144,-0.213958,...,-0.10468,-0.101114,-0.109569,-0.04217,-0.121906,-0.084957,-0.193006,-0.075816,0.06716,-0.066231
2,-0.060605,0.12449,-0.061078,0.084183,-0.0187,-0.037629,0.020013,0.100119,0.15647,-0.183444,...,-0.046725,-0.045246,0.010831,0.242461,0.005267,0.090424,-0.06002,-0.180174,0.058551,0.091182


In [21]:
item_factors_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.002012,-0.08274,-0.066137,0.066244,0.077269,-0.110783,0.087076,0.202023,0.027708,-0.011298,...,-0.151379,0.028649,-0.091236,-0.022114,0.05604,-0.109629,-0.075814,0.022366,-0.028727,-0.058479
1,0.04553,0.03626,0.108041,-0.040843,0.134099,-0.095341,-0.200001,-0.001699,0.022064,-0.006087,...,-0.078932,-0.05574,0.290062,0.039305,-0.078385,0.005072,-0.088695,-0.008944,0.191042,-0.001464
2,-0.051784,-0.080196,-0.115984,0.128532,-0.071679,-0.084318,-0.085973,-0.065952,-0.130502,-0.007982,...,-0.038668,-0.141713,0.004525,-0.082436,-0.143931,0.103152,0.089285,-0.074326,-0.034244,-0.095116


Sentiment Analysis

In [22]:
# Drop unnecessary columns ('Review', 'Summary', 'UserID', 'ProductID') from the df_reviews DataFrame
df_reviews_sentiment = df_reviews.drop(['Review', 'Summary', 'UserID', 'ProductID'],axis=1)

# Combine the 'Summary' and 'Review' columns into a single 'Reviews' column
df_reviews_sentiment['Reviews'] = df_reviews['Summary'] + ' ' +  df_reviews['Review']

df_reviews_sentiment.head(3)

Unnamed: 0,Rating,Reviews
0,5,"good Not much to write about here, but it does..."
1,5,Jake The product does exactly as it should and...
2,5,It Does The Job Well The primary job of this d...


In [23]:
# Define thresholds for positive, negative, and neutral ratings
positive_threshold = 4
negative_threshold = 3

# Split the "overall" column into positive, negative, and neutral categories
df_reviews_sentiment['Sentiment'] = df_reviews_sentiment['Rating'].apply(lambda rating: '1'
                                    if rating > positive_threshold
                                    else '-1' if rating < negative_threshold
                                    else '0')

df_reviews_sentiment['Sentiment'].value_counts()

Sentiment
1     6938
0     2856
-1     467
Name: count, dtype: int64

In [24]:
# Drop the 'Rating' column from the df_reviews_sentiment DataFrame
df_reviews_sentiment.drop('Rating', axis=1, inplace=True)
df_reviews_sentiment.head(3)

Unnamed: 0,Reviews,Sentiment
0,"good Not much to write about here, but it does...",1
1,Jake The product does exactly as it should and...,1
2,It Does The Job Well The primary job of this d...,1


In [25]:
# Import the spacy library
import spacy
import string

# Load the English language model
en_model = spacy.load("en_core_web_sm")

# Define a function for text preprocessing using spacy
def text_preprocessing(text):
    # Parse the text using spaCy
    parsed_text = en_model(text)

    # Initialize an empty list to store cleaned tokens
    cleaned_tokens = []

    # Iterate over each token in the parsed document
    for token in parsed_text:
        # Check if the token is a word (alphabetical), not a punctuation or a stop word
        if token.is_alpha and not token.is_punct and not token.is_stop:
            # Lemmatize the token and convert it to lowercase
            cleaned_token = token.lemma_.lower()
            # Append the cleaned token to the list
            cleaned_tokens.append(cleaned_token)

    # Join tokens back into a string
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

In [26]:
# Apply the preprocessing function to the 'review' column
df_reviews_sentiment['CleanedReviews'] = df_reviews_sentiment['Reviews'].apply(text_preprocessing)

df_reviews_sentiment.head()

Unnamed: 0,Reviews,Sentiment,CleanedReviews
0,"good Not much to write about here, but it does...",1,good write exactly suppose filter pop sound re...
1,Jake The product does exactly as it should and...,1,jake product exactly affordable realize double...
2,It Does The Job Well The primary job of this d...,1,job primary job device block breath produce po...
3,GOOD WINDSCREEN FOR THE MONEY Nice windscreen ...,1,good windscreen money nice windscreen protect ...
4,No more pops when I record my vocals. This pop...,1,pop record vocal pop filter great look perform...


In [27]:
# Drop the column named 'Reviews' from the DataFrame 'df_reviews_sentiment'
df_reviews_sentiment.drop('Reviews', axis = 1, inplace = True)

In [28]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Assign predictor (independent) and target (dependent) variables.
predictors = df_reviews_sentiment['CleanedReviews']
targets = df_reviews_sentiment['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(predictors, targets, test_size=0.2, random_state=42)

In [29]:
# Vectorize the text data using TF-IDF (Term Frequency-Inverse Document Frequency) representation
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [30]:
# Create a Random Forest classifier model with 100 decision trees.
rf_model = RandomForestClassifier(n_estimators=100)

# Train the Random Forest classifier model using the TF-IDF transformed training data.
rf_model.fit(X_train_tfidf, y_train)

# Make predictions and report results
rf_pred = rf_model.predict(X_test_tfidf)
rf_report = classification_report(y_test, rf_pred)

# Print the classification report
print('Classification Report:')
print(rf_report)

Classification Report:
              precision    recall  f1-score   support

          -1       0.67      0.02      0.04       104
           0       0.67      0.16      0.26       551
           1       0.71      0.98      0.83      1398

    accuracy                           0.71      2053
   macro avg       0.68      0.39      0.38      2053
weighted avg       0.70      0.71      0.63      2053



In [31]:
# Import necessary metrics from sklearn.metrics module for evaluating classifier performance.
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred, average='weighted')
rf_recall = recall_score(y_test, rf_pred, average='weighted')
rf_f1 = f1_score(y_test, rf_pred, average='weighted')

print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)

Accuracy: 0.7111544081831466
Precision: 0.7002252550982709
Recall: 0.7111544081831466
F1-score: 0.6346072930937523
