In [8]:
!pip install emoji

Defaulting to user installation because normal site-packages is not writeable


In [9]:
import numpy as np
import pandas as pd
import pickle
import re
import string
import emoji

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

In [10]:
# Load the ratings dataset from the Excel workbook
data_path = './assets/data_dummy.xlsx'
data = pd.read_excel(io=data_path)

# Preview a reproducible random sample of 20 rows
display(data.sample(random_state=5, n=20))

Unnamed: 0,user_id,laundry_id,rating,sentiment_score
37,20,1006,1,1.170091
42,7,1005,5,1.445501
74,20,1022,1,4.091755
17,24,1020,3,0.494271
23,10,1005,4,3.824812
10,10,1020,1,0.178051
90,19,1017,3,0.378174
45,5,1003,1,3.186602
57,15,1016,5,3.832362
6,14,1024,5,4.376029


In [11]:
# Build the user x laundry rating matrix
def createPivot(data, fillVal = None):
  """Pivot long-format ratings into a user_id x laundry_id matrix.

  Args:
    data: DataFrame with 'user_id', 'laundry_id' and 'rating' columns.
    fillVal: value written into empty cells. When None (the default) the
      empty cells are left as NaN. Note that 0 is a valid fill value.

  Returns:
    DataFrame indexed by user_id with one column per laundry_id. Repeated
    (user_id, laundry_id) pairs are combined by pivot_table's default
    aggregation (mean).
  """
  ratings_matrix = data.pivot_table(values='rating', index='user_id', columns='laundry_id')
  if fillVal is None:
    return ratings_matrix
  return ratings_matrix.fillna(fillVal)

In [12]:
# Hold-out configuration for the train/test split
random_state = 10
test_size = 0.30

# Split the ratings, then keep only test rows whose user also appears in train
train, test = train_test_split(data, test_size=test_size, random_state=random_state)
test = test[test['user_id'].isin(train['user_id'])]

# User x laundry rating matrices; pairs without a rating become 0
train_pivot = createPivot(train, 0)
test_pivot = createPivot(test, 0)

# Train mask: 0 where the user gave a rating >= 1, 1 everywhere else
# (including pairs with no rating at all)
train_binary = createPivot(
    train.assign(rating=np.where(train['rating'] >= 1, 0, 1)),
    1,
)

# Test mask: 1 where the user gave a rating >= 1, 0 everywhere else
test_binary = createPivot(
    test.assign(rating=np.where(test['rating'] >= 1, 1, 0)),
    0,
)

In [13]:
# Per-user mean rating, ignoring the laundries a user has not rated
train_ratings = createPivot(train)
mean = np.nanmean(train_ratings, axis=1)

# Center every user's ratings on that user's mean; unrated pairs become 0
train_subtracted_by_mean = train_ratings.sub(mean, axis=0).fillna(0)

# Cosine similarity between the centered user vectors (NaN similarities -> 0)
correlation = 1 - pairwise_distances(train_subtracted_by_mean, metric='cosine')
correlation[np.isnan(correlation)] = 0

# Label rows (index named 'user_id') and columns with the user ids
user_ids = train_subtracted_by_mean.index
correlation_df = pd.DataFrame(correlation, index=user_ids, columns=user_ids.tolist())

correlation_df.shape, train_pivot.shape

((25, 25), (25, 26))

In [14]:
# Similarity-weighted sum of the train ratings for every (user, laundry) pair
predicted_rating = np.dot(correlation, train_pivot.to_numpy())

# Apply the train mask so pairs flagged 0 in train_binary score 0
final_rating = train_binary * predicted_rating

final_rating

laundry_id,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,...,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.154701,0.138281,1.454627,0.0,1.095445,0.0,0.0,0.0,0.0,0.981467,...,2.82218,0.0,0.831216,0.0,0.589256,1.643168,0.0,-0.519701,1.443376,0.0
3,0.0,0.0,-0.20421,-0.460044,0.243018,-2.027532,0.0,0.0,-0.161734,-0.0,...,-1.1004,-1.099552,0.270435,0.0,0.0,0.0,-0.956107,0.0,-1.500248,0.0
4,0.0,-0.534749,-1.642917,-1.06066,-0.406333,-0.534749,-0.353553,0.0,0.039528,0.158114,...,-2.199889,-1.069497,0.0,0.0,0.0,0.0,0.0,0.039528,0.0,0.0
5,0.0,-0.466767,-1.434055,-1.270853,0.0,-0.466767,-0.308607,0.0,0.034503,0.138013,...,-1.747704,-0.243468,0.0,0.872872,0.0,0.517549,-0.690066,0.034503,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,-1.527525,0.0,0.0,0.0,1.5,0.0,0.0,0.592927,0.0,...,2.964635,0.0,1.778781,0.0,0.0,0.0,0.0,0.0,0.0,-1.909407
8,2.0,-1.320104,0.93581,0.0,0.0,0.0,0.0,0.0,0.237171,0.0,...,2.862932,2.512873,1.246261,0.0,1.020621,0.0,0.0,0.846032,0.0,-1.45254
9,0.0,0.0,0.0,0.0,-0.316228,0.204124,0.0,0.57735,0.0,0.0,...,0.04601,-0.632456,0.0,0.0,0.0,0.0,1.154701,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,1.505132,1.050813,0.206461,0.0,0.342026,0.226134,0.0,-0.025282,0.189646,...,1.373344,0.549213,0.0,-0.639602,0.0,-0.10113,-1.497792,0.847046,0.0,1.453882


In [15]:
# Restrict the user-user similarity matrix to the test users.
# Rows AND columns are selected in test_pivot's index order so that the
# positional matrix product below pairs each similarity column with the
# matching user's row of ratings. (Ordering the columns by iterating a set
# gives no such guarantee.)
test_user_ids = test_pivot.index
correlation_test_df = correlation_df.loc[test_user_ids, test_user_ids]

# Similarity-weighted test ratings, kept only where test_binary flags a rating
test_user_predicted_ratings = np.dot(correlation_test_df, test_pivot)
test_user_predicted_ratings = test_user_predicted_ratings * test_binary.to_numpy()
# Non-positive scores are treated as "no prediction"
test_user_predicted_ratings[test_user_predicted_ratings <= 0] = np.nan

# Scale the predicted ratings between 1 and 5
# (MinMaxScaler fits each column, i.e. each laundry, independently)
scaler = MinMaxScaler(feature_range=(1, 5))
test_user_predicted_ratings = scaler.fit_transform(test_user_predicted_ratings)

# Metric evaluation: compare against the actual test ratings; NaN cells
# (no actual rating or no prediction) are skipped by nanmean
actual_ratings = createPivot(test)
errors = actual_ratings - test_user_predicted_ratings

mse = np.nanmean(errors ** 2)
rmse = np.sqrt(mse)
mae = np.nanmean(np.abs(errors))

print("[Metric Evaluation]")
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)

[Metric Evaluation]
Mean Squared Error (MSE): 2.703201277587481
Root Mean Squared Error (RMSE): 1.6441415016924428
Mean Absolute Error (MAE): 1.0779109356454815


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [16]:
def getReviewData(x):
  """Return the mean sentiment_score for laundry id `x`.

  Reads the notebook-level `data` DataFrame. The result is NaN when no row
  has laundry_id equal to `x`.
  """
  matches_laundry = data['laundry_id'] == x
  return data.loc[matches_laundry, 'sentiment_score'].mean()

def rankRecommendation(final_rating, user_id, data, no_rec):
  """Rank a user's top predicted laundries by predicted rating plus sentiment.

  Args:
    final_rating: DataFrame of predicted ratings, indexed by user id with one
      column per laundry id (columns axis named 'laundry_id').
    user_id: index label of the user to recommend for.
    data: DataFrame with 'laundry_id' and 'sentiment_score' columns; the mean
      sentiment per laundry is computed from this argument.
    no_rec: number of top predicted laundries to keep.

  Returns:
    DataFrame with columns user_id, laundry_id, predicted_ratings,
    sentiment_score and product_ranking_score, sorted by
    product_ranking_score in descending order. The same table is displayed.

  Raises:
    KeyError: if `user_id` is not in `final_rating`'s index.
  """
  # Keep the no_rec laundries with the highest predicted rating for this user
  top_predictions = final_rating.loc[user_id].sort_values(ascending=False).iloc[:no_rec]

  recommendation_table = top_predictions.rename('predicted_ratings').reset_index()
  recommendation_table.insert(0, 'user_id', user_id)

  # Mean sentiment per laundry; laundries absent from `data` map to NaN
  mean_sentiment = data.groupby('laundry_id')['sentiment_score'].mean()
  recommendation_table['sentiment_score'] = recommendation_table['laundry_id'].map(mean_sentiment)
  recommendation_table['product_ranking_score'] = (
      recommendation_table['predicted_ratings'] + recommendation_table['sentiment_score']
  )

  ranked = recommendation_table.sort_values(by='product_ranking_score', ascending=False).head(no_rec)
  display(ranked)
  # Return the table so callers that assign the result get the ranking, not None
  return ranked

In [17]:
# User to recommend for, and how many top-predicted laundries to consider
user_id = 25
no_rec = 15

# Display the ranked recommendations for that user
recommendation_table = rankRecommendation(
    final_rating=final_rating,
    user_id=user_id,
    data=data,
    no_rec=no_rec,
)

Unnamed: 0,user_id,laundry_id,predicted_ratings,sentiment_score,product_ranking_score
1,25,1014,2.3144,3.043455,5.357855
0,25,1013,2.570524,2.28727,4.857793
2,25,1011,0.57735,4.194748,4.772099
12,25,1008,0.0,4.504907,4.504907
5,25,1012,0.436436,3.793639,4.230075
7,25,1021,0.288675,3.820625,4.1093
10,25,1019,0.0,3.898578,3.898578
11,25,1024,0.0,3.563177,3.563177
4,25,1025,0.545545,2.479635,3.025179
9,25,1000,0.0,2.965344,2.965344
