#  Problem Statement: Recommendation system for products on an e-commerce website like Amazon.com. 

# Primary Dataset columns​ -
    - first column - userId
    - second column - productId
    - third column - ratings
    - fourth column - timestamp

# Steps - 
**1. Read and explore the given dataset.  ( Rename column/add headers, plot histograms, find data characteristics)** 

**2. Take a subset of the dataset to make it less sparse/ denser. ( For example, keep the users only who has given 50 or more number of ratings )**

**3. Split the data randomly into train and test dataset. ( For example, split it in 70/30 ratio)**

**4. Build Popularity Recommender model.**

**5. Build Collaborative Filtering model.**

**6. Evaluate both the models. ( Once the model is trained on the training data, it can be used to compute the error (like RMSE) on predictions made on the test data.) You can also use a different method to evaluate the models.**

**7. Get top - K ( K = 5) recommendations. Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.**

**8. Summarise your insights.** 
 

# Load necessary libraries

In [4]:
import pandas as pd # data manipulation and analysis
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # data visualisation library
from pandas.io.json import json_normalize

import warnings
import json

warnings.filterwarnings('ignore')
%matplotlib inline

In [13]:
from surprise import KNNBasic, SVD, NormalPredictor, KNNBaseline,KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering, Reader, dataset, accuracy

# Read and explore the dataset  ( Rename column/add headers, plot histograms, find data characteristics) 

In [4]:
columns = ['userID', 'productID', 'ratings','timestamp']

recomm_df = pd.read_csv(r'C:\Users\navee\Downloads\ratings_Electronics.csv',names=columns)

In [7]:
recomm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048574 entries, 0 to 1048573
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userID     1048574 non-null  object
 1   productID  1048574 non-null  object
 2   ratings    1048574 non-null  int64 
 3   timestamp  1048574 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 32.0+ MB


In [8]:
recomm_df.head()

Unnamed: 0,userID,productID,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5,1365811200
1,A2CX7LUOHB2NDG,321732944,5,1341100800
2,A1GI0U4ZRJA8WN,439886341,1,1334707200
3,A1QGNMC6O1VW39,511189877,5,1397433600
4,A3J3BRHTDRFJ2G,511189877,2,1397433600


In [9]:
recomm_df.shape

(1048574, 4)

#  Pre-Processing 0: Dropping the "timestamp" as it is not a needed field

In [5]:
recomm_df = recomm_df.drop('timestamp', axis=1)

# Missing Value

In [61]:
recomm_df.isna().sum()

userID       0
productID    0
ratings      0
dtype: int64

### ----------Add code to handle missing values-----------
df.dropna(axis=0)

In [62]:
recomm_df.shape

(1048574, 3)

# Case 1: Cold Start Problem

In [1]:
popular = recomm_df[['userID','ratings']].groupby('userID').sum().reset_index()
popular_20 = popular.sort_values('ratings', ascending=False).head(n=20)
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
 
objects = (list(popular_20['userID']))
y_pos = np.arange(len(objects))
performance = list(popular_20['ratings'])
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects, rotation='vertical')
plt.ylabel('Total Rating')
plt.title('Most popular')
 
plt.show()

NameError: name 'recomm_df' is not defined

In [65]:
# find unique users
recomm_df.userID.value_counts()

A5JLAU2ARJ0BO     412
A231WM2Z2JL0U3    249
A25HBO5V8S8SEA    164
A6FIAB28IS79      146
AT6CZDCP4TRGA     128
                 ... 
A2Z9BZN45ZIWOZ      1
A30YUVSUTWY1BH      1
A2ELR81SA242O9      1
A3LTZM4BD2W2BS      1
A15GM3MGD056S       1
Name: userID, Length: 786328, dtype: int64

In [66]:
print('Number of unique users', len(recomm_df['userID'].unique()))

Number of unique users 786328


In [67]:
print('Number of unique products', len(recomm_df['productID'].unique()))

Number of unique products 61894


In [68]:
print('Unique Ratings', recomm_df['ratings'].unique())

Unique Ratings [5 1 2 4 3]


In [69]:
min_ratings1 = recomm_df[(recomm_df['ratings'] < 2.0)]

In [70]:
print('Number of unique products rated low',len(min_ratings1['productID'].unique()))

Number of unique products rated low 26111


In [71]:
med_ratings1 = recomm_df[(recomm_df['ratings'] > 2.0) & (recomm_df['ratings'] < 4.0)]

In [72]:
print('Number of unique products rated medium',len(med_ratings1['productID'].unique()))

Number of unique products rated medium 22236


In [73]:
max_ratings1 = recomm_df[recomm_df['ratings'] >= 4.0]

In [74]:
print('Number of unique products rated high',len(max_ratings1['productID'].unique()))

Number of unique products rated high 54985


In [75]:
avg_rating_prod = recomm_df.groupby('productID').sum() / recomm_df.groupby('productID').count()

In [76]:
avg_rating_prod.drop('userID', axis=1,inplace =True)

In [77]:
print ('Top 10 highly rated products \n',avg_rating_prod.nlargest(10,'ratings'))

Top 10 highly rated products 
             ratings
productID          
059400232X      5.0
094339676X      5.0
1039869017      5.0
1182702627      5.0
132793040       5.0
1575839415      5.0
1593152523      5.0
1594243034      5.0
1604550945      5.0
1610130804      5.0


# Case 2: Using Already Available Data for Predictions

# Pre-processing 1: Take a subset of the dataset to make it less sparse/ denser. ( For example, keep the users only who has given 50 or more number of ratings )

In [78]:
userID = recomm_df.groupby('userID').count()

In [80]:
top_user = userID[userID['ratings'] >= 50].index

In [81]:
topuser_ratings_df = recomm_df[recomm_df['userID'].isin(top_user)]

In [364]:
#topuser_ratings_df.drop('productID', axis=1, inplace = True)

In [82]:
topuser_ratings_df.shape

(3456, 3)

In [83]:
topuser_ratings_df.head()

Unnamed: 0,userID,productID,ratings
2160,A5JLAU2ARJ0BO,1400532655,1
3048,ALUNVOQRXOZIA,1400599997,1
4612,A1VQHH85U7PX0,9043413585,3
6070,A250AXLRBVYKB4,998376234X,1
7379,A2AEZQ3DGBBLPR,B000000O48,5


In [84]:
topuser_ratings_df.sort_values(by='ratings', ascending=False).head()

Unnamed: 0,userID,productID,ratings
935141,A680RUE1FDO8B,B000FJ89UO,5
252702,A2XRMQA6PJ5ZJ8,B00008RUXO,5
252615,ABH9D1TEOJ56E,B00008RUEO,5
579412,AGHZXQL9F94T9,B00074Z9OQ,5
252513,A7Y6AVS576M03,B00008RS95,5



# Pre-Processing 2: Keep data only for products that have 50 or more ratings

In [85]:
prodID = recomm_df.groupby('productID').count()

In [86]:
top_prod = prodID[prodID['ratings'] >= 50].index

In [87]:
top_ratings_df = topuser_ratings_df[topuser_ratings_df['productID'].isin(top_prod)]

In [88]:
top_ratings_df.sort_values(by='ratings', ascending=False).head()

Unnamed: 0,userID,productID,ratings
365904,A2XRMQA6PJ5ZJ8,B00015GYU4,5
683921,A5JLAU2ARJ0BO,B0009ST7DY,5
700439,A2AEZQ3DGBBLPR,B000A3WS20,5
700137,AGXV8E7XK9J6H,B000A3WS16,5
693051,A6FIAB28IS79,B0009Y7APU,5


In [89]:
top_ratings_df.shape

(1339, 3)

# Pre-Processing 3 - Split the data randomly into train and test dataset. ( For example, split it in 70/30 ratio)

In [90]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(top_ratings_df, test_size = 0.30, random_state=0)

In [91]:
train_data.head()

Unnamed: 0,userID,productID,ratings
61810,AGHZXQL9F94T9,B00004XRED,5
278440,A15QYHQH80W30L,B00009MDBQ,5
597547,AGHZXQL9F94T9,B0007MVHZS,4
856624,A1RPTVW5VEOSI,B000EIDARS,4
104772,A25HBO5V8S8SEA,B00005MIS8,5


In [2]:
train_data.shape

NameError: name 'train_data' is not defined

In [92]:
test_data.head()

Unnamed: 0,userID,productID,ratings
327024,A2XRMQA6PJ5ZJ8,B0000BVDM7,5
715194,A1F9Z42CFF9IAY,B000AJIF4E,2
535952,AT6CZDCP4TRGA,B00062QY5W,1
56637,A2BGZ52M908MJY,B00004W3YX,4
425258,A680RUE1FDO8B,B00022VBAE,4


In [None]:
test_data.shape

# Processing Begins

# Model 1: Build Popularity Recommender model.

In [93]:
#Building the recommendations based on the average of all user ratings for each product.
train_data_grouped = train_data.groupby('productID').mean().reset_index()

In [94]:
train_data_grouped.head()

Unnamed: 0,productID,ratings
0,1400532655,1.0
1,B000001OMI,5.0
2,B00000DM9W,5.0
3,B00000J05A,3.0
4,B00000J061,5.0


In [95]:
train_data_sort = train_data_grouped.sort_values(['ratings', 'productID'], ascending=False)

In [96]:
train_data_sort.head()

Unnamed: 0,productID,ratings
749,B000ICZZJG,5.0
746,B000I20AJ6,5.0
745,B000I1ZWRC,5.0
743,B000I1X6PM,5.0
741,B000HZD2XK,5.0


In [97]:
train_data.groupby('productID')['ratings'].count().sort_values(ascending=False).head(10) 

productID
B000BKJZ9Q    5
B00007KDVI    4
B0000AKVHF    4
B0009RKL5S    3
B0000659UN    3
B00009R6TA    3
B00004SB92    3
B00004ZCJE    3
B0000645C8    3
B00008VF63    3
Name: ratings, dtype: int64

In [98]:
ratings_mean_count = pd.DataFrame(train_data.groupby('productID')['ratings'].mean()) 

In [99]:
ratings_mean_count['rating_counts'] = pd.DataFrame(train_data.groupby('productID')['ratings'].count())  

In [100]:
ratings_mean_count.head()  

Unnamed: 0_level_0,ratings,rating_counts
productID,Unnamed: 1_level_1,Unnamed: 2_level_1
1400532655,1.0,1
B000001OMI,5.0,1
B00000DM9W,5.0,1
B00000J05A,3.0,1
B00000J061,5.0,1


In [101]:
pred_df = test_data[['userID', 'productID', 'ratings']]

In [102]:
pred_df.rename(columns = {'ratings' : 'true_ratings'}, inplace=True)

In [3]:
pred_df.head(3)

NameError: name 'pred_df' is not defined

In [103]:
pred_df = pred_df.merge(train_data_sort, left_on='productID', right_on = 'productID')

In [104]:
pred_df.head(3)

Unnamed: 0,userID,productID,true_ratings,ratings
0,A1F9Z42CFF9IAY,B000AJIF4E,2,5.0
1,AT6CZDCP4TRGA,B00062QY5W,1,5.0
2,A2BGZ52M908MJY,B00004W3YX,4,5.0


In [105]:
pred_df.rename(columns = {'ratings' : 'predicted_ratings'}, inplace = True)

In [106]:
pred_df.head()

Unnamed: 0,userID,productID,true_ratings,predicted_ratings
0,A1F9Z42CFF9IAY,B000AJIF4E,2,5.0
1,AT6CZDCP4TRGA,B00062QY5W,1,5.0
2,A2BGZ52M908MJY,B00004W3YX,4,5.0
3,A2B7BUH8834Y6M,B000BQ7GW8,5,5.0
4,A231WM2Z2JL0U3,B00007K3A3,4,5.0


In [107]:
import sklearn.metrics as metric
from math import sqrt
MSE = metric.mean_squared_error(pred_df['true_ratings'], pred_df['predicted_ratings'])
print('The RMSE value for Popularity Recommender model is', sqrt(MSE))

The RMSE value for Popularity Recommender model is 1.489930358479308


# Model 2: Build Collaborative Filtering model

In [110]:
conda install -c conda-forge scikit-surprise

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\navee\anaconda3

  added / updated specs:
    - scikit-surprise


The following packages will be downloaded:

Note: you may need to restart the kernel to use updated packages.
    package                    |            build
    ---------------------------|-----------------
    conda-4.10.1               |   py38haa244fe_0         3.1 MB  conda-forge
    python_abi-3.8             |           1_cp38           4 KB  conda-forge
    scikit-surprise-1.1.1      |   py38h1e00858_1         567 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.6 MB






  current version: 4.8.3
  latest version: 4.10.1

Please update conda by running

    $ conda update -n base -c defaults conda




The following NEW packages will be INSTALLED:

  python_abi         conda-forge/win-64::python_abi-3.8-1_cp38
  scikit-surprise    conda-forge/win-64::scikit-surprise-1.1.1-py38h1e00858_1

The following packages will be UPDATED:

  conda                       pkgs/main::conda-4.8.3-py38_0 --> conda-forge::conda-4.10.1-py38haa244fe_0



Downloading and Extracting Packages

scikit-surprise-1.1. | 567 KB    |            |   0% 
scikit-surprise-1.1. | 567 KB    | 2          |   3% 
scikit-surprise-1.1. | 567 KB    | 5          |   6% 
scikit-surprise-1.1. | 567 KB    | ##8        |  28% 
scikit-surprise-1.1. | 567 KB    | ####2      |  42% 
scikit-surprise-1.1. | 567 KB    | ######2    |  62% 
scikit-surprise-1.1. | 567 KB    | ########1  |  82% 
scikit-surprise-1.1. | 567 KB    | ########## | 100% 

python_abi-3.8       | 4 KB      |            |   0% 
python_abi-3.8       | 4 KB      | ########## | 100% 

conda-4.10.1         | 3.1 MB    |            |   0% 
conda-4.10.1         | 3.1 MB

In [111]:
import surprise
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [112]:
reader = Reader(rating_scale=(0.5, 5.0))

# Converting Pandas Dataframe to Surpise format

In [113]:
data = Dataset.load_from_df(top_ratings_df[['userID', 'productID', 'ratings']],reader)

In [114]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.3,random_state=0)

In [115]:
type(trainset)

surprise.trainset.Trainset

# Training the model

**KNNWithMeans**

In [116]:
algo_user = KNNWithMeans(k=10, min_k=6, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_user.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x22e0e211f10>

**SVD**

In [119]:
svd_model = SVD(n_factors=50,reg_all=0.02)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22e0e211820>

#  Evaluate both the models. ( Once the model is trained on the training data, it can be used to compute the error (like RMSE) on predictions made on the test data.) You can also use a different method to evaluate the models.

 **Popularity Recommender Model (RMSE)**


In [120]:
MSE = metric.mean_squared_error(pred_df['true_ratings'], pred_df['predicted_ratings'])
print('The RMSE value for Popularity Recommender model is', sqrt(MSE))

The RMSE value for Popularity Recommender model is 1.489930358479308


**Collaborative Filtering Recommender Model (RMSE)**

In [121]:
print(len(testset))
type(testset)

402


list

**KNNWithMeans**

In [122]:
# Evalute on test set
test_pred = algo_user.test(testset)
test_pred[0]

Prediction(uid='A11D1KHM7DVOQK', iid='B00005T39Y', r_ui=4.0, est=4.193169690501601, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [123]:
# compute RMSE
accuracy.rmse(test_pred) #range of value of error

RMSE: 1.0826


1.082561990239296

**SVD**

In [124]:
test_pred = svd_model.test(testset)

In [125]:
# compute RMSE
accuracy.rmse(test_pred)

RMSE: 1.0517


1.051664196582147

 **Parameter tuning of SVD Recommendation system**

In [126]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_factors' : [5,10,15], "reg_all":[0.01,0.02]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3,refit = True)

In [127]:
gs.fit(data)

In [128]:
# get best parameters
gs.best_params

{'rmse': {'n_factors': 10, 'reg_all': 0.02}}

In [129]:
# Use the "best model" for prediction
gs.test(testset)
accuracy.rmse(gs.test(testset))

RMSE: 0.8976


0.897648395759205

# Get top - K ( K = 5) recommendations. Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.

In [130]:
from collections import defaultdict
def get_top_n(predictions, n=5):
  
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [131]:
top_n = get_top_n(test_pred, n=5)

In [132]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

A11D1KHM7DVOQK ['B00005T39Y', 'B0001IXUEE', 'B000069106', 'B00007DN1E', 'B0006B486K']
A38QEATH3NODFM ['B00009QSET', 'B000EUJ1Q0', 'B000BONJXU', 'B0001H4BMU', 'B000F3H9UG']
A1FR68QH6Z4YZM ['B0000AKVHF', 'B00020KPB2', 'B000FKQ8LA', 'B000AZ57M6', 'B0000DYTDJ']
A2AY4YUOX2N1BQ ['B0002LEMWE', 'B00009UT3F', 'B00006RH5I', 'B0002L5R78', 'B0007QU6WY']
A2AEZQ3DGBBLPR ['B00005QXWI', 'B000A3WS20', 'B00004Y2MM', 'B000066TPN', 'B000056SSM']
A231WM2Z2JL0U3 ['B00004VX3T', 'B00005QFZG', 'B000050NON', 'B00006IS65', 'B00005QXWI']
A5JLAU2ARJ0BO ['B00005MOZG', 'B0001F22PA', 'B000BKJZ9Q', 'B00008I9K8', 'B0000CEOYI']
A2R6RA8FRBS608 ['B00004Z5M1', 'B0000E1717', 'B0002Z240M', 'B0009MX5R4', 'B00006IW1X']
A3OXHLG6DIBRW8 ['B000BKY3Q6', 'B000B5H2BE', 'B000F5FKHI', 'B0009PAYY8', 'B000G5S4JI']
AGXV8E7XK9J6H ['B000AZ57M6', 'B0007WK8KS', 'B00009R6TA', 'B0002YE686', 'B00092GM0Q']
A680RUE1FDO8B ['B0002CPBUK', 'B000E5CYW8', 'B0009S5HQA', 'B0002MPMKE', 'B000AA2IC8']
ALUNVOQRXOZIA ['B000EVS4YE', 'B00006G2OJ', 'B00065AO0K', 

# Case 3: Cluster method search based recommendation

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [22]:
product_descriptions = pd.read_csv('C:/College work/ML Database/product.csv')
product_descriptions = product_descriptions.dropna()
product_descriptions.shape

(100, 2)

In [23]:
product_descriptions1 = product_descriptions

In [24]:
vectorizer = TfidfVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(product_descriptions1["product_description"])
X=X1

kmeans = KMeans(n_clusters = 10, init = 'k-means++')
y_kmeans = kmeans.fit_predict(X)

In [28]:
def print_cluster(i):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

In [33]:
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X1)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [34]:
def show_recommendations(product):
    #print("Cluster ID:")
    Y = vectorizer.transform([product])
    prediction = model.predict(Y)
    #print(prediction)
    print_cluster(prediction[0])

In [35]:
show_recommendations("Water")

Cluster 9:
 water
 help
 head
 design
 toilet
 helps
 sprinkler
 stainless
 easy
 resist
