## E-COMMERCE RECOMMENDER SYTEM

## MODELLING

## Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
# from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

## Loading data

In [3]:
data = pd.read_csv('/Users/judith/Data_science_projects/Springboard_AssignmentsJY/capstone_three/data/processed/processed_ratings_data.csv')

In [4]:
data.head()

Unnamed: 0,item_id,user_id,rating
0,7443,Alex,4
1,7443,carolyn.agan,3
2,7443,Robyn,4
3,7443,De,4
4,7443,tasha,4


In [5]:
# checking that there is no missing values in the data
data.isnull().sum()

item_id    0
user_id    0
rating     0
dtype: int64

**Key takeouts from data wrangling and EDA suggest the following approaches:**

- Collaborative Filtering
    - we can leverage collaborative filtering to design the recommender system using a model that use historical data but also learn patterns as the number of ratings increases. For this we will apply 2 algorithms: KNN and SVD. 

- Data qualiy
    - There is considerable number of products with low ratings which is the case for the 'cold start' problem. Hence for our model to be effective, we will only consider in users having at least 3 reviews (average is 43 reviews per user).

- Modelling
    - Since there are far more users than products, it seems logical to use a user approach. This will be the first model using KNN.

## Data munging

In [6]:
# Grouping the dataframe to have the number of unique users
data_grouped = data.groupby(['user_id', 'item_id']).size().groupby('user_id').size()

In [7]:
# Calculating the number of users with at least 3 reviews to include in the model
data_short = data_grouped[data_grouped >= 3].reset_index()[['user_id']]
print('Total number of users {}'.format(len(data_grouped)))
print('Number of users with at least 10 ratings {}'.format(len(data_short)))

Total number of users 44783
Number of users with at least 10 ratings 6866


In [8]:
# merging data_short and the data to have a final dataset ready for modelling
selection = data.merge(data_short, how = 'right',left_on = 'user_id', right_on = 'user_id')
selection.head()

Unnamed: 0,item_id,user_id,rating
0,105202,19lovelikecrazy95,5
1,57369,19lovelikecrazy95,4
2,118317,19lovelikecrazy95,3
3,32406,1dianaoliver,3
4,116313,1dianaoliver,1


In [9]:
print('Total number of interactions: {}'.format(len(data)))
print('Total number of interactions from users with at least 3 reviews: {}'.format(len(selection)))

Total number of interactions: 99892
Total number of interactions from users with at least 3 reviews: 54365


## Split train and test sets

In [10]:
train, test = train_test_split(selection, stratify = selection['user_id'],
                              test_size = 0.2, random_state = 42)

print('train size = {}'.format(len(train)))
print('test size = {}'.format(len(test)))

train size = 43492
test size = 10873


## Matrix Factorization

In [11]:
# creating a sparse matrix
rating_pivot = train.pivot(index = 'user_id', 
                          columns = 'item_id', 
                          values = 'rating').fillna(0)

In [12]:
rating_pivot.head()

item_id,6454,7443,11960,16411,21296,22563,24853,27439,27590,28252,...,155090,155165,155293,155305,155307,155308,155317,155537,155597,155950
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19lovelikecrazy95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1dianaoliver,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3chuckleheads,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4jess,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7578042,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
rating_matrix = rating_pivot.to_numpy()
rating_csr_matrix = csr_matrix(rating_matrix)

In [14]:
U, sigma, Vt = svds(rating_csr_matrix, k = 15)

In [15]:
U.shape

(6866, 15)

In [16]:
Vt.shape

(15, 1007)

In [17]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [18]:
predicted_rating = np.dot(np.dot(U, sigma), Vt)
predicted_rating

array([[ 2.75192563e-04,  1.78305106e-01,  1.65969500e-01, ...,
        -5.95479371e-03, -6.72155157e-04, -2.80975475e-05],
       [-2.46172277e-03,  1.18905471e-01,  4.17804844e-02, ...,
        -5.72085963e-03, -4.22946042e-04,  1.89071601e-02],
       [ 4.93324727e-02,  1.63123000e-01, -9.94252603e-02, ...,
        -4.66037631e-03,  5.96405671e-04, -1.01144399e-02],
       ...,
       [ 1.36364624e-03, -1.63094334e-03, -6.01779358e-05, ...,
         4.23475113e-04,  2.71693816e-05,  3.32806537e-04],
       [ 1.11944561e-01,  3.70358557e-02,  1.67525901e-01, ...,
         3.81931126e-02, -8.16698957e-04,  3.78808114e-02],
       [ 3.22600405e-02,  4.10092366e-02,  4.55589574e-02, ...,
         1.25956345e-02, -6.79596620e-05,  1.26729309e-02]])

In [19]:
predicted_rating_norm = (predicted_rating - predicted_rating.min()) / (predicted_rating.max() - predicted_rating.min())

In [20]:
users_ids = list(rating_pivot.index)

In [21]:
predicted_rating_df = pd.DataFrame(predicted_rating_norm, columns = rating_pivot.columns,
                                  index = users_ids).transpose()

In [22]:
predicted_rating_df.head()

Unnamed: 0_level_0,19lovelikecrazy95,1dianaoliver,3chuckleheads,4jess,7578042,92647dlb,92dresstoimpress,:),?mily,A,...,zigzag487,zoe,zoe.emery,zoefaye,zoiekate,zolaluna,zombieroses23,zoshia,zulemaphone,zurajohnson
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6454,0.317101,0.316887,0.320944,0.31715,0.317925,0.320762,0.318979,0.319354,0.316738,0.321356,...,0.321548,0.318285,0.316347,0.317765,0.317615,0.31684,0.317342,0.317187,0.325848,0.319607
7443,0.331046,0.326393,0.329857,0.333873,0.321247,0.317648,0.320768,0.336872,0.337759,0.404913,...,0.316702,0.321365,0.301742,0.314704,0.327746,0.308792,0.326426,0.316952,0.319981,0.320292
11960,0.33008,0.320352,0.309292,0.340135,0.32616,0.327684,0.321666,0.320558,0.334606,0.334834,...,0.332677,0.331673,0.326157,0.316515,0.329188,0.307839,0.330976,0.317075,0.330202,0.320648
16411,0.328129,0.350113,0.329705,0.322218,0.329766,0.327733,0.319856,0.3628,0.318562,0.446889,...,0.320433,0.32906,0.332837,0.31594,0.327906,0.289245,0.325899,0.316986,0.332266,0.320676
21296,0.307056,0.315899,0.61,0.302002,0.322852,0.318631,0.33731,0.309436,0.339253,0.621716,...,0.299343,0.30631,0.328104,0.318141,0.323578,0.310778,0.299958,0.317464,0.311008,0.315589


In [57]:
predictions_df = predicted_rating_df.stack().reset_index()
predictions_df.head()

Unnamed: 0,item_id,level_1,0
0,6454,19lovelikecrazy95,0.317101
1,6454,1dianaoliver,0.316887
2,6454,3chuckleheads,0.320944
3,6454,4jess,0.31715
4,6454,7578042,0.317925


In [60]:
predictions_df.rename(columns = {'item_id': 'item_id', 
                                                  'level_1': 'user_id',
                                                 0: 'est_rating'}, inplace = True)

In [66]:
predictions_df['rating'] = data['rating']

In [64]:
predictions_df.head()

Unnamed: 0,item_id,user_id,est_rating,rating
0,6454,19lovelikecrazy95,0.317101,4.0
1,6454,1dianaoliver,0.316887,3.0
2,6454,3chuckleheads,0.320944,4.0
3,6454,4jess,0.31715,4.0
4,6454,7578042,0.317925,4.0


In [1]:
from sklearn.metrics.pairwise import linear_kernel

In [4]:
raw_data = pd.read_csv('/Users/judith/Data_science_projects/Springboard_AssignmentsJY/capstone_three/data/raw/df_modcloth.csv')

In [5]:
raw_data

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99888,154797,BernMarie,5,2019-06-26 21:15:13.165000+00:00,6.0,Just right,Large,Small&Large,Dresses,,2017,0
99889,77949,Sam,4,2019-06-26 23:22:29.633000+00:00,4.0,Slightly small,Small,Small&Large,Bottoms,,2014,2
99890,67194,Janice,5,2019-06-27 00:20:52.125000+00:00,,Just right,Small,Small&Large,Dresses,,2013,2
99891,71607,amy,3,2019-06-27 15:45:06.250000+00:00,,Slightly small,Small,Small&Large,Outerwear,Jack by BB Dakota,2016,2


In [14]:
def create_soup(x):
    return ' '.join(x['size']) + ' ' + ' '.join(x['user_attr'])

In [None]:
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [22]:
df = raw_data[['size', 'user_attr']]
df.dropna()

Unnamed: 0,size,user_attr
119,1.0,Small
121,2.0,Small
123,2.0,Small
125,3.0,Small
129,3.0,Small
...,...,...
99853,1.0,Small
99858,6.0,Large
99880,7.0,Large
99888,6.0,Large


In [25]:
df['xxx'] = df['size'].map(str) + df['user_attr'].map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
df

Unnamed: 0,size,user_attr,xxx
0,,Small,nanSmall
1,,,nannan
2,,Small,nanSmall
3,,,nannan
4,,Small,nanSmall
...,...,...,...
99888,6.0,Large,6.0Large
99889,4.0,Small,4.0Small
99890,,Small,nanSmall
99891,,Small,nanSmall


In [16]:
features= pd.read_csv(r'/Users/judith/Data_science_projects/Springboard_AssignmentsJY/capstone_three/data/processed/processed_products_data.csv')

In [17]:
features.head()

Unnamed: 0,size,year,split,fit_Slightly large,fit_Slightly small,fit_Very large,fit_Very small,user_attr_Small,model_attr_Small&Large,category_Dresses,category_Outerwear,category_Tops
0,0.275,0.222222,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.725,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.275,0.222222,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.725,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.275,0.222222,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [18]:
df = raw_data[['size', 'year']]

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['xxx'])

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

NameError: name 'count_matrix' is not defined