In [2]:
import pandas as pd
import math
import numpy as np
import scipy
import pymongo
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import os
import warnings
from sklearn.neighbors import NearestNeighbors

warnings.filterwarnings('ignore')
%matplotlib inline
data = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/KaDo.csv')
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_rows', 10)

# We explore the data and reshape our DataFrames according to our needs

In [3]:
# Quantity bought per (product, client): count the rows of each pair, then keep
# a single row per pair (the last one encountered).
pair_keys = ['LIBELLE', 'CLI_ID']
data['QTY'] = data.groupby(pair_keys)['LIBELLE'].transform('size')
data = data.drop_duplicates(subset=pair_keys, keep='last')

# Product ids: number the distinct product labels from 1 to N.
dd = (
    data[['LIBELLE']]
    .drop_duplicates(subset=['LIBELLE'], keep='last')
    .reset_index(drop=True)
)
dd.insert(0, 'PROD_ID', range(1, len(dd) + 1))

# Attach the product id to every purchase row through the product label.
data = data.merge(dd, on="LIBELLE")

def prix_cat(x):
    """Return the price-bracket label for a net price.

    Brackets are upper-inclusive: "0-5" (x <= 5), "5-10", "10-15", "15-20",
    and "20-inf" for anything above 20. A value that satisfies none of the
    comparisons (e.g. NaN) yields None.
    """
    upper_bounds = ((5, "0-5"), (10, "5-10"), (15, "10-15"), (20, "15-20"))
    for bound, label in upper_bounds:
        # Bounds are scanned in ascending order, so the first match is the bracket.
        if x <= bound:
            return label
    if x > 20:
        return "20-inf"
    return None

# Label every purchase row with the price bracket of its net price.
data['PRIX_CAT'] = data['PRIX_NET'].apply(prix_cat)
data

Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID,QTY,PROD_ID,PRIX_CAT
0,35592159,10,1.66,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1490281,2,1297,0-5
1,34358800,6,3.00,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,69813934,1,1297,0-5
2,34419587,6,1.95,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,372658871,1,1297,0-5
3,34621911,7,1.50,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,385760870,1,1297,0-5
4,35894519,11,1.50,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,626801858,1,1297,0-5
...,...,...,...,...,...,...,...,...,...,...,...
5911144,36496993,12,4.95,MAQUILLAGE,MAQ_TEINT Font teint,MAQ_TEINT,VOILE TT LUM3 BEIGE SOLEIL T30ML,994960174,1,293,0-5
5911145,33616009,3,5.90,MAQUILLAGE,MAQ_Autres,MAQ_AUTRE,BIJOU PEAU FLEURS DOREES A/T MAI11 LU3,995025793,1,342,5-10
5911146,36512280,12,1.65,MAQUILLAGE,MAQ_ONG Vernis LUM,MAQ_ONGLES,VAO ROSE A L'EAU A/T MAI10 LUM3 3ML,995197807,1,349,0-5
5911147,36056792,11,14.90,MAQUILLAGE,MAQ_YEUX Fard,MAQ_YEUX_MASCA_EYEL_FARD,PAL YX AMB/CANN/CHAT NOEL11 A/T LU3 3X1G,995330812,1,356,10-15


<br/><br/><br/><br/>
# We split the main dataset into two sets:
    - items_data for all the metadata of each product
    - client_data for all the data regarding each client's purchases and rating

In [4]:
# Product metadata table: keep the product-level columns and a single row per
# product id (the last occurrence is the one retained).
product_columns = ['PRIX_NET', 'FAMILLE', 'UNIVERS', 'MAILLE', 'LIBELLE', 'PROD_ID', 'PRIX_CAT']
items_data = (
    data[product_columns]
    .drop_duplicates(subset=['PROD_ID'], keep='last')
    .reset_index(drop=True)
)
items_data

Unnamed: 0,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,PROD_ID,PRIX_CAT
0,1.67,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1297,0-5
1,8.95,SOINS DU VISAGE,VIS_CJOUR Jeunes Specifique,VIS_JEUNE_ET_LEVRE,CR JR PARF BIO.SPE AC.SENT.50ML,890,5-10
2,5.95,SOINS DU VISAGE,VIS_DEMAQ AAAR,VIS_AAAR_DEMAQLOTION,EAU MICELLAIRE 3 THES FL200ML,781,5-10
3,1.95,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 TIARE FL 200ML,1315,0-5
4,15.95,PARFUMAGE,PARF_EAUX PARFUMS,PARF_PARFUM,EDT UN MATIN AU JARDIN 100ML MUGUET,1401,15-20
...,...,...,...,...,...,...,...
1479,4.95,MAQUILLAGE,MAQ_TEINT Font teint,MAQ_TEINT,VOILE TT LUM3 BEIGE SOLEIL T30ML,293,0-5
1480,5.90,MAQUILLAGE,MAQ_Autres,MAQ_AUTRE,BIJOU PEAU FLEURS DOREES A/T MAI11 LU3,342,5-10
1481,1.65,MAQUILLAGE,MAQ_ONG Vernis LUM,MAQ_ONGLES,VAO ROSE A L'EAU A/T MAI10 LUM3 3ML,349,0-5
1482,14.90,MAQUILLAGE,MAQ_YEUX Fard,MAQ_YEUX_MASCA_EYEL_FARD,PAL YX AMB/CANN/CHAT NOEL11 A/T LU3 3X1G,356,10-15


In [5]:
# Ratings are derived from the purchased quantity (the scale is arbitrary).
def t(val):
    """Convert a purchased quantity into a rating.

    1 -> 2, 2 -> 3, 3 -> 4, and 4 or more -> 5. Any other value
    (e.g. 0, a negative number, a non-integer below 4, NaN) yields None.
    """
    rating_by_qty = {1: 2, 2: 3, 3: 4}
    if val in rating_by_qty:
        return rating_by_qty[val]
    return 5 if val >= 4 else None
    
    

# Interaction table: one row per (client, product) with the purchased quantity
# and the rating derived from it, ordered by client id.
client_data = (
    data[['CLI_ID', 'PROD_ID', 'QTY']]
    .reset_index(drop=True)
    .assign(RATING=lambda frame: frame['QTY'].apply(t))
    .sort_values(by=['CLI_ID'])
    .reset_index(drop=True)
)
client_data


Unnamed: 0,CLI_ID,PROD_ID,QTY,RATING
0,1490281,1297,2,3
1,1490281,781,1,2
2,1490281,1315,1,2
3,1490281,890,1,2
4,13290776,1437,1,2
...,...,...,...,...
5911144,997385337,1470,1,2
5911145,997385337,1484,1,2
5911146,997385337,1476,4,5
5911147,997385337,1467,1,2


<br/><br/><br/><br/>
#### We split our client_data dataset with a conventional 80/20 split

In [6]:

# Deterministic (non-shuffled) ~80/20 split that keeps user profiles intact.
# client_data is sorted by CLI_ID, so a purely positional cut can fall in the
# middle of one client's rows; the cut is moved forward to the end of that
# client's block so each client lands entirely in one of the two sets.
TRAIN_FRACTION = 0.8
p = round(len(client_data) * TRAIN_FRACTION)
if 0 < p < len(client_data):
    last_train_client = client_data['CLI_ID'].iloc[p - 1]
    # searchsorted requires CLI_ID to be sorted ascending; side='right' gives the
    # position just after the last row of that client.
    p = int(client_data['CLI_ID'].searchsorted(last_train_client, side='right'))
interactions_train_df = client_data[:p]
interactions_test_df = client_data[p:]



<br/><br/><br/><br/><br/><br/><br/>
# ##########################################
# COLLABORATIVE FILTERING ################
# ##########################################

#### popular latent factor model named Singular Value Decomposition (SVD)

In [7]:
# User x item rating table: clients as rows, products as columns, and 0 for
# every (client, product) pair without a recorded purchase.
pivot_axes = dict(index='CLI_ID', columns='PROD_ID', values='RATING')
users_items_pivot_matrix_df = client_data.pivot(**pivot_axes).fillna(0)

users_items_pivot_matrix_df.head(10)

PROD_ID,1,2,3,4,5,6,7,8,9,10,...,1475,1476,1477,1478,1479,1480,1481,1482,1483,1484
CLI_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1490281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13290776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20163348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20200041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20561854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20727324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20791601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21046542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21239163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21351166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Raw rating values of the pivot table as a NumPy array (row/column labels dropped).
users_items_pivot_matrix = users_items_pivot_matrix_df.values
# Preview the first 10 client rows.
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Client ids in the same order as the rows of the pivot table.
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[1490281,
 13290776,
 20163348,
 20200041,
 20561854,
 20727324,
 20791601,
 21046542,
 21239163,
 21351166]

In [10]:
# Store the rating array in SciPy's Compressed Sparse Row format.
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

<853514x1484 sparse matrix of type '<class 'numpy.float64'>'
	with 5911149 stored elements in Compressed Sparse Row format>

In [11]:
# Number of latent factors (k) kept when factorizing the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
# Truncated SVD of the sparse user-item matrix: returns the user factors (U),
# the k singular values (sigma, as a vector) and the item factors (Vt).
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [12]:
# Sanity check on the dimensions of the user-factor matrix.
U.shape

(853514, 15)

In [13]:
# Sanity check on the dimensions of the item-factor matrix.
Vt.shape

(15, 1484)

In [14]:
# Expand the vector of singular values into a diagonal matrix so it can be used
# in the matrix product that rebuilds the ratings.
# Guarded on ndim to keep the cell idempotent: np.diag applied to a 2-D array
# extracts its diagonal, so an unguarded re-run would turn sigma back into a vector.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma.shape

(15, 15)

<br/><br/><br/><br/>
# After the factorization, we reconstruct an approximation of the original matrix by multiplying its factors back together. The resulting matrix is no longer sparse: it now contains predicted scores for items the user has not yet interacted with, which we will exploit for recommendations.

In [15]:
# Multiply the three SVD factors back together to obtain a dense matrix of
# predicted scores (clients as rows, products as columns).
all_user_predicted_ratings = (U @ sigma) @ Vt
all_user_predicted_ratings

array([[-1.95979204e-06, -2.48440287e-08, -7.59857232e-07, ...,
         9.82836896e-04,  8.48466052e-04,  9.37980488e-04],
       [ 4.96589109e-06,  2.27505388e-06, -1.64149320e-07, ...,
         3.14231889e-03,  4.89162445e-03,  1.44752399e-03],
       [ 4.12316377e-08,  1.93952651e-08,  1.93021984e-08, ...,
         3.57634659e-05,  6.65670104e-05,  1.29624773e-05],
       ...,
       [-1.42337706e-05,  4.62947968e-06,  3.14674400e-05, ...,
         1.13460835e-02,  3.62659688e-02,  5.15648915e-03],
       [ 3.56133526e-05,  8.23952136e-06, -8.95536480e-06, ...,
         2.76232366e-02,  4.02545750e-02,  6.94526902e-03],
       [ 9.86835619e-05,  1.13244484e-05, -1.36278623e-06, ...,
         2.47123091e-02,  3.06561365e-02,  9.01511629e-03]])

In [16]:
# Min-max scale the predicted scores to [0, 1] using the global extrema of the
# whole matrix (not per client).
ratings_min = all_user_predicted_ratings.min()
ratings_max = all_user_predicted_ratings.max()
all_user_predicted_ratings_norm = (all_user_predicted_ratings - ratings_min) / (ratings_max - ratings_min)

In [17]:
# Preview the min-max normalized prediction matrix.
all_user_predicted_ratings_norm

array([[0.24312363, 0.24312378, 0.24312372, ..., 0.2432005 , 0.24319001,
        0.243197  ],
       [0.24312417, 0.24312396, 0.24312377, ..., 0.24336907, 0.24350563,
        0.24323678],
       [0.24312379, 0.24312378, 0.24312378, ..., 0.24312657, 0.24312898,
        0.24312479],
       ...,
       [0.24312267, 0.24312414, 0.24312624, ..., 0.24400947, 0.24595475,
        0.2435263 ],
       [0.24312656, 0.24312443, 0.24312308, ..., 0.24528008, 0.2462661 ,
        0.24366594],
       [0.24313149, 0.24312467, 0.24312368, ..., 0.24505285, 0.24551684,
        0.24382751]])

# Now that our prediction matrix is done, we keep the top 5 predictions for each client and store them in our cloud database

- This step has already been run; it must only be run once.

In [None]:
# Disabled one-off export: the code below is wrapped in a bare string literal, so
# it is not executed when the cell runs.
# When enabled, it takes each client's row of normalized predictions, keeps the
# 5 top-ranked product ids and inserts one {"clientId", "recommendedItems"}
# document per client into the 'model2predictions' collection.
# NOTE(review): the scores are cast to str before sort_values, so the ranking is
# lexicographic rather than numeric — confirm this is intended before re-running.
# NOTE(review): the loop rebinds the name `t`, which is also the name of the
# quantity-to-rating helper defined earlier in the notebook.
# NOTE(review): the row count in range() is hard-coded to 853514.
# SECURITY(review): the connection string embeds a username and password.
'''
# insert items in mongo atlas
client = pymongo.MongoClient("mongodb+srv://dat:dat@clusterdat2.afein.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
db = client['datdb']
dataset = db['model2predictions']

# run once!
for i in range(0, 853514):
    t = pd.DataFrame(all_user_predicted_ratings_norm[(0+i):(1+i)], columns = users_items_pivot_matrix_df.columns, index=users_ids[(0+i):(1+i)]).transpose()
    t = t.astype(str)
    t = t.sort_values(by=t.columns[0], ascending=False)
    t = t.head(5)
    #print(t)
    #   t.columns[0]
    thisdict = {
      "clientId": str(t.columns[0]),
      "recommendedItems": t.index.tolist()
    }
    x = dataset.insert_one(thisdict)
'''

# Func to be used in backend API

In [5]:
# To get a user's item predictions : 
import pymongo

def nlpPredict(userId):
    """Fetch the stored recommendations of one client from MongoDB.

    Parameters
    ----------
    userId : str or int
        Client identifier. It is converted to ``str`` before the lookup,
        because ``clientId`` is stored as a string in the collection.

    Returns
    -------
    dict or None
        The matching document from the ``model2predictions`` collection
        (fields ``_id``, ``clientId``, ``recommendedItems``), or ``None``
        when no document exists for this client.
    """
    import os  # function-scope import keeps this cell self-contained

    # The connection string can be supplied through the MONGODB_URI environment
    # variable. SECURITY(review): the fallback below embeds a username/password;
    # rotate it and drop the default once the variable is set everywhere.
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://dat:dat@clusterdat2.afein.mongodb.net/myFirstDatabase?retryWrites=true&w=majority",
    )
    # The context manager closes the client on exit, so repeated calls do not
    # leave connections open.
    with pymongo.MongoClient(uri) as client:
        collection = client['datdb']['model2predictions']
        return collection.find_one({'clientId': str(userId)})

# Test with client n° 1490281

In [6]:
# Look up the stored recommendations for client 1490281 (ids are stored as strings).
# The lookup helper defined in this notebook is nlpPredict; no svdPredict is
# defined anywhere, so the previous call raised NameError on a fresh kernel.
x = nlpPredict('1490281')
x

{'_id': ObjectId('61d6d1c487bf3a793a90e5c0'),
 'clientId': '1490281',
 'recommendedItems': [1304, 1374, 1309, 1481, 1297]}