In [2]:
import os
import pickle
import numpy as np
import pandas as pd

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import NMF

## 1. Load rating data from MongoDB

In [3]:
# The MongoDB connection string embeds a username and password, so it is read
# from the environment instead of being stored in the notebook.
# NOTE(review): the credentials that were previously committed here should be rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string "
        "before running this notebook."
    )

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort connectivity check: report the failure and carry on, as before.
    print(e)

db = client['E-commerce']  # Access a specific database
collection = db['Rating']  # Access a specific collection

Pinged your deployment. You successfully connected to MongoDB!


In [4]:
# Pull every document from the rating collection into a DataFrame, discard
# MongoDB's internal `_id` field and key the rows by username.
ratings = (
    pd.DataFrame([doc for doc in collection.find()])
    .drop(columns=['_id'])
    .set_index("username")
)
ratings

Unnamed: 0_level_0,B0013HO2XK,B004UZCMRK,B008X10YRC,B00H7KTRO6,B015K4DKNY,B000HE5DUG,B0019ZDHVY,B004EXMS4U,B008IBOG7G,B01F47B8AO,...,B0012NI67S,B0011EX934,B0011MFOHA,B000XKUPHU,B000ZHZQAW,B0010DQ9TM,B0070Z9X5Q,B0013J1UTM,B0017PUT58,B000YC8NYY
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1JU8UJG9AXUBJ,5.0,5.0,5.0,5.0,5.0,,,,,,...,,,,,,,,,,
A35PPLVIPZLU36,,,,,,5.0,5.0,5.0,5.0,5.0,...,,,,,,,,,,
A2RLD0BYELLX30,,,,,,,,,,,...,,,,,,,,,,
A1HQ8WATO130TZ,,,,,,,,,,,...,,,,,,,,,,
A1OL2P82M1KFB3,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mark,,,,,,,,,,,...,,,,,,,,,,
,,,,,,,,,,,...,,,,,,,,,,
,,,,,,,,,,,...,,,,,,,,,,
,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# Select what is stored under the empty-string username and drop rows that
# contain no rating at all.
blank_user_rows = ratings.loc[""].dropna(how='all')
# Keep the first remaining row, restricted to the products it actually rated.
user = blank_user_rows.iloc[0].dropna()
user

B00004RH8I    3.0
B00004TBJD    1.0
B00004RHKU    4.0
B00004RHKW    5.0
B00004R9U1    2.0
Name: , dtype: float64

In [27]:
# Ad-hoc lookup: fetch one rating document whose `username` field is 'Parun'.
collection.find_one({'username': 'Parun'})

In [22]:
from combiner import Combiner

# Product-to-product similarity table; the first CSV column is used as the row index.
similarity_mtx = pd.read_csv("./experiment_outputs/product_sims_train.csv", index_col=0)

# NOTE(security): unpickling can execute arbitrary code, so only load model files
# that were produced by a trusted run of this project.
# The context manager makes sure the file handle is closed after loading.
with open('./experiment_outputs/mf_factorizer_train.pkl', 'rb') as f:
    factorizer = pickle.load(f)

In [23]:
# Column labels of the similarity table, as a plain Python list.
product_list = similarity_mtx.columns.to_list()

In [24]:
# Re-align the user's ratings to the full product list (one entry per product)...
user = pd.Series(data=user, index=product_list, name="")
# ...and use 0 for every product the user has no rating for.
user = user.fillna(0)
user

B0000223SI    0.0
B0000223SK    0.0
B0000223UV    0.0
B00002246J    0.0
B0000224J0    0.0
             ... 
B01HCQSHNG    0.0
B01HCVJ3K2    0.0
B01HDXZR5E    0.0
B01HDYEAOW    0.0
B01HEQVQAK    0.0
Name: , Length: 5327, dtype: float64

In [26]:
# Build the combiner from the raw similarity values and the loaded factorizer.
similarity_values = similarity_mtx.to_numpy()
combiner = Combiner(similarity_values, factorizer)

# Feed in the user's rating vector (missing values replaced by 0), then ask for
# the top 5 recommendations for the user whose name is the empty string.
combiner.ingest(user.fillna(0))
combiner.make_recommendations("", top_n=5)

B005B8LZ14    0.533891
B00004RHAO    0.313469
B000WGHZYG    0.258504
B006WMRVS4    0.221883
B001383CQW    0.182768
Name: , dtype: float64

## Train new model

### IBCF training

In [4]:
# `ratings` holds NaN for every product a user has not rated. pdist propagates NaN,
# and nan_to_num below would then turn the similarity of those item pairs into 0.
# Treat "not rated" as 0 before computing the item-item cosine distances.
distance_mtx = squareform(pdist(ratings.fillna(0).T, 'cosine'))
similarity_mtx = 1 - distance_mtx
# Any remaining NaN (e.g. from an all-zero item column, whose cosine distance is
# undefined) is scored as similarity 0.
similarity_mtx = np.nan_to_num(similarity_mtx)
similarity_df = pd.DataFrame(similarity_mtx, index=ratings.columns, columns=ratings.columns)
similarity_df

Unnamed: 0,B0000223SI,B0000223SK,B0000223UV,B00002246J,B0000224J0,B0000224MY,B0000225HB,B0000225HD,B0000225IO,B00002N6FE,...,B01H6J5QYC,B01HB6AOFG,B01HBPHSII,B01HBZYFT8,B01HCFJC0Y,B01HCQSHNG,B01HCVJ3K2,B01HDXZR5E,B01HDYEAOW,B01HEQVQAK
B0000223SI,1.000000,0.378856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B0000223SK,0.378856,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B0000223UV,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B00002246J,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B0000224J0,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01HCQSHNG,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000,0.0,0.0,0.000000
B01HCVJ3K2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.140785,0.0,0.0,0.0,1.000000,0.0,0.0,0.066939
B01HDXZR5E,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.000000
B01HDYEAOW,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.000000


### MF training

In [5]:
# Fixed random_state keeps the factorization reproducible between runs.
factorizer = NMF(n_components=100, max_iter=1000, random_state=42)
# NMF does not accept NaN input, and `ratings` uses NaN for unrated products,
# so missing ratings are replaced by 0 before fitting.
factorizer.fit(ratings.fillna(0))

### Overwrite old models

In [None]:
# Ensure the output folder exists (no error if it is already there), then
# write the similarity table to CSV, replacing any previous file.
os.makedirs('./experiment_outputs', exist_ok=True)
similarity_df.to_csv(path_or_buf="./experiment_outputs/product_sims_train.csv")

In [None]:
# Create the output folder here as well so this cell also works when it is run
# on its own, without the CSV export cell having been executed first.
os.makedirs('./experiment_outputs', exist_ok=True)

# Serialize the fitted factorizer, replacing any previously saved model file.
with open('./experiment_outputs/mf_factorizer_train.pkl', 'wb') as f:
    pickle.dump(factorizer, f)