## Matrix Factorization with SVD - BINARY RATINGS

https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101

In [1]:
import import_ipynb

In [2]:
import pandas as pd
import scipy.sparse as sps
import numpy as np
from scipy.sparse.linalg import svds
from time import time
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision
import random

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


# Define Functions for SVD and Predict SVD

In [3]:
#--------------------------------------
# RETURN DECOMPOSITION MATRICES
#--------------------------------------

def SVD(num_factors):
    """
    Run a truncated SVD on the module-level sparse matrix M.

    input
        num_factors : number of singular values / vectors requested from svds
    return
        U     : left singular vectors as returned by svds
        sigma : singular values laid out on the diagonal of a square matrix
        Vt    : right singular vectors as returned by svds
    """
    # asfptype upcasts M to a floating-point dtype before it is handed to svds.
    float_matrix = M.asfptype()
    left, singular_values, right_t = svds(float_matrix, k=num_factors)
    return left, np.diag(singular_values), right_t

In [4]:
#--------------------------------------------------------------------
# PREDICT top_n TRACKS FOR A PID AND EVALUATE AGAINST GROUND TRUTH
#--------------------------------------------------------------------

def SVD_predict_and_evaluate_top_n(pid, U, sigma, Vt, top_n):
    """
    Recommend top_n tracks for one playlist and score them against its hold-out.

    input
        pid          : playlist id, used as the row index into M and U
        U, sigma, Vt : decomposition matrices as returned by SVD()
        top_n        : number of tracks to recommend
    return
        L_pred       : up to top_n predicted track_ids, highest score first,
                       excluding tracks present in the playlist's training row
        ground_truth : track_ids in the hold_out for this pid
        R_Prec       : R_Precision of the first len(ground_truth) predictions
        NDCG         : second element of the value returned by nDCG(res)
        res          : list of 0/1 flags, 1 where a prediction is in ground_truth
    """
    # Take the non-zero columns directly from the sparse row; densifying a
    # row as wide as M only to build a boolean mask is wasted work.
    train_cols = M[pid].nonzero()[1]
    # A set gives O(1) membership tests in the filter below.
    train_track_ids = set(track_id_array[train_cols].tolist())

    # Reconstruct the predicted scores for this playlist over all tracks.
    predicted = np.dot(np.dot(U[pid, :], sigma), Vt)
    ranked = np.flipud(predicted.argsort())

    # Over-fetch by the number of training tracks so that top_n candidates
    # are still available once the known tracks have been filtered out.
    candidates = ranked[:top_n + len(train_track_ids)]
    L_pred = [el for el in candidates if el not in train_track_ids][:top_n]

    # Column 0 of ev_set_arr holds the pid, column 1 the held-out track_id.
    ground_truth = ev_set_arr[ev_set_arr[:, 0] == pid][:, 1]
    held_out_ids = set(ground_truth.tolist())

    R_Prec = R_Precision(L_pred[:len(ground_truth)], ground_truth)

    res = [int(el in held_out_ids) for el in L_pred]

    NDCG = nDCG(res)[1]

    return L_pred, ground_truth, R_Prec, NDCG, res

In [5]:
#-------------------------------
# SAVE SVD EVALUATION RESULTS
#-------------------------------

def save_SVD_res_k_n(U, sigma, Vt, k = 15, n = 10, top_n = 500):
    """
    Evaluate the SVD model on n random evaluation playlists and save the scores.

    input
        U, sigma, Vt : decomposition matrices as returned by SVD()
        k     : number of factors (only used to label the rows and the file)
        n     : number of random playlists to predict
        top_n : number of tracks recommended per playlist (default 500)
    return
        RES : dict mapping pid -> [R-Precision, nDCG]
    side effects
        prints a progress counter every 500 playlists and the elapsed seconds,
        and writes '../evaluation/SVD_binary{k}_{n}.csv'
    raises
        ValueError if n is larger than the number of evaluation pids
        (raised by random.sample)
    """
    time0 = time()
    RES = {}
    for i, pid in enumerate(random.sample(evaluation_pids, n)):
        _, _, r_prec, ndcg, _ = SVD_predict_and_evaluate_top_n(pid, U, sigma, Vt, top_n)
        RES[pid] = [r_prec, ndcg]
        if i % 500 == 0:
            print(i)
    print(time() - time0)

    # Build the frame with explicit column names so that an empty result
    # (n = 0) still yields a well-formed, header-only CSV.
    rows = [(pid, scores[0], scores[1]) for pid, scores in RES.items()]
    df = pd.DataFrame(rows, columns=['pid', 'R-Precision', 'nDCG'])
    df['rating'] = 'binary'
    df['model'] = f'SVD_{k}'
    df.to_csv(f'../evaluation/SVD_binary{k}_{n}.csv', index=False)
    return RES

# Read Data

In [6]:
# Location of the training CSV (pid, track_uri, binary_rating).
file_path = '../data-processed/full-data/pid-track-binary-rating-train-data.csv'

In [7]:
# Load the training ratings and show the column dtypes.
data = pd.read_csv(file_path)
data.dtypes

pid               int64
track_uri        object
binary_rating     int64
dtype: object

In [8]:
# Preview the first rows of the raw training data.
data.head()

Unnamed: 0,pid,track_uri,binary_rating
0,491000,spotify:track:3giQ7393501IRNrd8iHugf,1
1,491000,spotify:track:3jpcVaeyNjWgjqIxAiWasz,1
2,491000,spotify:track:1uuqRaSJAiQ6VB8BWblXWJ,1
3,491000,spotify:track:7gXpcXwtmEiQzskYJmtGgk,1
4,491000,spotify:track:5wtIWwOtowY2howCZ7Veq2,1


In [9]:
# Distinct track URIs, in order of first appearance in the training data.
tracks = data['track_uri'].unique().tolist()

In [10]:
# Map every track URI to an integer id: its position in `tracks`.
D_tracks = {track: track_id for track_id, track in enumerate(tracks)}

# Inverse lookup: integer id -> track URI.
D_tracks_reverse = {track_id: track for track, track_id in D_tracks.items()}

# Attach the integer id to every training row; it is later used as the
# column index of the sparse matrix.
data['track_id'] = data.track_uri.map(D_tracks)

In [11]:
# Preview the training data again, now with the track_id column.
data.head()

Unnamed: 0,pid,track_uri,binary_rating,track_id
0,491000,spotify:track:3giQ7393501IRNrd8iHugf,1,0
1,491000,spotify:track:3jpcVaeyNjWgjqIxAiWasz,1,1
2,491000,spotify:track:1uuqRaSJAiQ6VB8BWblXWJ,1,2
3,491000,spotify:track:7gXpcXwtmEiQzskYJmtGgk,1,3
4,491000,spotify:track:5wtIWwOtowY2howCZ7Veq2,1,4


In [12]:
# Show the column dtypes after adding track_id.
data.dtypes

pid               int64
track_uri        object
binary_rating     int64
track_id          int64
dtype: object

In [13]:
# Load the evaluation ground truth and translate its track URIs to the
# integer ids used for training. URIs missing from D_tracks become NaN.
evaluation_set = pd.read_csv(
    '../data-processed/full-data/evaluation-pids-ground-truth.csv'
)
evaluation_set['track_id'] = evaluation_set.track_uri.map(D_tracks)

In [14]:
# Keep only the held-out rows and the three columns used for evaluation.
ev_set = evaluation_set.loc[evaluation_set['hold_out'] == 1, ['pid', 'track_id', 'hold_out']]
# Drop rows with no track_id (NaN after the URI -> id mapping). The explicit
# copy makes the assignment below act on an independent frame rather than
# on a slice of evaluation_set.
ev_set = ev_set[ev_set['track_id'].notna()].copy()
evaluation_pids = list(ev_set['pid'].unique())
ev_set['track_id'] = ev_set['track_id'].astype(int)

In [15]:
# NumPy view of ev_set; columns are pid, track_id, hold_out in that order.
ev_set_arr = ev_set.to_numpy()

# Define sparse matrix

In [16]:
# Playlist x track matrix: row = pid, column = track_id, value = binary_rating.
# No explicit shape is passed, so scipy infers it from the index arrays.
M = sps.csr_matrix((data.binary_rating, (data.pid, data.track_id)))

In [17]:
# Number of columns (tracks) in the matrix.
M.shape[1]

1996586

# Train - Predict - Evaluate

In [18]:
# Column indices 0 .. M.shape[1]-1; SVD_predict_and_evaluate_top_n indexes
# this array with a row mask to recover the training track ids of a playlist.
track_id_array = np.arange(M.shape[1])

### Save evaluation - needs to be uncommented for chosen k

In [19]:
# Number of random evaluation playlists passed to save_SVD_res_k_n below.
n=1000

### k=15

In [20]:
# k=15
# U, sigma, Vt = SVD(k)
# U.shape, sigma.shape, Vt.shape

In [21]:
# df = pd.DataFrame(save_SVD_res_k_n(U, sigma, Vt, k, n)).transpose()
# df.describe()

### k=25

In [22]:
# k=25
# U, sigma, Vt = SVD(k)
# U.shape, sigma.shape, Vt.shape

In [23]:
# df = pd.DataFrame(save_SVD_res_k_n(U, sigma, Vt, k, n)).transpose()
# df.describe()

### k=35

In [24]:
# k=35
# U, sigma, Vt = SVD(k)
# U.shape, sigma.shape, Vt.shape

In [25]:
# df = pd.DataFrame(save_SVD_res_k_n(U, sigma, Vt, k, n)).transpose()
# df.describe()

### k=45

In [26]:
# k=45
# U, sigma, Vt = SVD(k)
# U.shape, sigma.shape, Vt.shape

In [27]:
# df = pd.DataFrame(save_SVD_res_k_n(U, sigma, Vt, k, n)).transpose()
# df.describe()

### k=50

In [28]:
# k=50
# U, sigma, Vt = SVD(k)
# save = save_SVD_res_k_n(U, sigma, Vt, k, n)
# df = pd.DataFrame(save).transpose()
# df.describe()

### k=75

In [29]:
# k=75
# n=10000
# U, sigma, Vt = SVD(k)
# df = pd.DataFrame(save_SVD_res_k_n(U, sigma, Vt, k, n)).transpose()
# df.describe()