# TME4 Recommendation

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import random

from sklearn import cross_validation
from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error
from math import sqrt

import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Recharge de données MovieLens

In [3]:
def loadMovieLens(path='data/movielens'):
    # Get movie titles
    movies={}
    for line in open(path+'/u.item', encoding = "ISO-8859-1"):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    
    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
    return prefs

#perfs = loadMovieLens()
#print (perfs)
## {user_id :{'item_name': rating, 'item_name': rating}, user_id:{.....} }

#La fonction Root Mean Squared Error
def rmse(prediction, X):
    prediction = prediction[X.nonzero()].flatten() 
    X = X[X.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, X))


path='data/movielens'
info = open(path+'/u.info')
n_users = info.readline().split(' ')[0]
n_items = info.readline().split(' ')[0]
n_ratings = info.readline().split(' ')[0]
#n_users = 943
#n_items = 1682
nbr_users = int(n_users)
nbr_items = int(n_items)
#Construction du tableau de rang
Ratings = np.zeros((nbr_users,nbr_items))
#le format de <<u.data>> user id | item id | rating | timestamp  
for line in open(path+'/u.data'):
    (idUser, idItem, rating, ts) = line.split('\t')
    Ratings[int(idUser)-1][int(idItem)-1]=rating
print ("Le tableau de rang : ")
print (Ratings)

'\nlistsNbrSingular = []\nfor i in range(1,100):\n    u, s, vt = svds(train_data, k = i)\n    s_diag_matrix=np.diag(s)\n    X_pred = np.dot(np.dot(u, s_diag_matrix), vt)\n    listsNbrSingular.append(rmse(X_pred, train_data))\n    \nprint (listsNbrSingular)\n'

# Filtrage Collaborative

# Version SVD

In [None]:
#Calcule le en 
listsNbrSingular = []
for i in range(1,100):
    u, s, vt = svds(Ratings, k = i)
    s_diag_matrix=np.diag(s)
    X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
    listsNbrSingular.append(rmse(X_pred, Ratings))

print ("Le tableau de rang prédit en utilisant SVD : ")
print (X_pred)

In [None]:
plt.plot(range(len(listsNbrSingular)), listsNbrSingular, marker='.')
plt.title(' Matrix factorization by SVD')
plt.xlabel('Number of hidden features')
plt.ylabel('RMSE')
plt.legend()
plt.grid()
plt.show()

# Version Modèle Classique L2

In [4]:
plt.close()
K = 5
P, s, Q = svds(Ratings, K)

In [4]:
ErreursClassique = []
def modele_classique_L2(R, P, Q, K, steps=100, alpha=0.005, lamda=0.02):
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    #descente de gradient stochastique
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * ( eij * Q[k][j] - lamda * P[i][k] ) 
                        Q[k][j] = Q[k][j] + alpha * ( eij * P[i][k] - lamda * Q[k][j] )
        eR = np.dot(P,Q)
        e = rmse(eR, R)
        ErreursClassique.append(e)
    return P, Q, eR

nP, nQ, nR= modele_classique_L2(Ratings, P, Q, K)
print ("Le tableau de rang prédit en utilisant le Modèle Classique L2 : ")
print (nR)

2.70155394066
1.42062509299
1.13408792312
1.03376478171
0.98622221862
0.958774154745
0.940337148685
0.926597763023
0.915692657407
0.906700256713
0.899087552571
0.892509662872
0.886731630223
0.881588990233
0.876963058013
0.87276503382
0.868926328719
0.865392873895
0.862121659523
0.85907840709
0.856235812103
0.853572111063
0.851069877201
0.848715008884
0.846495894228
0.84440274025
0.84242705407
0.840561260464
0.838798436162
0.837132138324
0.835556303677
0.834065196315
0.832653385526
0.831315739436
0.830047424758
0.828843906741
0.827700946394
0.826614593942
0.825581178701
0.824597296033
0.823659792299
0.822765748659
0.821912464445
0.821097440678
0.820318364141
0.819573092275
0.818859639054
0.818176161933
0.817520949858
0.816892412331
0.816289069467
0.815709542973
0.815152547964
0.814616885555
0.814101436126
0.813605153211
0.813127057936
0.812666233941
0.812221822744
0.811793019496
0.811379069093
0.810979262594
0.810592933933
0.810219456893
0.809858242307
0.809508735482
0.809170413818
0.80

# Version Modèle avec biais utilisateur et items

In [6]:
ErreursBiais = []
def modele_biais_users_items(R, P, Q, K, steps=50, alpha=0.005, lamda=0.02):
    miu = R.sum()/float(n_ratings)
    eR = np.zeros((nbr_users,nbr_items))
    bu=[]
    for i in range(nbr_users):
        bu.append(R[i].sum()/(len(R[i].nonzero()[0])*1.0) - miu)
    bit=[]
    for j in range(nbr_items):
        bit.append(R[:,j].sum()/(len(R[:,j].nonzero()[0])*1.0) - miu)
    for step in range(steps):
        for i in range(len(R)):
            bi = bu[i]
            for j in range(len(R[i])):
                bj = bit[j]
                if R[i][j] > 0:
                    #descente de gradient stochastique
                    eij = R[i][j] - (miu + bi + bj + np.dot(P[i,:],Q[:,j]))
                    bi = bi + alpha * (eij - lamda * bi)
                    bj = bj + alpha * (eij - lamda * bj)
                    miu = miu + alpha * (eij - lamda * miu)
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * ( eij * Q[k][j] - lamda * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * ( eij * P[i][k] - lamda * Q[k][j])
                eR[i][j] = miu + bi + bj + np.dot(P[i,:],Q[:,j]) 
        e = rmse(eR, R)
        ErreursBiais.append(e)
    return P, Q, eR

P = np.random.rand(nbr_users,K)
Q = np.random.rand(K,nbr_items)
nP, nQ, nR= modele_biais_users_items(Ratings, P, Q, K)
print (nR)

plt.plot(range(len(ErreursClassique)), ErreursClassique, marker='.', label='Modele classique')
plt.plot(range(len(ErreursBiais)), ErreursBiais, marker='.', label='Biais users et items')
plt.title(' Matrix factorization')
plt.xlabel('Number of Epochs')
plt.ylabel('RMSE')
plt.legend()
plt.grid()
plt.show()


0.95308187752
[[ 6.29054813  5.16570656  4.66832609 ...,  1.80195862  2.60468747
   3.11052411]
 [ 5.01060397  4.68389322  3.45974871 ...,  2.03338019  3.19743165
   3.1962107 ]
 [ 4.26961307  3.75713999  2.70475468 ...,  1.26746554  2.36789496
   2.50554536]
 ..., 
 [ 4.42070269  3.85209334  3.78105471 ...,  2.17837706  2.93914336
   3.28312649]
 [ 5.36822899  4.95666814  4.60637186 ...,  2.34821884  3.15398639
   3.70621178]
 [ 2.97099663  2.61925815  2.11533593 ...,  1.6940496   2.66555658
   3.09590286]]
0.921201989638
[[ 3.69630101  3.02964268  2.92771917 ...,  1.80194875  2.61068425
   3.11834958]
 [ 3.72980628  3.6965172   2.71799531 ...,  1.77216474  2.92859018
   2.93128623]
 [ 3.09611586  2.97324642  2.19857371 ...,  1.05699847  2.14091549
   2.29338857]
 ..., 
 [ 4.41291481  3.76415189  3.75922161 ...,  2.14892152  2.91191388
   3.24577342]
 [ 5.03890524  4.52062525  4.31632231 ...,  2.33233025  3.1548021
   3.63286746]
 [ 3.36857193  3.0017757   2.52730669 ...,  1.67174611 

"\nplt.plot(range(len(ErreursClassique)), ErreursClassique, marker='.', label='Modele classique')\nplt.plot(range(len(ErreursBiais)), ErreursBiais, marker='.', label='Biais users et items')\nplt.title(' Matrix factorization with bias')\nplt.xlabel('Number of Epochs')\nplt.ylabel('RMSE')\nplt.legend()\nplt.grid()\nplt.show()\n"