# Recommender System (推荐系统)

In [1]:
import os
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.optimize as opt

In [2]:
# Directory holding the exercise data files that the following cells load.
# NOTE(review): hard-coded absolute path; point it at the local data folder
# before running.
root_dir = '/home/leedo/ML_Andrew_Ng/ML-exe/exe8-anomaly detection and recommendation/data'

In [3]:
# Load the ratings .mat file; keys() lists the variables stored in it.
data1 = sio.loadmat(os.path.join(root_dir,'ex8_movies.mat'))
data1.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [4]:
# Show the shapes of the Y and R arrays.
print(data1.get('Y').shape,data1.get('R').shape)

(1682, 943) (1682, 943)


In [5]:
# Wrap Y and R in DataFrames for inspection and preview the first rows of Y.
data_Y = pd.DataFrame(data1.get('Y'))
data_R = pd.DataFrame(data1.get('R'))
data_Y.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,5,4,0,0,4,4,0,0,0,4,...,2,3,4,0,4,0,0,5,0,0
1,3,0,0,0,3,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,5
2,4,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
3,3,0,0,0,0,0,5,0,0,4,...,5,0,0,0,0,0,2,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Load the movie-parameter .mat file; keys() lists the variables stored in it.
data2 = sio.loadmat(os.path.join(root_dir,'ex8_movieParams.mat'))
data2.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [7]:
# Show the shapes of the X and Theta arrays.
print(data2.get('X').shape,data2.get('Theta').shape)

(1682, 10) (943, 10)


In [8]:
# Wrap X and Theta in DataFrames for inspection and preview the first rows of X.
data_X = pd.DataFrame(data2.get('X'))
theta = pd.DataFrame(data2.get('Theta'))
data_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.048686,-0.400232,1.194119,0.371128,0.407607,0.974407,-0.05841,0.861721,-0.69729,0.288746
1,0.780851,-0.385626,0.521198,0.227355,0.570109,0.641264,-0.550006,0.704021,-0.485835,-0.564624
2,0.641509,-0.547854,-0.083796,-0.598519,-0.017694,0.299736,-0.720807,0.838546,-0.694832,-1.134796
3,0.453618,-0.800218,0.680481,-0.081743,0.136601,0.907561,0.277682,0.3693,-1.261208,-0.235581
4,0.937538,0.10609,0.361953,0.086646,0.287505,0.518644,-0.056871,0.914573,-0.819334,-0.542847


## Collaborative filtering cost function

In [9]:
## Both X and theta need to be updated, so it is simpler to pack them together.
def trans(X,theta):
    """Flatten ``X`` and ``theta`` and join them into a single 1-D array.

    The flattened elements of ``X`` come first, followed by the flattened
    elements of ``theta``.
    """
    flat_x = X.ravel()
    flat_theta = theta.ravel()
    return np.concatenate((flat_x, flat_theta))

def detrans(mat,n_movies,n_features,nums):
    """Split a flat parameter vector into two 2-D matrices.

    The first ``n_movies * n_features`` entries of ``mat`` are reshaped to
    ``(n_movies, n_features)``; everything after that is reshaped to
    ``(nums, n_features)``.  Returns the two matrices as a tuple, in that
    order.
    """
    split_at = n_movies * n_features
    movie_part = mat[:split_at]
    user_part = mat[split_at:]
    return (movie_part.reshape(n_movies, n_features),
            user_part.reshape(nums, n_features))


def cofiCostFun(params,Y,R,n_features):
    """Unregularized collaborative-filtering cost.

    ``params`` is the flat vector holding ``X`` followed by ``theta``; it is
    unpacked with ``detrans`` using the two dimensions of ``Y``.  The cost is
    half the sum of the squared prediction errors ``X @ theta.T - Y`` after
    they have been multiplied element-wise by ``R``.
    """
    n_movies = Y.shape[0]
    nums = Y.shape[1]
    X, theta = detrans(params, n_movies, n_features, nums)

    # Multiplying by R zeroes the error wherever R is 0.
    masked_err = (X @ theta.T - Y) * R

    return 0.5 * np.sum(masked_err ** 2)

def reg_cofiCostFun(params,Y,R,n_features,lamb):
    """Regularized collaborative-filtering cost.

    Adds ``0.5 * lamb * sum(params ** 2)`` (a penalty over every entry of the
    packed parameter vector) to the value of ``cofiCostFun``.
    """
    penalty = 0.5 * lamb * np.sum(params ** 2)
    unregularized = cofiCostFun(params, Y, R, n_features)
    return unregularized + penalty


def gradient(params,Y,R,n_features):
    """Gradient of the unregularized cost with respect to the packed parameters.

    ``params`` is unpacked into ``X`` and ``theta`` with ``detrans`` using the
    two dimensions of ``Y``.  The result is a flat vector laid out like
    ``params``: the gradient block for ``X`` first, then the block for
    ``theta`` (packed with ``trans``).
    """
    n_movies,nums = Y.shape[0],Y.shape[1]
    X,theta = detrans(params,n_movies,n_features,nums)

    # Masked prediction error, shape (n_movies, nums).  Computed once and
    # shared by both gradient blocks instead of being rebuilt for each.
    masked_err = np.multiply(X @ theta.T - Y, R)

    # shape (n_movies, n_features)
    X_grad = masked_err @ theta

    # shape (nums, n_features)
    theta_grad = masked_err.T @ X

    return trans(X_grad, theta_grad)


def reg_gradient(params,Y,R,n_features,lamb):
    """Gradient of the regularized cost.

    Takes the flat gradient returned by ``gradient`` and adds the
    regularization term ``lamb * params`` element-wise.
    """
    return gradient(params, Y, R, n_features) + lamb * params

In [10]:
# Evaluate the cost on a small subset of the data:
# num_users = 4, num_movies = 5, num_features = 3.
X_sub = data_X.values[:5,:3]
Y_sub = data_Y.values[:5,:4]
R_sub = data_R.values[:5,:4]
theta_sub = theta.values[:4,:3]

params = trans(X_sub,theta_sub)
cofiCostFun(params,Y_sub,R_sub,3)

22.224603725685675

In [11]:
# Pack the full-size X and Theta loaded from the parameter file.
params = trans(data_X.values,theta.values)

# Unregularized cost on the complete data set.
print(cofiCostFun(params,data_Y.values,data_R.values,n_features = 10))

# Regularized gradient on the complete data set with lamb = 1.
print(reg_gradient(params,data_Y.values,data_R.values,10,1))

27918.64012454421
[-5.21315594  2.0591285  -5.68148384 ... -5.27650042  4.22109195
  2.11819114]


In [12]:
# Build the list of movie titles: for each line of the file, drop the first
# space-separated token (presumably the movie id) and keep the rest.
movie_list = []

with open(os.path.join(root_dir,'movie_ids.txt'),encoding='latin-1') as f:
    movie_list.extend(line.strip().partition(' ')[2] for line in f)
movie_list[:10]

['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Twelve Monkeys (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'Richard III (1995)']

In [13]:
# Ratings for a new user: a 1682-long vector that is 0 everywhere except at
# the listed positions, which receive the paired rating values.
my_ratings = np.zeros(1682)
my_ratings[[1, 128, 176, 238, 271, 384, 408, 597, 872, 1297]] = \
    [3, 5, 2, 4, 3, 5, 1, 5, 2, 3]

In [14]:
# Prepend my_ratings as a new first column (index 0) of the ratings matrix.
Y = np.insert(data_Y.values,0,my_ratings,axis = 1)
Y.shape

(1682, 944)

In [15]:
# Prepend the matching indicator column: entries where my_ratings is non-zero
# are flagged as rated.
R = np.insert(data_R.values,0,my_ratings != 0, axis=1)
R.shape

(1682, 944)

In [16]:
# Random starting point for the optimizer: standard-normal matrices of shape
# (1682, 10) for X and (944, 10) for theta, drawn in that order.  This
# rebinds the name `theta`, which earlier held a DataFrame.
X, theta = (np.random.standard_normal((n_rows, 10)) for n_rows in (1682, 944))

X.shape, theta.shape

((1682, 10), (944, 10))

In [17]:
# Pack the random initial X and theta into one flat vector for the optimizer.
params = trans(X,theta)
# Center the ratings by subtracting a single scalar mean.
# NOTE(review): Y.mean() averages over every entry of Y, including the zeros
# at positions that R marks as unrated. A mean over rated entries only may be
# what is intended -- confirm. Whatever is subtracted here has to be added
# back when predictions are formed.
Y_norm = Y - Y.mean()

In [18]:
# Minimize the regularized cost with the TNC method, supplying the analytic
# gradient via `jac`.  `args` is passed on to both functions as
# (Y, R, n_features, lamb) = (Y_norm, R, 10, 5).
res = opt.minimize(fun = reg_cofiCostFun , 
                   x0 = params, 
                   args = (Y_norm,R,10,5),
                   method = 'TNC',
                   jac=reg_gradient)

In [19]:
# Display the optimizer result object.
res

     fun: 49240.51001489318
     jac: array([-1.36285205e-08,  5.07696067e-07,  4.97967938e-07, ...,
       -1.07270492e-06,  8.59392022e-07,  8.41116132e-08])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 1367
     nit: 51
  status: 1
 success: True
       x: array([-0.89811259, -0.51455151,  0.56158756, ...,  0.10985945,
       -1.03295854,  0.21509494])

In [20]:
# Unpack the optimized flat vector into X (1682 x 10) and theta (944 x 10).
X_final,theta_final = detrans(res.x,1682,10,944)
X_final.shape

(1682, 10)

In [21]:
# Predicted (mean-centered) rating for every row/column pair.
y_pred = X_final @ theta_final.T
# Column 0 is the user that was inserted at index 0 of Y; add back the scalar
# mean that was subtracted when Y_norm was built.
my_prediction = y_pred[:,0] + Y.mean()

idx = np.argsort(my_prediction)[::-1] ## indices ordered by predicted rating, highest first
# Show the ten highest-ranked indices and their predicted ratings.
print(idx[:10])
print(my_prediction[idx][:10])

[312  49  95 173 301 171 299 168 175 180]
[3.93471659 3.86491497 3.81692885 3.71916257 3.69541733 3.68800515
 3.68725836 3.64966028 3.64447157 3.62395749]


In [22]:
# Print the titles of the ten movies with the highest predicted rating.
for i in idx[:10]:
    print(movie_list[i])

Titanic (1997)
Star Wars (1977)
Terminator 2: Judgment Day (1991)
Raiders of the Lost Ark (1981)
L.A. Confidential (1997)
Empire Strikes Back, The (1980)
Air Force One (1997)
Wrong Trousers, The (1993)
Aliens (1986)
Return of the Jedi (1983)
