In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import scipy.sparse as sp
from scipy.sparse.linalg import svds


## 1. Loading the data set

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('/Users/zhaoxiangyu/Desktop/ml-100k/u.data', sep='\t', names=header)

In [3]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)

Number of users = 943 | Number of movies = 1682


##  2.Split the data into training and testing

In [4]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.20)



## 3.Fiiling in the matrix from traing and testing data

In [5]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

## 4.Running the model

In [6]:
#### Calculate and get the cosine similarity between users.
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')

In [7]:
#### Function to calculate mean of row data except "0". 
def non_zero_mean(x):
    return x[x != 0].mean()

#### Function to predict the target user's rating.
def predict(ratings, similarity, type):
    if type == 'user':
        mean_user_rating = np.apply_along_axis(non_zero_mean, axis = 1, arr = ratings)
       #### ratings_diff is the normalized matrix data
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        ratings_diff = np.where(ratings_diff==-mean_user_rating[:, np.newaxis],0,ratings_diff)
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [8]:
pred=predict(train_data_matrix, user_similarity, type='user')

#### This is the prediction result matrix from training matrix and similarity matrix.

In [9]:
pred

array([[ 3.71437318,  3.59241333,  3.58896429, ...,  3.61538462,
         3.61538462,  3.61511958],
       [ 3.8671127 ,  3.72609137,  3.72583341, ...,  3.75510204,
         3.75510204,  3.75465709],
       [ 2.89491306,  2.75171253,  2.74908764, ...,  2.7804878 ,
         2.7804878 ,  2.78005192],
       ..., 
       [ 4.16456666,  4.03897108,  4.03774203, ...,  4.06666667,
         4.06666667,  4.06625615],
       [ 4.44581943,  4.3118822 ,  4.30750243, ...,  4.33823529,
         4.33823529,  4.33783725],
       [ 3.54444169,  3.42147862,  3.41882179, ...,  3.44680851,
         3.44680851,  3.44651944]])

## 5.Testing the model and calculate the metrics.

In [10]:
test=np.where(test_data_matrix == 0, 0, pred)
test

array([[ 3.71437318,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
df2=pd.DataFrame(test)

In [12]:
#### Function to filling the predition results in the original table
#### from prediciton matrix
def add_pred(x):
    return df2.iloc[int(x[0]-1),int(x[1]-1)]
pred_col = df.apply(add_pred, axis = 1)

## 6.Recommend top 8 movies to users according to prediciton ratings.

In [13]:
df["pred"] = pred_col
df3=df[df['pred']>=4]
REC=df3.sort_values('pred',ascending=False).groupby('user_id').head(8)
(REC.sort_values('user_id')['rating']>=4).sum()

1230

#### Return the Precision of mdel with recommending top 8 movies

In [14]:
(REC.sort_values('user_id')['rating']>=4).sum()*1.0/len(REC.index)

0.83844580777096112

#### Return the table to compare prediction ratings and true ratings

In [15]:
REC.sort_values('user_id',ascending=True)

Unnamed: 0,user_id,item_id,rating,timestamp,pred
16305,4,50,5,892003526,4.576341
18930,4,354,5,892002353,4.195353
12151,4,294,5,892004409,4.049674
71055,4,362,5,892002352,4.191094
72875,7,427,5,891352220,4.075145
19720,7,515,3,891350757,4.074779
6324,7,168,5,891351509,4.083443
18292,7,480,4,891352093,4.050627
10422,7,210,4,891352904,4.060145
32677,7,64,5,891350756,4.147364


## 7.Regardless Top movies, just calculating overal  metrics.

#### This is precision:

In [16]:
pred_true=df.iloc[np.where(df['pred']>=4)[0],:]
len(np.where(pred_true['rating']>=4)[0])*1.0/pred_true.shape[0]

0.7999316005471956

#### This is recall:

In [17]:
h=df.iloc[np.where(df['pred']!=0)[0],:]
len(np.where(h['pred']>=4)[0])*1.0/len(np.where(h['rating']>=4)[0])

0.26715395157606214

## 8.For better performance, tring to run model only based on users who rated at least n movies.

In [18]:
df_by_user_id=df.groupby('user_id').size()
hot=df_by_user_id.sort_values(ascending=False)

In [25]:
def precision_least(i):
    dfplus=df_by_user_id.iloc[np.where(df_by_user_id>=i)[0]]
    df_least=df.iloc[np.where(df['user_id'].isin(dfplus.index))]    
    pred_true=df_least.iloc[np.where(df_least['pred']>=4)[0],:]
    return len(np.where(j['rating']>=4)[0])*1.0/pred_true.shape[0]

In [26]:
x=range(20,105,5)
map(precision_least,x)

[0.7999316005471956,
 0.8362531283518055,
 0.8596104373392135,
 0.887329286798179,
 0.9158183241973376,
 0.9341054313099042,
 0.9621554915672563,
 1.0099309153713298,
 1.0622161671207992,
 1.0904428904428904,
 1.1207474844274077,
 1.1619473422752111,
 1.2075374290139391,
 1.2246073298429319,
 1.2481323372465314,
 1.269815418023887,
 1.3045175683212493]

## 9.For better performance, tring to recommend only top N movies to users.

In [22]:
def precision_hot(i):
    REC=df3.sort_values('pred',ascending=False).groupby('user_id').head(i)
    return (REC.sort_values('user_id')['rating']>=4).sum()*1.0/len(REC.index)

In [23]:
x=range(1,16)
map(precision_hot,x)

[0.87142857142857144,
 0.861328125,
 0.8635724331926864,
 0.85365853658536583,
 0.85004686035613874,
 0.84305669679539852,
 0.84370370370370373,
 0.83844580777096112,
 0.83661792752701847,
 0.83263347330533888,
 0.83190883190883191,
 0.8316993464052288,
 0.82854155776267646,
 0.82661290322580649,
 0.82787286063569687]