In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import scipy.sparse as sp
from scipy.sparse.linalg import svds


In [3]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('/Users/zhaoxiangyu/Desktop/ml-100k/u.data', sep='\t', names=header)

In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)

Number of users = 943 | Number of movies = 1682


In [5]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.20)



In [6]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [7]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [8]:
def non_zero_mean(x):
    return x[x != 0].mean()
def predict(ratings, similarity, type):
    if type == 'user':
        mean_user_rating = np.apply_along_axis(non_zero_mean, axis = 1, arr = ratings)
        #mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [9]:
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [10]:
pred=predict(train_data_matrix, user_similarity, type='user')

In [11]:
pred

array([[ 1.47798447,  0.3110531 ,  0.20422698, ...,  0.01809297,
         0.01784591,  0.01790809],
       [ 1.60461455,  0.40407597,  0.23729094, ...,  0.03065379,
         0.03200895,  0.03210106],
       [ 0.79861114, -0.46185407, -0.61641075, ..., -0.82961005,
        -0.8277208 , -0.8276581 ],
       ..., 
       [ 1.8831035 ,  0.70752451,  0.55159061, ...,  0.35496641,
         0.35575993,  0.35578336],
       [ 2.22231115,  1.00401855,  0.87590369, ...,  0.66578633,
         0.66664395,  0.66698429],
       [ 1.30755958,  0.11242131,  0.01654066, ..., -0.16902753,
        -0.16921396, -0.1690433 ]])

In [12]:
test=np.where(train_data_matrix != 0, 0, pred)
test

array([[ 0.        ,  0.        ,  0.20422698, ...,  0.01809297,
         0.01784591,  0.01790809],
       [ 0.        ,  0.40407597,  0.23729094, ...,  0.03065379,
         0.03200895,  0.03210106],
       [ 0.79861114, -0.46185407, -0.61641075, ..., -0.82961005,
        -0.8277208 , -0.8276581 ],
       ..., 
       [ 1.8831035 ,  0.70752451,  0.55159061, ...,  0.35496641,
         0.35575993,  0.35578336],
       [ 2.22231115,  1.00401855,  0.87590369, ...,  0.66578633,
         0.66664395,  0.66698429],
       [ 1.30755958,  0.        ,  0.01654066, ..., -0.16902753,
        -0.16921396, -0.1690433 ]])

In [13]:
df2=pd.DataFrame(test)

In [14]:
k=-np.sort(-test[2])[2]

In [15]:
def add_pred(x):
    return df2.iloc[int(x[0]-1),int(x[1]-1)]
pred_col = df.apply(add_pred, axis = 1)

In [16]:
#for index in df.index.values:
#    df.iloc[index, 4]=df2.iloc[df.iloc[index, 0]-1,df.iloc[index, 1]-1]

In [17]:
df["pred"] = pred_col
df.head(20)

Unnamed: 0,user_id,item_id,rating,timestamp,pred
0,196,242,3,881250949,0.0
1,186,302,3,891717742,0.0
2,22,377,1,878887116,0.0
3,244,51,2,880606923,0.0
4,166,346,1,886397596,0.0
5,298,474,4,884182806,0.0
6,115,265,2,881171488,1.041836
7,253,465,5,891628467,0.0
8,305,451,3,886324817,0.0
9,6,86,3,883603013,0.0


In [18]:
df3=df[df['pred']!=0]

In [19]:
REC=df3.sort_values('pred',ascending=False).groupby('user_id').head(3)

In [20]:
(REC.sort_values('user_id')['rating']>=3).sum()

2448

In [21]:
def cor(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return np.corrcoef(prediction, ground_truth)

In [22]:
user_prediction[test_data_matrix.nonzero()]

array([ 0.20422698,  0.21818191,  0.49462169, ...,  0.04845426,
        0.13707619, -0.08274886])

In [23]:
tttt=REC[REC['pred']>2]

In [55]:
(REC.sort_values('user_id')['rating']>=3).sum()*1.0/len(REC.index)

0.87584973166368518

In [25]:
(tttt.sort_values('user_id')['rating']>=4).sum()*1.0/len(tttt.index)

0.84756097560975607

In [26]:
REC.sort_values('user_id',ascending=True)

Unnamed: 0,user_id,item_id,rating,timestamp,pred
4177,1,174,5,875073198,1.400481
55001,1,79,4,875072865,1.057973
4290,1,56,4,875072716,1.249666
1052,2,50,5,888552084,2.229399
55790,2,100,5,888552084,1.784006
59514,2,127,5,888552084,1.568215
37734,3,302,2,889236939,0.228228
31937,3,258,2,889237026,0.768839
37188,3,181,4,889237482,0.900573
2526,4,357,4,892003525,1.651621


In [52]:
#pd.merge(df,ave_rating,how='left',on='user_id')

In [53]:
#pd.DataFrame(user_id = range(943), rating_average= df.groupby('user_id')['rating'].mean())