In [1]:
import pandas as pd
import numpy as np
import datetime
import math

## Load data

In [2]:
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv('D:/Tsukuba/My Research/recommend/ml-1m/users.csv', sep='::', header=None, names=unames)

rating_names = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('D:/Tsukuba/My Research/recommend/ml-1m/ratings.csv', sep='::', header=None, names=rating_names)
ratings['timestamp'] = ratings.timestamp.map(datetime.datetime.utcfromtimestamp)

movie_names = ['movie_id', 'title', 'genres']
movies = pd.read_csv('D:/Tsukuba/My Research/recommend/ml-1m/movies.csv', sep='::', header=None, names=movie_names)

  
  """
  if __name__ == '__main__':


In [3]:
#user-movie statistics
n_user = ratings.user_id.nunique()
n_movie = ratings.movie_id.nunique()
print('number of user:', n_user, 'number of movie:', n_movie)

number of user: 6040 number of movie: 3706


In [4]:
rating_data = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
rating_data.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
rating_matrix = rating_data.as_matrix()
rating_user_mean = np.mean(rating_matrix, axis=1)
rating_dimen_reduc = rating_matrix-rating_user_mean.reshape(-1, 1)

In [6]:
sparsity = round(len(ratings) / float(n_user * n_movie), 3)
print('The sparsity between users and movies:', str(sparsity))

The sparsity between users and movies: 0.045


ユーザー－映画のまばらな行列から、4.5％ユーザーと映画のみインタラクティブを持っています。

#### Split train data and test data

In [7]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(rating_data, test_size=0.3)

In [8]:
train_matrix = train_data.as_matrix()
train_user_mean = np.mean(train_matrix, axis=1)
train_dimen_reduc = train_matrix - train_user_mean.reshape(-1, 1)

test_matrix = test_data.as_matrix()
test_user_mean = np.mean(test_matrix, axis=1)
test_dimen_reduc = test_matrix - test_user_mean.reshape(-1, 1)

## Item-based collaborative filtering recommendation

In [9]:
from sklearn.metrics.pairwise import pairwise_distances
item_similarity = pairwise_distances(rating_matrix.T, metric='cosine')
item_based_pred = rating_matrix.dot(item_similarity) / np.array([np.
                                                                 abs(item_similarity).sum(axis=1)])

In [10]:
from sklearn.metrics import mean_squared_error
item_based_train_pred = item_based_pred[train_matrix.nonzero()]
item_train_flatten = train_matrix[train_matrix.nonzero()]
item_RMSE_train = math.sqrt(mean_squared_error(item_based_train_pred, item_train_flatten))
print('RMSE of train data:', item_RMSE_train)

RMSE of train data: 3.61821947652495


In [11]:
item_based_test_pred = item_based_pred[test_matrix.nonzero()]
item_test_flatten = test_matrix[test_matrix.nonzero()]
item_RMSE_test = math.sqrt(mean_squared_error(item_based_test_pred, item_test_flatten))
print('RMSE of train data:', item_RMSE_test)

RMSE of train data: 3.6230731047414024


## User-based collaborative filtering recommendation

In [12]:
user_similarity = pairwise_distances(rating_matrix, metric='cosine')
mean_user_rating = rating_matrix.mean(axis=1)
ratings_diff = rating_matrix - mean_user_rating[:, np.newaxis]
user_based_pred = rating_user_mean[:, np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

In [13]:
user_based_train_pred = user_based_pred[train_matrix.nonzero()]
user_train_flatten = train_matrix[train_matrix.nonzero()]
user_RMSE_train = math.sqrt(mean_squared_error(user_based_train_pred, user_train_flatten))
print('RMSE of train data:', user_RMSE_train)

RMSE of train data: 3.264710895955358


In [14]:
user_based_test_pred = user_based_pred[test_matrix.nonzero()]
user_test_flatten = test_matrix[test_matrix.nonzero()]
user_RMSE_test = math.sqrt(mean_squared_error(user_based_test_pred, user_test_flatten))
print('RMSE of train data:', user_RMSE_test)

RMSE of train data: 3.2787315897767506


## SVD collaborative filering recommendation
make k=20

In [15]:
from scipy.sparse.linalg import svds
u, s, vt = svds(rating_matrix, k=20)
s_diag = np.diag(s)
svd_prediction = np.dot(np.dot(u, s_diag), vt)
all_user_prediction = svd_prediction + rating_user_mean.reshape(-1, 1)

In [16]:
prediction = pd.DataFrame(all_user_prediction, columns=rating_data.columns)
prediction.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,3.296091,0.751344,0.055482,0.045996,0.185475,-0.22309,-0.004782,0.227162,-0.040384,0.010916,...,0.04616,0.063587,0.099208,0.053815,-0.013648,0.362026,-0.014752,0.040499,0.056695,0.129149
1,1.400678,0.554055,0.231637,0.187057,0.227537,0.867737,0.224025,0.174578,0.314799,1.532033,...,0.083954,0.126731,0.125738,0.170212,0.092777,0.286712,-0.047011,0.106894,0.127939,0.162796
2,1.333436,0.21628,0.156412,-0.015515,0.032155,0.217351,-0.07027,0.066912,0.07608,0.605373,...,0.025174,0.051069,0.065831,0.057821,0.024025,0.28448,-0.073693,0.01231,0.034155,-0.085134
3,0.281878,-0.088827,0.043531,0.084201,0.048487,0.318924,0.017736,0.023997,0.001312,0.104057,...,0.039579,0.023396,0.020582,-0.000513,-0.002436,0.01465,0.070733,0.004227,0.038865,-0.060646
4,1.241467,0.283945,-0.042119,0.31074,-0.060107,1.607036,-0.097399,0.165202,0.110061,0.43371,...,0.252153,0.166115,0.154353,0.134328,0.179793,0.156213,0.661748,0.190263,0.242496,0.3369


#### Print recommendation of certain user

In [17]:
def recommendation(userid, movie, ratings, pred,  n_recommendation):
    user_index = userid - 1
    sorted_pre = prediction.iloc[user_index].sort_values(ascending=False)
    
    get_user = ratings[ratings.user_id == (user_index)]
    full_info = (get_user.merge(movies, how='left', left_on='movie_id', right_on='movie_id').
                 sort_values(['rating'], ascending=False))
    
    recommendation = (movies[~movies['movie_id'].isin(full_info['movie_id'])].
                      merge(pd.DataFrame(sorted_pre).reset_index(), how='left',
                           left_on='movie_id', right_on='movie_id').
                      rename(columns = {user_index: 'Predictions'}).
                      sort_values('Predictions', ascending = False).
                      iloc[:n_recommendation, :-1])
    
    return recommendation

In [18]:
recommendations = recommendation(1000, movies, ratings, prediction, 5)
recommendations.head()

Unnamed: 0,movie_id,title,genres
1063,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1110,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
0,1,Toy Story (1995),Animation|Children's|Comedy
1034,1200,Aliens (1986),Action|Sci-Fi|Thriller|War
394,480,Jurassic Park (1993),Action|Adventure|Sci-Fi


#### View prediction matrix value distribution

In [19]:
pd.Series(np.percentile(all_user_prediction, np.arange(0, 101, 10))).map('{:.2f}'.format)

0     -2.48
1      0.01
2      0.03
3      0.05
4      0.08
5      0.12
6      0.18
7      0.27
8      0.44
9      0.82
10    11.65
dtype: object

#### View train data matrix non-zero value distribution

In [20]:
pd.Series(np.percentile(train_matrix[train_matrix.nonzero()], np.arange(0, 101, 10))).map('{:.2f}'.format)

0     1.00
1     2.00
2     3.00
3     3.00
4     3.00
5     4.00
6     4.00
7     4.00
8     5.00
9     5.00
10    5.00
dtype: object

In [21]:
all_user_prediction[all_user_prediction < 0] = 0
all_user_prediction[all_user_prediction > 5] = 5

#### Predict train data - only take the scored data, and evaluate

In [22]:
prediction_train = all_user_prediction[train_matrix.nonzero()]
train_matrix_pre = train_matrix[train_matrix.nonzero()]
RMSE_train = math.sqrt(mean_squared_error(train_matrix_pre, prediction_train))
print('RMSE of train data:', RMSE_train)

RMSE of train data: 3.2045386013711923


#### Predict test data - only take the scored data, and evaluate

In [24]:
prediction_test = all_user_prediction[test_matrix.nonzero()]
test_matrix_pre = test_matrix[test_matrix.nonzero()]
RMSE_test = math.sqrt(mean_squared_error(test_matrix_pre, prediction_test))
print('RMSE of test data:', RMSE_test)

RMSE of test data: 3.2338381944601924
