In [1]:
import pandas as pd

In [2]:
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# '::' is a multi-character separator, which only the Python parser engine
# supports. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the default C engine.
users = pd.read_table('./data/ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
# '::' is a multi-character separator, which only the Python parser engine
# supports. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the default C engine.
ratings = pd.read_table('./data/ml-1m/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')

  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
mnames = ['movie_id', 'title', 'genres']
# '::' is a multi-character separator, which only the Python parser engine
# supports. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the default C engine.
movies = pd.read_table('./data/ml-1m/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# First five user records.
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
# First five rating records.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
# First five movie records.
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Attach user attributes to every rating, then the movie attributes.
# The join keys are the only columns each pair of tables has in common.
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

In [9]:
# Column dtypes and non-null counts of the merged frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
user_id       1000209 non-null int64
movie_id      1000209 non-null int64
rating        1000209 non-null int64
timestamp     1000209 non-null int64
gender        1000209 non-null object
age           1000209 non-null int64
occupation    1000209 non-null int64
zip           1000209 non-null object
title         1000209 non-null object
genres        1000209 non-null object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [10]:
# Summary statistics for the numeric columns of the merged frame.
data.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,occupation
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0,29.73831,8.036138
std,1728.413,1096.041,1.117102,12152560.0,11.75198,6.531336
min,1.0,1.0,1.0,956703900.0,1.0,0.0
25%,1506.0,1030.0,3.0,965302600.0,25.0,2.0
50%,3070.0,1835.0,4.0,973018000.0,25.0,7.0
75%,4476.0,2770.0,4.0,975220900.0,35.0,14.0
max,6040.0,3952.0,5.0,1046455000.0,56.0,20.0


In [None]:
# `data` also holds string columns (gender, zip, title, genres). Older pandas
# silently dropped them in corr(); pandas >= 2.0 raises instead. Selecting the
# numeric columns first gives the same table on every version.
data.select_dtypes(include='number').corr()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.matshow(data.corr())

In [None]:
# Show every field of the merged record with index label 0.
data.loc[0]

In [11]:
# Average rating of each title, split into one column per gender.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

In [12]:
# First five titles of the per-gender mean table.
mean_ratings.head()

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [None]:
# Number of ratings each title received.
ratings_by_title = data.groupby(by='title').size()

In [None]:
# First ten per-title rating counts.
ratings_by_title.head(10)

In [None]:
# Titles with at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index

In [None]:
# Display the index of titles that passed the rating-count threshold.
active_titles

In [None]:
# Restrict the per-gender means to the titles listed in active_titles.
mean_ratings = mean_ratings.loc[active_titles]

In [None]:
# Column dtypes and non-null counts of the filtered mean-rating table.
mean_ratings.info()

In [None]:
# Titles ordered from highest to lowest mean rating in the 'F' column.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)

In [None]:
# Ten titles with the highest mean rating in the 'F' column.
top_female_ratings.head(10)

In [None]:
# Absolute gap between the 'M' and 'F' mean rating of each title.
mean_ratings['diff'] = (mean_ratings['M'] - mean_ratings['F']).abs()

In [None]:
# Titles ordered from the largest to the smallest rating gap.
sorted_by_diff = mean_ratings.sort_values('diff', ascending=False)

In [None]:
# Fifteen titles with the largest rating gap.
sorted_by_diff.head(15)

Now let's try to build a recommender.

In [None]:
# These are the largest ids, not distinct counts: they size the rating
# matrices built further down, which are indexed by id - 1.
n_users = data['user_id'].max()
n_items = data['movie_id'].max()
print('Number of users = {} | Number of movies = {}'.format(n_users, n_items))

In [None]:
from sklearn import model_selection as cv
# Fix the seed so the 75/25 split, and every metric computed from it,
# is identical on each Restart & Run All.
train_data, test_data = cv.train_test_split(data, test_size=0.25, random_state=42)

In [None]:
# Column dtypes and non-null counts of the training split.
train_data.info()

In [None]:
# First five rows of the training split.
train_data.head()

In [None]:
# Summary statistics for the numeric columns of the training split.
train_data.describe()

In [None]:
import numpy as np


def _to_rating_matrix(frame, n_rows, n_cols):
    """Scatter a ratings frame into a dense (n_rows, n_cols) matrix.

    `frame` must provide `user_id`, `movie_id` and `rating` columns. Ids are
    shifted by one to become zero-based row/column positions; cells that have
    no rating stay 0.
    """
    matrix = np.zeros((n_rows, n_cols))
    # Columns are addressed by name, so the result does not depend on the
    # column order of `frame`, and the assignment is one vectorised scatter
    # instead of a Python-level loop over every row.
    rows = frame['user_id'].values - 1
    cols = frame['movie_id'].values - 1
    matrix[rows, cols] = frame['rating'].values
    return matrix


train_data_matrix = _to_rating_matrix(train_data, n_users, n_items)
test_data_matrix = _to_rating_matrix(test_data, n_users, n_items)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). These matrices are used as neighbour weights, so
# convert them to similarities; otherwise the least similar users/items would
# receive the largest weight.
user_similarity = 1.0 - pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = 1.0 - pairwise_distances(train_data_matrix.T, metric='cosine')

In [None]:
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted average of known ratings.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array laid out as ``ratings[user, item]``.
    similarity : numpy.ndarray
        Square weight matrix: user-by-user when ``type='user'``,
        item-by-item when ``type='item'``.
    type : {'user', 'item'}
        ``'user'`` adds the weighted average of the neighbours' deviations
        from their own row mean to each user's row mean. ``'item'`` takes the
        weighted average of the user's own ratings across items.

    Returns
    -------
    numpy.ndarray
        Predictions with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    The row mean in the user branch is taken over every column of the row,
    zero entries included.
    """
    if type not in ('user', 'item'):
        raise ValueError("type must be 'user' or 'item', got {!r}".format(type))

    weights = np.abs(similarity)

    if type == 'user':
        # Row u of `similarity` holds the weights used for user u.
        norm = weights.sum(axis=1)
        # An all-zero weight row also has a zero numerator; dividing by 1
        # yields 0 instead of NaN.
        norm = np.where(norm == 0, 1.0, norm)
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-user means into a column so that they
        # broadcast across the item axis.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weighted_diff = similarity.dot(ratings_diff) / norm[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + weighted_diff

    # Column j of `similarity` holds the weights used to predict item j, so
    # the normaliser is the column sum (equal to the row sum only when the
    # matrix is symmetric).
    norm = weights.sum(axis=0)
    norm = np.where(norm == 0, 1.0, norm)
    return ratings.dot(similarity) / norm[np.newaxis, :]

In [None]:
# Score the training matrix with both neighbourhood models.
item_prediction = predict(ratings=train_data_matrix, similarity=item_similarity, type='item')
user_prediction = predict(ratings=train_data_matrix, similarity=user_similarity, type='user')

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Root-mean-square error over the nonzero entries of ``ground_truth``.

    Parameters
    ----------
    prediction : array-like
        Predicted values, same shape as ``ground_truth``.
    ground_truth : array-like
        Reference values; only its nonzero cells are scored.

    Returns
    -------
    float
        Square root of the mean squared difference on the scored cells.

    Raises
    ------
    ValueError
        If the shapes differ or ``ground_truth`` has no nonzero entry.
    """
    prediction = np.asarray(prediction)
    ground_truth = np.asarray(ground_truth)
    if prediction.shape != ground_truth.shape:
        raise ValueError(
            'prediction and ground_truth must have the same shape, got '
            '{} and {}'.format(prediction.shape, ground_truth.shape)
        )

    # Compute the mask once and reuse it for both arrays.
    observed = ground_truth.nonzero()
    if observed[0].size == 0:
        raise ValueError('ground_truth has no nonzero entries to score')

    errors = prediction[observed] - ground_truth[observed]
    return sqrt(float(np.mean(errors ** 2)))

In [None]:
# Report the held-out error of both neighbourhood models.
user_rmse = rmse(user_prediction, test_data_matrix)
print('User-based CF RMSE: {}'.format(user_rmse))
item_rmse = rmse(item_prediction, test_data_matrix)
print('Item-based CF RMSE: {}'.format(item_rmse))