In [1]:
%autosave 150
%matplotlib inline
import pandas as pd
import numpy as np
import math
import matplotlib.pylab as plt

Autosaving every 150 seconds


In [14]:
def load_movielens_100k():
    # Load users
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols)

    # load ratings
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols)

    # Load movies
    m_cols = ['movie_id', 'title', 'release_date']
    movies = pd.read_csv('data/ml-100k/u.item', sep='|', names=m_cols, usecols=range(3), encoding='latin-1')

    # Join dataframes
    data = pd.merge(pd.merge(ratings, users), movies)
    data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


    print("The DB has "+ str(data.shape[0]) +" ratings")
    print("The DB has ", data.user_id.nunique()," users")
    print("The DB has ", data.movie_id.nunique(), " movies")
    print(data.head())
    
    return data

def load_movielens_1M():
    # Load users
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('data/ml-1m/users.dat', sep='::', names=u_cols)

    # Load ratings
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('data/ml-1m/ratings.dat', sep='::', names=r_cols)

    # Load movies
    m_cols = ['movie_id', 'title', 'release_date']
    movies = pd.read_csv('data/ml-1m/movies.dat', sep='::', names=m_cols, usecols=range(3), encoding='latin-1')

    # Join dataframes
    data = pd.merge(pd.merge(ratings, users), movies)
    data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


    print("The DB has "+ str(data.shape[0]) +" ratings")
    print("The DB has ", data.user_id.nunique()," users")
    print("The DB has ", data.movie_id.nunique(), " movies")
    print(data.head())
    
    return data

# Item based

In [19]:
data = load_movielens_100k()

The DB has 100000 ratings
The DB has  943  users
The DB has  1682  movies
   user_id         title  movie_id  rating release_date sex  age
0      196  Kolya (1996)       242       3  24-Jan-1997   M   49
1      305  Kolya (1996)       242       5  24-Jan-1997   M   23
2        6  Kolya (1996)       242       4  24-Jan-1997   M   42
3      234  Kolya (1996)       242       4  24-Jan-1997   M   60
4       63  Kolya (1996)       242       3  24-Jan-1997   M   31


In [20]:
user_item_matrix = data.pivot(index='user_id', columns='movie_id', values='rating')
item_user_matrix = data.pivot(index='movie_id', columns='user_id', values='rating')

In [21]:
#user_item_matrix = user_item_matrix.fillna(user_item_matrix.mean())
#item_user_matrix = item_user_matrix.fillna(item_user_matrix.mean())

In [22]:
def slow_similarity(user_item_matrix, u1, u2, measure="cosine"):
    def dot(m, u1, u2):
        return m.loc[u1].dot(m.loc[u2])
    
    if measure == 'cosine':
        return dot(user_item_matrix, u1, u2)/(np.sqrt(dot(user_item_matrix, u1, u1)*dot(user_item_matrix, u2, u2)))
    
    elif measure == 'adj_cosine':
        mr1 = user_item_matrix.loc[u1].mean()
        mr2 = user_item_matrix.loc[u2].mean()
        common_movies = user_item_matrix.loc[[u1,u2]].isna().sum(axis=0)==0
        common_matrix = user_item_matrix.T[common_movies].T
        return dot(common_matrix, u1, u2)/(np.sqrt(dot(common_matrix, u1, u1)*dot(common_matrix, u2, u2)))
    else:
        print("Method not implemented")
        
def fast_similarity(user_item_matrix, measure="cosine"):
    if measure == 'cosine':
        m = user_item_matrix.dot(user_item_matrix.T)
        norm = np.array([np.sqrt(np.diagonal(m))])
        return (m/norm/norm.T)
    
    elif measure == 'adj_cosine':
        m = user_item_matrix - user_item_matrix.mean()
        m = m.dot(m.T)
        norm = np.array([np.sqrt(np.diagonal(m))])
        return (m/norm/norm.T)

In [23]:
slow_similarity(user_item_matrix.fillna(user_item_matrix.mean()), 10, 11, "cosine")

0.9936685597594792

In [28]:
sim_matrix = fast_similarity(user_item_matrix.fillna(user_item_matrix.mean()), 'cosine')
sim_matrix.loc[10][11]

0.9936685597594788

In [49]:
def content_based_predict(user_id, item_id, df_matrix, cosine_similarities, banned_list=[]):
    if user_id in banned_list:
        return 0
    mean_item = df_matrix[item_id].mean()
    items_user = df_matrix.loc[user_id][df_matrix.loc[user_id] != 0]
    candidate_items = items_user.index.tolist()
    candidate_ratings = items_user.values
    sims = np.array([cosine_similarities[item_id][item_j] for item_j in candidate_items])
    candidate_means = np.array([df_matrix[item_j].mean() for item_j in candidate_items])
    
    if np.sum(sims) <= 0.0001:
        return mean_item
    else:
        return mean_item + np.sum(np.dot(sims, (candidate_ratings-candidate_means)))/np.sum(sims)

In [51]:
content_based_predict(10, 2, user_item_matrix.fillna(0), sim_matrix)

3.985507097355107

# Factorization

In [15]:
data = load_movielens_1M()



The DB has 1000209 ratings
The DB has  6040  users
The DB has  3706  movies
   user_id                                   title  movie_id  rating  \
0        1  One Flew Over the Cuckoo's Nest (1975)      1193       5   
1        2  One Flew Over the Cuckoo's Nest (1975)      1193       5   
2       12  One Flew Over the Cuckoo's Nest (1975)      1193       4   
3       15  One Flew Over the Cuckoo's Nest (1975)      1193       4   
4       17  One Flew Over the Cuckoo's Nest (1975)      1193       5   

  release_date  sex age  
0        Drama    1   F  
1        Drama   56   M  
2        Drama   25   M  
3        Drama   25   M  
4        Drama   50   M  


In [16]:
user_item_matrix = data.pivot(index='user_id', columns='movie_id', values='rating')

In [17]:
from sklearn.decomposition import PCA

In [18]:
pca = PCA(n_components=)
pca.