In [176]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
from surprise import KNNBasic


In [9]:
PATH_MOIVES='./ml-1m/movies.txt'
PATH_RATINGS='./ml-1m/ratings.txt'
PATH_USERS='./ml-1m/users.txt'
data_movie = pd.read_csv(PATH_MOIVES, sep='::', header=None, names=['MovieID','MovieName','genre'],engine='python')
data_ratings = pd.read_csv(PATH_RATINGS, sep='::', header=None, names=['UserID','MovieID','Rating','Timestamp'],engine='python')
data_users = pd.read_csv(PATH_USERS, sep='::', header=None, names=['UserID','Gender','Age','Occupation','Zip-code'],engine='python')


In [16]:
##hybrid algorithm, content based similarity and model based cf 
def hybridrs_itembased(user_id,movie_id,n_recs,df,cosine_sim,svd_model):
    ##sort similarity, calculate the 30 most similiar movies
    sim=list(enumerate(cosine_sim[int(movie_id)]))
    sim=sorted(sim,key=lambda x: x[1],reverse=True)
    sim=sim[1:30]
    
    ##get movie metadata
    movie_idx=[i[0] for i in sim]
    movies=df.iloc[movie_idx][['MovieID','MovieName','genre','Rating']]
    
    ##predict the movies ratings generates above, with svd
    movies['est'] = movies.apply(lambda x: svd_model.predict(user_id, x['MovieID'], x['Rating']).est, axis = 1)
        
    ##sort predictions, give recommendation ranked by predict rating
    movies=movies.sort_values('est',ascending=False)
    return movies.head(n_recs)

In [174]:
if __name__=='__main__':
##reading data

    PATH_MOIVES='./ml-1m/movies.txt'
    PATH_RATINGS='./ml-1m/ratings.txt'
    PATH_USERS='./ml-1m/users.txt'
    
    data_movie = pd.read_csv(PATH_MOIVES, sep='::', header=None, names=['MovieID','MovieName','genre'],engine='python')
    data_ratings = pd.read_csv(PATH_RATINGS, sep='::', header=None, names=['UserID','MovieID','Rating','Timestamp'],engine='python')
    data_users = pd.read_csv(PATH_USERS, sep='::', header=None, names=['UserID','Gender','Age','Occupation','Zip-code'],engine='python')

    ###merge tables
    data_ratings=data_ratings.merge(data_movie,how='left',on='MovieID').fillna(0)
    
    ###read data
    table=data_ratings.pivot_table(
        columns='MovieID',
        index='UserID',
        values='Rating'
        
    ).fillna(0)
    
    ###compute cos similarity
    cosine_sim=cosine_similarity(table,table)
    cosine_sim = pd.DataFrame(cosine_sim,  index=table.index, columns=table.index)

###generate dataset from data_ratings, 
    reader=Reader()
    data=Dataset.load_from_df(data_ratings[['UserID','MovieID','Rating']],reader)
    
    ##split dataset
    trainset,testset=train_test_split(data, test_size=0.2,random_state=0)

##train model
    svd=SVD(n_factors=20,n_epochs=15,biased=True)
    svd.fit(trainset)

##predict
    test_pred=svd.test(testset)

## get rmse
    accuracy.rmse(test_pred, verbose=True)
##evaluate 
    hybrid_svd=cross_validate(svd, data, measures=['rmse'], cv=10, verbose=True)
## generate recommedationf
    ##example
    u_id=data_ratings['UserID'].values[0]
    m_id=data_ratings['MovieID'].values[0]
    
    recs=5
    rec_pd=hybridrs_itembased(u_id,m_id,recs,data_ratings,cosine_sim,svd)
    print(rec_pd)

RMSE: 0.8831
Evaluating RMSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8755  0.8703  0.8763  0.8724  0.8770  0.8714  0.8759  0.8730  0.8772  0.8784  0.8747  0.0026  
Fit time          12.50   12.38   12.50   12.47   12.41   12.43   12.54   12.44   12.39   12.47   12.45   0.05    
Test time         0.54    0.88    0.87    0.52    0.87    0.52    0.88    0.88    0.52    0.91    0.74    0.18    
      MovieID                                  MovieName  \
201       260  Star Wars: Episode IV - A New Hope (1977)   
3557      356                        Forrest Gump (1994)   
5346     1307             When Harry Met Sally... (1989)   
4389      903                             Vertigo (1958)   
3899     1537   Shall We Dance? (Shall We Dansu?) (1996)   

                                genre  Rating       est  
201   Action|Adventure|Fantasy|Sci-Fi       5  4.549004  


In [210]:
data_ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieName,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy
...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama


In [211]:
###select genre, then genarate several movie in this genre for users to rate
def generate_movie(prefer_genres,data_movie):
    ##generate movies for rating
    rec_df=pd.DataFrame(columns=['MovieID','MovieName','genre','Ratings'])
    for genre in prefer_genres:
        pre_df=data_movie[data_movie['genre'].str.contains(genre)]
        pre_df=pre_df.sample(n=5,axis=0)
        rec_df=rec_df.append(pre_df)
    return rec_df

    

In [225]:
prefer_genres=['Action','Adventure']
user=generate_movie(prefer_genres,data_movie)
user.loc[:,'Ratings']=5
user

  rec_df=rec_df.append(pre_df)
  rec_df=rec_df.append(pre_df)


Unnamed: 0,MovieID,MovieName,genre,Ratings
1950,2019,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama,5
1408,1432,Metro (1997),Action,5
1563,1604,Money Talks (1997),Action|Comedy,5
647,653,Dragonheart (1996),Action|Adventure|Fantasy,5
1491,1527,"Fifth Element, The (1997)",Action|Sci-Fi,5
588,592,Batman (1989),Action|Adventure|Crime|Drama,5
2468,2537,Beyond the Poseidon Adventure (1979),Adventure,5
2104,2173,"Navigator: A Mediaeval Odyssey, The (1988)",Adventure|Fantasy|Sci-Fi,5
2802,2871,Deliverance (1972),Adventure|Thriller,5
124,126,"NeverEnding Story III, The (1994)",Adventure|Children's|Fantasy,5


In [226]:
rec_pd=hybridrs_itembased(1505,1544,recs,data_ratings,cosine_sim,svd)
rec_pd

Unnamed: 0,MovieID,MovieName,genre,Rating,est
1203,2997,Being John Malkovich (1999),Comedy,4,4.588538
2934,904,Rear Window (1954),Mystery|Thriller,3,4.560063
3850,1266,Unforgiven (1992),Western,3,4.379388
4131,3396,"Muppet Movie, The (1979)",Children's|Comedy,4,4.03097
603,1411,Hamlet (1996),Drama,5,4.013914


### basemodel

In [129]:

###use a random prediction as the base model 

algo=NormalPredictor()

algo.fit(trainset)
test_pred=algo.test(testset)

accuracy.rmse(test_pred, verbose=True)


RMSE: 1.5064


1.5063534244900583

In [131]:
base_model = cross_validate(algo, data, measures=['rmse'], cv=10, verbose=True)

Evaluating RMSE of algorithm NormalPredictor on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.4657  1.5452  1.4776  1.4591  1.4995  1.3951  1.5533  1.4535  1.6671  1.6228  1.5139  0.0790  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


### model based SVD

In [167]:
###test the significance of content based algo

##without content based method, randonly sample 30 movies 
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV


SVD_param_grid = {
    'n_factors':[5,10,15,20,25,30],
    'n_epochs':[5,10,15,20,25,30],
    'biased':[True]
    
}

In [168]:
SVD_gs = GridSearchCV(SVD,SVD_param_grid , measures=['rmse'], cv=gs_cv)
SVD_gs.fit(data)

In [169]:
SVD_based_algo = SVD_gs.best_estimator['rmse']

In [170]:
SVD_gs.best_params['rmse']

{'n_factors': 20, 'n_epochs': 15, 'biased': True}

In [171]:
eval_cv = 10
SVD_best_result_base = cross_validate(SVD_based_algo, data, measures=['RMSE'], cv=eval_cv, verbose=True)

Evaluating RMSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.1196  1.0635  1.1660  1.1292  1.1840  1.1158  1.1913  1.1317  1.0823  1.1065  1.1290  0.0394  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


### content based

In [181]:
##grid search
param_grid={
    'k':[10,20,30,40],
    'sim_options':{
        'name':['cosine'],
        'user_based':[True,False]
        
    }
}
cf_gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)

cf_gs.fit(data)
cf_result=cf_gs.cv_results

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [182]:
cf_algo = cf_gs.best_estimator['rmse']

In [195]:
cf_gs.best_params['rmse']

{'k': 40, 'sim_options': {'name': 'cosine', 'user_based': False}}

In [183]:
eval_cv = 10
cf_best = cross_validate(cf_algo, data, measures=['RMSE'], cv=eval_cv, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8868  0.8909  0.8879  0.8936  0.8878  0.8889  

### user based cf

In [230]:
###user based knn
data_ratings=data_ratings.merge(data_users,how='left',on='UserID').fillna(0)
reader=Reader()
data=Dataset.load_from_df(data_ratings[['UserID','MovieID','Rating']],reader)
param_grid={
    'k':[10,20,30],
    'sim_options':{
        'name':['cosine'],
        'user_based':[True]
        
    }
}
cf_gs_u = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)

cf_gs_u.fit(data)
cf_result_u=cf_gs_u.cv_results


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [231]:
cf_algo = cf_gs.best_estimator['rmse']

In [232]:
cf_gs.best_params['rmse']

{'k': 30, 'sim_options': {'name': 'cosine', 'user_based': True}}

In [233]:
eval_cv = 10
cf_best_u = cross_validate(cf_algo, data, measures=['RMSE'], cv=eval_cv, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.9422  0.9427  0.9443  0.9458  0.9400  0.9394  

### T test

In [172]:
##test the significance of base model and biased svd
from scipy.stats import ttest_ind

base = base_model['test_rmse']
svd = SVD_best_result_base['test_rmse']

ttest_ind(base,svd)

Ttest_indResult(statistic=13.073558062794303, pvalue=1.2553110720081454e-10)

In [235]:
##test the significanve of svd only and the hybrid one
svd = SVD_best_result_base['test_rmse']
hybrid = hybrid_svd['test_rmse']

ttest_ind(svd,hybrid)

Ttest_indResult(statistic=19.317976982761163, pvalue=1.75179267479118e-13)

In [186]:
##test the significanve of svd only and the cf
svd = SVD_best_result_base['test_rmse']
cf = cf_best['test_rmse']

ttest_ind(svd,cf)

Ttest_indResult(statistic=18.18257470567873, pvalue=4.955077912190384e-13)

In [187]:
##test the significanve of hybrid and the cf
hybrid = hybrid_svd['test_rmse']
cf = cf_best['test_rmse']

ttest_ind(cf,hybrid)

Ttest_indResult(statistic=13.018565011804215, pvalue=1.3449562671998268e-10)

In [None]:
##user based and hybrid

In [234]:
hybrid = hybrid_svd['test_rmse']
cf = cf_best_u['test_rmse']

ttest_ind(cf,hybrid)

Ttest_indResult(statistic=60.2540746748825, pvalue=3.218158509364976e-22)

In [None]:
##hybrid and baseline

In [236]:
ttest_ind(base,hybrid)

Ttest_indResult(statistic=24.244742610584954, pvalue=3.39314596555365e-15)