In [113]:
import pandas as pd
import numpy as np

In [114]:
Movies = pd.read_csv('C:/movielens_movie_titles.csv', sep = '|', nrows=5000, encoding='utf-8')
Ratings = pd.read_csv('C:/movielens_movie_ratings.csv', sep = '|', nrows=5000, encoding='utf-8')  

In [115]:
Movies_Ratings_table = pd.merge(Ratings, Movies, on='movieId')
Movies_Ratings_table

Unnamed: 0,userId,movieId,rating,title
0,1,8844,3.5,Jumanji (1995)
1,5,8844,3.0,Jumanji (1995)
2,13,8844,3.0,Jumanji (1995)
3,29,8844,3.0,Jumanji (1995)
4,34,8844,3.0,Jumanji (1995)
...,...,...,...,...
4144,48,659,4.0,"Tin Drum, The (Blechtrommel, Die) (1979)"
4145,48,10705,4.0,Henry V (1989)
4146,49,10061,2.5,Escape from L.A. (1996)
4147,50,76,5.0,Before Sunrise (1995)


In [116]:
fields = ['movieId', 'title', 'rating']
inputMovies = Movies_Ratings_table[fields].loc[Movies_Ratings_table['userId'] == 1]
inputMovies

Unnamed: 0,movieId,title,rating
0,8844,Jumanji (1995),3.5
5,902,"City of Lost Children, The (Cité des enfants p...",3.5
7,63,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),3.5
23,807,Seven (a.k.a. Se7en) (1995),3.5
40,629,"Usual Suspects, The (1995)",3.5
...,...,...,...
690,1649,Bill & Ted's Bogus Journey (1991),3.5
691,120,"Lord of the Rings: The Fellowship of the Ring,...",5.0
701,6312,"Brotherhood of the Wolf (Pacte des loups, Le) ...",4.0
702,848,Dragonslayer (1981),4.0


In [117]:
userSubset = Ratings[Ratings['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset

Unnamed: 0,userId,movieId,rating
0,1,8844,3.5
1,1,902,3.5
2,1,63,3.5
3,1,807,3.5
4,1,629,3.5
...,...,...,...
4988,50,274,4.0
4994,50,62,5.0
4995,50,583,5.0
4997,50,601,4.0


In [118]:
userSubsetGroup = userSubset.groupby(['userId']) 

In [119]:
#usuario de ejemplo para probar
userSubsetGroup.get_group(50)

Unnamed: 0,userId,movieId,rating
4969,50,63,4.0
4971,50,807,4.0
4974,50,680,5.0
4976,50,278,4.0
4984,50,78,4.0
4986,50,280,4.5
4988,50,274,4.0
4994,50,62,5.0
4995,50,583,5.0
4997,50,601,4.0


In [120]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [121]:
#Diccionario key:userId value:PearsonCoeffficient
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:
    
    #Ordenación
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #print(group)
    #print(inputMovies)
    
    #Obtengo el numero total de peliculas similares vistas 
    nRatings = len(group)
    
    #Obtego las puntuaciones de las peliculas que ambos usuarios han valorado 
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    
    #Creo otra lista con los ratings del grupo actual
    tempGroupList = group['rating'].tolist()

    #formula del coeficiente de correlacion de Pearson
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [122]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,1.0,1
1,0.200259,24
2,0.178043,11
3,0.225312,3
4,-0.04773,21


In [123]:
#Obtengo los usuarios que son mas similares al mío, en este caso ignoraré la posición 1 porque será el propio usuario
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[1:10]
topUsers.head()

Unnamed: 0,similarityIndex,userId
43,0.755929,41
37,0.688247,28
39,0.648886,39
13,0.58345,29
22,0.547723,13


In [124]:
topUsersRating = topUsers.merge(Ratings, left_on='userId', right_on='userId', how='inner')
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.755929,41,31357,2.0,1.511858
1,0.755929,41,11010,5.0,3.779645
2,0.755929,41,8447,4.0,3.023716
3,0.755929,41,649,4.0,3.023716
4,0.755929,41,10997,3.0,2.267787


In [125]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
11,1.645323,7.697856
13,2.89313,11.629981
14,0.755929,3.779645
15,0.755929,3.023716
28,0.528761,2.115042


In [126]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
11,4.678628,11
13,4.019862,13
14,5.0,14
15,4.0,15
28,4.0,28


In [127]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
Movies.loc[Movies['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title
173,9071,Living in Oblivion (1995)
203,77350,Unzipped (1995)
314,278,"Shawshank Redemption, The (1994)"
367,2788,Reality Bites (1994)
423,1607,"Bronx Tale, A (1993)"
426,6075,Carlito's Way (1993)
660,3065,Mystery Science Theater 3000: The Movie (1996)
1064,601,E.T. the Extra-Terrestrial (1982)
1185,9549,"Right Stuff, The (1983)"
1342,9390,Jerry Maguire (1996)
