In [2]:
import math
import numpy as np
from numpy import linalg as LA
import pandas as pd

In [90]:
# Load the movie metadata table (movieId, imdbId, title, pipe-separated genres,
# image URL) and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer='movielens/movies_w_imgurl.csv')
movies.head(n=5)

Unnamed: 0,movieId,imdbId,title,genres,imgurl
0,1,114709,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://images-na.ssl-images-amazon.com/images...
1,2,113497,Jumanji (1995),Adventure|Children|Fantasy,https://images-na.ssl-images-amazon.com/images...
2,3,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,4,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,5,113041,Father of the Bride Part II (1995),Comedy,https://images-na.ssl-images-amazon.com/images...


## 장르 전처리 ("|"로 구분된 장르 문자열을 장르별 행으로 분리)

In [91]:
# Turn the pipe-separated `genres` string of every movie into one row per
# (movie, genre) pair. Each row keeps the row label of its movie in `movies`,
# so a movie with k genres contributes k rows sharing the same index label.
#
# `explode` replaces the former `.apply(pd.Series, 1).stack()` round trip, which
# materialised a wide temporary frame (one Series per movie) and passed a stray
# positional `1` into `Series.apply`. `dropna` mirrors `stack`, which discarded
# missing entries.
movieGenre = (
    movies['genres']
    .str.split('|')
    .explode()
    .dropna()
    .rename('genre')
    .to_frame()
)

In [92]:
# Number of (movie, genre) rows per genre, i.e. how many movies carry each genre.
movieCountdf = (
    movieGenre
    .groupby("genre")["genre"]
    .count()
    .to_frame(name="moviecount")
)
movieCountdf

Unnamed: 0_level_0,moviecount
genre,Unnamed: 1_level_1
(no genres listed),18
Action,1545
Adventure,1117
Animation,447
Children,583
Comedy,3315
Crime,1100
Documentary,495
Drama,4365
Fantasy,654


# tf-idf Matrix 생성

In [93]:
# Inverse document frequency per genre: idf = log10(N / count), where N is the
# total number of movies and count is the number of movies carrying the genre.
# Computed with one vectorised NumPy call instead of a Python lambda per row.
totalmovie = len(movies)
movieCountdf["idf"] = np.log10(totalmovie / movieCountdf["moviecount"])
movieCountdf

Unnamed: 0_level_0,moviecount,idf
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
(no genres listed),18,2.70496
Action,1545,0.771304
Adventure,1117,0.91218
Animation,447,1.309925
Children,583,1.194564
Comedy,3315,0.439749
Crime,1100,0.91884
Documentary,495,1.265628
Drama,4365,0.320249
Fantasy,654,1.144655


In [94]:
# Attach to every (movie, genre) row the idf weight of its genre, looked up in
# movieCountdf by genre name. The index still carries the movie's row label.
movieGenreWeights = movieGenre.assign(
    idf=movieGenre['genre'].map(movieCountdf['idf'])
)
movieGenreWeights

Unnamed: 0,genre,idf
0,Adventure,0.912180
0,Animation,1.309925
0,Children,1.194564
0,Comedy,0.439749
0,Fantasy,1.144655
...,...,...
9121,Fantasy,1.144655
9121,Sci-Fi,1.061508
9122,Documentary,1.265628
9123,Comedy,0.439749


In [110]:
# Build the movie x genre weight matrix: one row per movie, `movieId` first,
# then one column per genre (in movieCountdf order) holding the genre's idf when
# the movie carries that genre and 0 otherwise.
#
# A single group-and-unstack replaces the former loop that issued one join per
# genre and then filled the gaps in place.
genreIdfWide = (
    movieGenreWeights
    .groupby([movieGenreWeights.index, 'genre'])['idf']
    .first()              # one value per (movie row, genre), even if a genre is listed twice
    .unstack('genre')     # genres become columns
    .reindex(index=movies.index, columns=movieCountdf.index)
    .fillna(0.0)          # genre absent from the movie -> weight 0
    .rename_axis(columns=None)
)
movieWeight = movies[['movieId']].join(genreIdfWide)  # joined on the shared row labels

In [111]:
# Preview the movie x genre idf weight matrix built in the previous cell.
movieWeight

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,0.000000,0.91218,1.309925,1.194564,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,2,0.0,0.000000,0.91218,0.000000,1.194564,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,3,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.771304,0.000000,0.0,0.0,0.0
3,4,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.320249,...,0.0,0.0,0.0,0.0,0.0,0.771304,0.000000,0.0,0.0,0.0
4,5,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,162672,0.0,0.000000,0.91218,0.000000,0.000000,0.000000,0.0,0.000000,0.320249,...,0.0,0.0,0.0,0.0,0.0,0.771304,0.000000,0.0,0.0,0.0
9121,163056,0.0,0.771304,0.91218,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.061508,0.0,0.0,0.0
9122,163949,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,1.265628,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
9123,164977,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


Normalize each movie vector to unit L2 norm so that the cosine similarity between two movies reduces to the inner product of their vectors.


In [127]:
# Euclidean (L2) length of each movie's genre-weight vector. The first column
# of movieWeight (movieId) is left out of the computation.
movieNorms = pd.DataFrame(
    {"norm2": LA.norm(movieWeight.iloc[:, 1:].to_numpy(), ord=2, axis=1)},
    index=movieWeight.index,
)
movieNorms

Unnamed: 0,norm2
0,2.340636
1,1.889257
2,0.887857
3,0.943848
4,0.439749
...,...
9120,1.236746
9121,1.965710
9122,1.265628
9123,0.439749


In [130]:
# Divide every movie's genre-weight row by that movie's L2 norm (row-wise),
# again skipping the leading movieId column.
normalizedMovieWeights = (
    movieWeight
    .iloc[:, 1:]
    .div(movieNorms["norm2"], axis="index")
)

In [131]:
# Preview the row-normalized genre weights (one row per movie, one column per genre).
normalizedMovieWeights

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.00000,0.389715,0.559645,0.510359,0.187876,0.0,0.0,0.000000,0.489036,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.00000,0.482825,0.000000,0.632293,0.000000,0.0,0.0,0.000000,0.605876,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.0,0.00000,0.000000,0.000000,0.000000,0.495293,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.868726,0.000000,0.0,0.0,0.0
3,0.0,0.00000,0.000000,0.000000,0.000000,0.465911,0.0,0.0,0.339301,0.000000,0.0,0.0,0.0,0.0,0.0,0.817191,0.000000,0.0,0.0,0.0
4,0.0,0.00000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,0.0,0.00000,0.737564,0.000000,0.000000,0.000000,0.0,0.0,0.258944,0.000000,0.0,0.0,0.0,0.0,0.0,0.623656,0.000000,0.0,0.0,0.0
9121,0.0,0.39238,0.464046,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.582311,0.0,0.0,0.0,0.0,0.0,0.000000,0.540012,0.0,0.0,0.0
9122,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
9123,0.0,0.00000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


# Cosine Matrix 생성

In [134]:
# Pairwise similarity between movies: the product W @ W.T of the row-normalized
# genre weights, labelled by movieId on both axes.
#
# The product is taken on the raw ndarray. Handing the DataFrame and its
# transpose to np.matmul goes through pandas' ufunc handling, which emitted a
# warning in this notebook and, per the pandas release notes, aligns the labels
# of the two operands in newer releases -- that is not a matrix product.
unitGenreVectors = normalizedMovieWeights.to_numpy()
movieIdIndex = pd.Index(movieWeight['movieId'], name='movieId')

sims = pd.DataFrame(
    data=unitGenreVectors @ unitGenreVectors.T,
    index=movieIdIndex,
    columns=movieIdIndex,
)

sims

  sims = pd.DataFrame(data=np.matmul(normalizedMovieWeights, normalizedMovieWeights.T))


movieId,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.807155,0.093054,0.087534,0.187876,0.000000,0.093054,0.642140,0.00000,0.254643,...,0.000000,0.187658,0.000000,0.000000,0.000000,0.287439,0.465617,0.0,0.187876,0.0
2,0.807155,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.795559,0.00000,0.315482,...,0.000000,0.232493,0.000000,0.000000,0.000000,0.356114,0.576861,0.0,0.000000,0.0
3,0.093054,0.000000,1.000000,0.940678,0.495293,0.000000,1.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.634039,0.541786,0.000000,0.0,0.495293,0.0
4,0.087534,0.000000,0.940678,1.000000,0.465911,0.000000,0.940678,0.000000,0.00000,0.000000,...,0.084356,0.000000,0.339301,0.339301,0.596426,0.597506,0.000000,0.0,0.465911,0.0
5,0.187876,0.000000,0.495293,0.465911,1.000000,0.000000,0.495293,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,0.287439,0.356114,0.541786,0.597506,0.000000,0.000000,0.541786,0.447627,0.00000,0.481932,...,0.064378,0.355158,0.258944,0.258944,0.455175,1.000000,0.342264,0.0,0.000000,0.0
163056,0.465617,0.576861,0.000000,0.000000,0.000000,0.216114,0.000000,0.281629,0.39238,0.520001,...,0.000000,0.685812,0.000000,0.000000,0.000000,0.342264,1.000000,0.0,0.000000,0.0
163949,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,1.0
164977,0.187876,0.000000,0.495293,0.465911,1.000000,0.000000,0.495293,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.0


In [143]:
# Load the ratings and split them into train / test partitions according to the
# `type` column, keeping only the user, movie and rating columns.
ratings = pd.read_csv('ratings-9_1.csv')

ratingColumns = ['userId', 'movieId', 'rating']
train = ratings.loc[ratings['type'] == 'train', ratingColumns]
test = ratings.loc[ratings['type'] == 'test', ratingColumns]

In [147]:
# Target user: the cells below filter the train and test ratings by this id.
userId = 33

In [149]:
# Training ratings of the target user, and the 20 rows with the highest rating.
userRatings = train.loc[train['userId'] == userId, ['movieId', 'rating']]

topRatings = userRatings.sort_values(by='rating', ascending=False).iloc[:20]

topRatings

Unnamed: 0,movieId,rating
6313,5673,5.0
6190,1186,5.0
6293,4679,5.0
6225,2502,5.0
6211,1994,5.0
6242,3007,5.0
6280,4483,5.0
6278,4450,5.0
6195,1258,5.0
6265,3911,5.0


In [169]:
# Similarity between the 129 movies the user rated and all other movies
# (after the transpose: one row per movie, one column per rated movie).
sims.loc[userRatings['movieId'].to_numpy()].T.to_numpy()

array([[0.18787604, 0.18787604, 0.05646374, ..., 0.        , 0.        ,
        0.08753354],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.49529302, 0.49529302, 0.14885399, ..., 0.        , 0.        ,
        0.94067784],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [1.        , 1.        , 0.30053722, ..., 0.        , 0.        ,
        0.46591116],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ]])

In [157]:
# Ratings the user gave to those 129 movies, as a plain array.
userRatings['rating'].to_numpy()

array([3., 3., 1., 3., 4., 2., 4., 4., 4., 1., 4., 4., 4., 5., 2., 4., 4.,
       4., 5., 4., 4., 3., 2., 4., 3., 1., 4., 2., 3., 4., 1., 5., 4., 3.,
       2., 2., 4., 2., 2., 4., 4., 3., 4., 5., 1., 4., 4., 2., 1., 3., 4.,
       2., 4., 2., 1., 4., 4., 3., 4., 4., 5., 2., 2., 4., 2., 3., 3., 3.,
       2., 3., 4., 4., 2., 2., 3., 4., 3., 2., 4., 2., 3., 2., 5., 4., 4.,
       2., 4., 3., 1., 4., 4., 4., 3., 2., 2., 5., 2., 5., 2., 2., 4., 3.,
       4., 3., 3., 2., 4., 4., 4., 5., 4., 3., 3., 4., 4., 4., 4., 3., 3.,
       4., 3., 4., 4., 2., 4., 4., 4., 4., 5.])

In [152]:
# Predict a rating for every movie from the user's training ratings:
#   pred(i) = sum_j sim(i, j) * rating(j) / (1 + sum_j sim(i, j)),  j over rated movies.

# Similarity rows of the movies the user rated (rows: rated movies, columns: all movies).
ratedSims = sims.loc[userRatings['movieId'].to_numpy()]

# Denominator: per movie, the summed similarity to the rated movies, plus 1.
# NOTE(review): the +1 keeps the denominator non-zero but also pulls every
# prediction toward 0 -- confirm this smoothing is intended.
recSimSums = ratedSims.sum().to_numpy() + 1

# Numerator: per movie, the sum over rated movies of similarity * rating.
recWeightedRatingSums = ratedSims.T.to_numpy() @ userRatings['rating'].to_numpy()

recItemRatings = pd.DataFrame(
    {'pred': recWeightedRatingSums / recSimSums},
    index=sims.index,
)

recItemRatings


Unnamed: 0_level_0,pred
movieId,Unnamed: 1_level_1
1,2.989834
2,2.717214
3,3.213070
4,3.220458
5,3.216593
...,...
162672,3.068339
163056,2.750398
163949,2.672879
164977,3.216593


In [170]:
# The 30 movies with the highest predicted rating for the target user.
top30Movies = recItemRatings.sort_values('pred', ascending=False).iloc[:30]
top30Movies

Unnamed: 0_level_0,pred
movieId,Unnamed: 1_level_1
681,3.439725
3427,3.439725
2425,3.439725
4945,3.439725
2280,3.439725
1501,3.439725
1447,3.439725
4279,3.439725
59418,3.439725
108949,3.439725


# MAE, RMSE 계산

In [172]:
def getMAE(real, pred):
    """Return the mean absolute error between observed and predicted ratings.

    Args:
        real: pandas Series of observed ratings.
        pred: pandas Series of predicted ratings.

    Returns:
        The mean of ``|real - pred|``, computed with pandas' ``mean``.
    """
    return (real - pred).abs().mean()

def getRMSE(real, pred):
    """Return the root mean squared error between observed and predicted ratings.

    Args:
        real: pandas Series of observed ratings.
        pred: pandas Series of predicted ratings.

    Returns:
        The square root of the mean of ``(real - pred) ** 2`` as a float.
    """
    squaredErrors = (real - pred) ** 2
    return math.sqrt(squaredErrors.mean())

In [173]:
# Evaluate the predictions against the target user's held-out test ratings.
userTestRatings = test[test['userId'] == userId]

# Left-join the predicted rating onto every test row by movieId.
# recItemRatings is joined as a whole: pre-selecting rows with
# `.loc[userTestRatings['movieId']]` added nothing to the join, raised KeyError
# for a test movie without a prediction, and multiplied rows when a movieId was
# repeated. A movie without a prediction now gets NaN in `pred`.
temp = userTestRatings.join(recItemRatings, on='movieId')

mae = getMAE(temp['rating'], temp['pred'])
rmse = getRMSE(temp['rating'], temp['pred'])

print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

MAE : 0.9682
RMSE: 1.1347
