In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 필요 데이터 로드
path='.\\data\\'
ratings_df = pd.read_csv(path+ 'ratings.csv', encoding='utf-8')
movies_df = pd.read_csv(path+ 'movies.csv', index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(path+ 'tags.csv', encoding='utf-8')

In [3]:
ratings_df.head() # 사용자 아이디, 영화 아이디, 사용자가 해당 영화에 매긴 평점

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_df.head() # 영화 제목 및 장르

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
tags_df.head() #  사용자 아이디, 영화 아이디, 사용자가 해당 영화에 남긴 감상(태그)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


#### Genres를 이용한 movie representation

In [6]:
total_count = len(movies_df.index) #전체 영화 수
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))
# 전체 장르 수

In [7]:
print(f"전체 영화 수: {total_count}")
print(f"장르: {total_genres}")

전체 영화 수: 9742
장르: ['Crime', 'Thriller', 'Western', 'Horror', '(no genres listed)', 'Comedy', 'IMAX', 'Sci-Fi', 'Drama', 'Animation', 'Action', 'War', 'Mystery', 'Documentary', 'Film-Noir', 'Adventure', 'Fantasy', 'Children', 'Musical', 'Romance']


In [8]:
print(len(total_genres))

20


In [9]:
# 장르별 영화 수
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre]+1
            
genre_count

{'Crime': 1199,
 'Thriller': 1894,
 'Western': 167,
 'Horror': 978,
 '(no genres listed)': 34,
 'Comedy': 3756,
 'IMAX': 158,
 'Sci-Fi': 980,
 'Drama': 4361,
 'Animation': 611,
 'Action': 1828,
 'War': 382,
 'Mystery': 573,
 'Documentary': 440,
 'Film-Noir': 87,
 'Adventure': 1263,
 'Fantasy': 779,
 'Children': 664,
 'Musical': 334,
 'Romance': 1596}

In [10]:
# 장르별 가중치
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])
  

genre_count


{'Crime': 0.9098289421369025,
 'Thriller': 0.7112681505684965,
 'Western': 1.7659316540881678,
 'Horror': 0.9983092704481497,
 '(no genres listed)': 2.457169208193496,
 'Comedy': 0.4139225416416778,
 'IMAX': 1.7899910382813284,
 'Sci-Fi': 0.9974220495432563,
 'Drama': 0.3490620385623247,
 'Animation': 1.2026069149931968,
 'Action': 0.7266719338379385,
 'War': 1.4065847623240424,
 'Mystery': 1.2304935032683613,
 'Documentary': 1.3451954487495636,
 'Film-Noir': 2.0491288726171324,
 'Adventure': 0.8872447746804204,
 'Fantasy': 1.0971106675631865,
 'Children': 1.1664800458677336,
 'Musical': 1.4649016584241867,
 'Romance': 0.7856152382210405}

In [11]:
# 영화 아이디 및 해당 영화가 각 장르에 가지는 가중치를 데이터 프레임으로 만듦
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation

9742it [00:25, 377.98it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


## Tag를 이용한 Movie Representation

In [12]:
tags_df.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [13]:
# 영화 데이터 중 하나 가져옴 (제목:warrior, 장르: 드라마)
movies_df.loc[89774]

title     Warrior (2011)
genres             Drama
Name: 89774, dtype: object

In [14]:
# 전체 사용자가 영화별로 단 태그들을 중복을 제거하여 가져옴
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['teenagers', 'heavy metal', 'drugs & music', 'Peace Corp', 'black humor', 'lions', 'MacBeth', 'Up series', 'television', 'anthology', 'Katzanzakis', 'sisterhood', 'Amazing Cinematography', 'Robert Ludlum', 'Tom Hanks', 'seen more than once', 'Amish', 'slow action', 'Documentary', 'Moving', 'intense', 'Big Brothers', 'Angelina Jolie', 'dance', 'uplifting', '1970s', 'singletons', 'priest', 'brothers', 'fugitive', 'cheating', 'unsettling', 'teachers', 'consumerism', 'Mila Kunis', 'parody', 'Europe', 'stone age', 'financial crisis', 'r:strong language', 'England', 'Hal', 'Adrien Brody', 'real estate', 'Zooey Deschanel', 'setting:space/space ship', 'daniel radcliffe', 'seen at the cinema', 'good and evil', 'scandal', 'deafness', 'jackie chan', 'circus', 'McCarthy hearings', 'meditative', 'Julianne Moore', 'amnesia', 'Beatles', 'fantasy', 'Harper Lee', 'heartbreaking', 'mockumentary', 'Surreal', 'gintama', 'Adam Sandler', 'gold', 'books', 'nanny', 'insanity', 'modern fantasy', 'lieutenant d

In [15]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [16]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# 키값이 태그이고, 해당 태그에 속하는 영화의 수를 값으로 갖는 딕셔너리 제작
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf

{'teenagers': 3.196452541703389,
 'heavy metal': 3.196452541703389,
 'drugs & music': 3.196452541703389,
 'Peace Corp': 3.196452541703389,
 'black humor': 3.196452541703389,
 'lions': 3.196452541703389,
 'MacBeth': 3.196452541703389,
 'Up series': 3.196452541703389,
 'television': 2.4183012913197452,
 'anthology': 3.196452541703389,
 'Katzanzakis': 3.196452541703389,
 'sisterhood': 3.196452541703389,
 'Amazing Cinematography': 3.196452541703389,
 'Robert Ludlum': 3.196452541703389,
 'Tom Hanks': 2.5943925503754266,
 'seen more than once': 2.895422546039408,
 'Amish': 2.895422546039408,
 'slow action': 3.196452541703389,
 'Documentary': 3.196452541703389,
 'Moving': 3.196452541703389,
 'intense': 2.5943925503754266,
 'Big Brothers': 3.196452541703389,
 'Angelina Jolie': 3.196452541703389,
 'dance': 2.7193312869837265,
 'uplifting': 3.196452541703389,
 '1970s': 2.7193312869837265,
 'singletons': 3.196452541703389,
 'priest': 2.7193312869837265,
 'brothers': 3.196452541703389,
 'fugitive'

In [17]:

len(tag_idf.keys())

1589

In [20]:
#각 영화가 태그들에 갖는 가중치를 보여주는 데이터 프레임 제작
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index(0)
tag_representation

100%|██████████████████████████████████████████████████████████████████████████████| 1572/1572 [01:55<00:00, 13.66it/s]


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# 영화 아이디 및 영화 제목 확인
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [22]:
# 영화아이디 1(Toy Story) 는 fun, pixar 라는 태그와 각각에 대해 하단의 가중치를 갖습니다.
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: object

In [23]:
# 영화아이디 2(Jumanji) 는 Robin Williams, fantasy, game, magic board game이라는 태그와 각각에 대해 하단의 가중치를 갖습니다.
tag_representation.loc[2].dropna()

Robin Williams      2.719331
fantasy             2.418301
game                3.196453
magic board game    3.196453
Name: 2, dtype: object

In [24]:
# 각 데이터 프레임의 shape 확인
print(genre_representation.shape)
print(tag_representation.shape)

(9742, 20)
(1572, 1589)


## Final Movie Represenation
- genre와 tag로 만들어진 representation을 합쳐서 각 movie의 vector로 만든다

In [25]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
print(movie_representation.describe())

(9742, 1609)
       (no genres listed)       Action    Adventure    Animation     Children  \
count         9742.000000  9742.000000  9742.000000  9742.000000  9742.000000   
mean             0.008576     0.136354     0.115027     0.075425     0.079506   
std              0.144915     0.283726     0.298052     0.291593     0.293989   
min              0.000000     0.000000     0.000000     0.000000     0.000000   
25%              0.000000     0.000000     0.000000     0.000000     0.000000   
50%              0.000000     0.000000     0.000000     0.000000     0.000000   
75%              0.000000     0.000000     0.000000     0.000000     0.000000   
max              2.457169     0.726672     0.887245     1.202607     1.166480   

            Comedy        Crime  Documentary        Drama      Fantasy  ...  \
count  9742.000000  9742.000000  9742.000000  9742.000000  9742.000000  ...   
mean      0.159587     0.111978     0.060756     0.156257     0.087728  ...   
std       0.201476  

## Contents 유사도 평가
- Cosine similarity를 사용

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
# 코사인 유사도를 통해 콘텐츠간 유사도를 평가합니다
def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [27]:
print(movie_representation.head())


   (no genres listed)  Action  Adventure  Animation  Children    Comedy  \
1                 0.0     0.0   0.887245   1.202607   1.16648  0.413923   
2                 0.0     0.0   0.887245   0.000000   1.16648  0.000000   
3                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
4                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
5                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   

   Crime  Documentary     Drama   Fantasy  ...  women  wonderwoman  workplace  \
1    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
2    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
3    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   
4    0.0          0.0  0.349062  0.000000  ...    0.0          0.0        0.0   
5    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   

   writing  wrongful imprisonment  wry  younger men  zither  z

In [28]:
cs_df = cos_sim_matrix(movie_representation, movie_representation) # 결과를 데이터 프레임으로 저장합니다.
cs_df.head() # 결과를 위에서 5행까지 출력합니다

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [29]:
cs_df.shape

(9742, 9742)

In [30]:

print(cs_df.shape)
print(cs_df[1].sort_values(ascending=False))

(9742, 9742)
2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [31]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[126142])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     The Cave of the Golden Rose (1991)
genres            Adventure|Children|Fantasy
Name: 126142, dtype: object
title     Darby O'Gill and the Little People (1959)
genres                   Adventure|Children|Fantasy
Name: 2043, dtype: object
title     Santa Claus: The Movie (1985)
genres       Adventure|Children|Fantasy
Name: 2399, dtype: object


## 추천시스템의 성능 평가
- 학습셋과 테스트셋을 나눈다.
- 테스트셋에서 예측한 평점과 실제 평점의 RMSE를 구한다.


In [32]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=123)

In [33]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [34]:
test_userids = list(set(test_df.userId.values))
test_userids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [35]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]  # (n, 9742); n은 userId가 평점을 매긴 영화 수
    user_rating_df = user_record_df[['rating']]  # (n, 1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)  # (9742, 1)
    # print("user_id=", i, user_record_df.shape, user_sim_df.T.shape, user_rating_df.shape, sim_sum.shape)

    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) # (9742, 1)

    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|███████████████████████████████████████████████████████████████████████████████| 610/610 [00:04<00:00, 130.28it/s]


In [36]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,6,4.136308,1,4.0,964982224
1,151,4.261184,1,5.0,964984041
2,231,4.232403,1,5.0,964981179
3,423,4.088333,1,3.0,964982363
4,441,4.058008,1,4.0,964980868
5,543,3.688726,1,4.0,964981179
6,804,4.063459,1,4.0,964980499
7,954,2.300271,1,5.0,964983219
8,1029,4.53164,1,5.0,964982855
9,1030,4.482305,1,3.0,964982903


In [37]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.3970670771596811 1.181975920719065
