## 순서
1. 컨텐츠기반 추천시스템
2. 컨텐츠를 활용할 수 있는지
3. 어떻게 컨텐츠들이 서로 연관이 있는지 평가하는 방법
4. 근접이웃기반 컨텐츠기반 추천시스템 -> k-nearest neighbor
5. naive bayes classifer -> 베이즈 추천시스템
6. TF-IDF 개념설명 그리고 실습

- 추천시스템 성능 평가 -> RMSE

# 데이터 불러오기

In [17]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [18]:
ratings_df=pd.read_csv('ratings.csv')
tags_df=pd.read_csv('tags.csv')
movies_df=pd.read_csv("movies.csv")

In [19]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [20]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Genres를 이용한 movie representation

In [22]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))

In [23]:
print(f"전체 영화 수: {total_count}")
print(f"장르: {total_genres}")

전체 영화 수: 9742
장르: ['Mystery', 'Horror', 'Comedy', 'Animation', 'IMAX', 'War', 'Thriller', 'Documentary', 'Drama', '(no genres listed)', 'Western', 'Adventure', 'Crime', 'Musical', 'Action', 'Fantasy', 'Film-Noir', 'Romance', 'Children', 'Sci-Fi']


In [10]:
print(len(total_genres))

20


In [24]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre]+1
genre_count

{'(no genres listed)': 34,
 'Action': 1828,
 'Adventure': 1263,
 'Animation': 611,
 'Children': 664,
 'Comedy': 3756,
 'Crime': 1199,
 'Documentary': 440,
 'Drama': 4361,
 'Fantasy': 779,
 'Film-Noir': 87,
 'Horror': 978,
 'IMAX': 158,
 'Musical': 334,
 'Mystery': 573,
 'Romance': 1596,
 'Sci-Fi': 980,
 'Thriller': 1894,
 'War': 382,
 'Western': 167}

In [25]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])
  
genre_count

{'(no genres listed)': 2.457169208193496,
 'Action': 0.7266719338379385,
 'Adventure': 0.8872447746804204,
 'Animation': 1.2026069149931968,
 'Children': 1.1664800458677336,
 'Comedy': 0.41392254164167785,
 'Crime': 0.9098289421369025,
 'Documentary': 1.3451954487495636,
 'Drama': 0.3490620385623247,
 'Fantasy': 1.0971106675631868,
 'Film-Noir': 2.0491288726171324,
 'Horror': 0.9983092704481497,
 'IMAX': 1.7899910382813284,
 'Musical': 1.4649016584241867,
 'Mystery': 1.2304935032683613,
 'Romance': 0.7856152382210405,
 'Sci-Fi': 0.9974220495432562,
 'Thriller': 0.7112681505684965,
 'War': 1.4065847623240424,
 'Western': 1.7659316540881678}

In [26]:
# create genre representations
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation

9742it [00:45, 213.41it/s]


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
1,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
2,,,,,,0.413923,,,,,,,,,,0.785615,,,,
3,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
9738,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
9739,,,,,,,,,0.349062,,,,,,,,,,,
9740,,0.726672,,1.202607,,,,,,,,,,,,,,,,


# Tag를 이용한 Movie Representation

In [27]:
tags_df.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [30]:
movies_df.loc[89774]

In [29]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['Lolita theme', 'multiple stories', 'virginity', 'independent film', 'DC', 'Jean Reno', 'alternate reality', 'terrorism', 'BEST PICTURE', 'Shark', 'gruesome', 'Romans', 'foul language', 'S.E. Hinton', 'Amazing Cinematography', 'revolution', 'Michael Bay', 'Hannibal Lecter', 'Deep Throat', 'Psychological Thriller', 'Africa', 'Visually Striking', 'adorable', 'South America', 'Rosebud', 'imaginative', 'stop looking at me swan', 'Jennifer Connelly', 'von Bulow', 'AWESOME', 'disappointing', 'action choreography', 'stiller', 'younger men', 'dumb', 'cult classic', 'tension building', 'Beethoven', 'sequel', 'pop culture references', 'wine', 'whales', 'Rome', 'Mount Rushmore', 'Charlotte Bronte', 'Venice', 'Bittersweet', 'insanity', 'Harrison Ford', 'bittersweet', 'Cerebral', 'Modern war', 'intimate', 'wedding', 'coen brothers', 'leopard', 'ironic', 'prodigies', 'Aardman', 'mel gibson', 'new composer', 'MacBeth', 'claims to be true', 'matchmaker', 'Atmospheric', 'Johnny Depp', 'surrealism', 'I

In [30]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [31]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# key: tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf

{'Lolita theme': 3.196452541703389,
 'multiple stories': 3.196452541703389,
 'virginity': 2.895422546039408,
 'independent film': 3.196452541703389,
 'DC': 3.196452541703389,
 'Jean Reno': 2.895422546039408,
 'alternate reality': 2.7193312869837265,
 'terrorism': 2.4974825373673704,
 'BEST PICTURE': 3.196452541703389,
 'Shark': 3.196452541703389,
 'gruesome': 3.196452541703389,
 'Romans': 3.196452541703389,
 'foul language': 3.196452541703389,
 'S.E. Hinton': 3.196452541703389,
 'Amazing Cinematography': 3.196452541703389,
 'revolution': 3.196452541703389,
 'Michael Bay': 2.895422546039408,
 'Hannibal Lecter': 2.895422546039408,
 'Deep Throat': 3.196452541703389,
 'Psychological Thriller': 3.196452541703389,
 'Africa': 2.4974825373673704,
 'Visually Striking': 3.196452541703389,
 'adorable': 3.196452541703389,
 'South America': 2.895422546039408,
 'Rosebud': 3.196452541703389,
 'imaginative': 3.196452541703389,
 'stop looking at me swan': 3.196452541703389,
 'Jennifer Connelly': 3.1964

In [32]:
len(tag_idf.keys())

1589

In [33]:
# Create movie representations
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index(0)
tag_representation

100%|██████████| 1572/1572 [04:04<00:00,  6.43it/s]
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


- 예시: 어떤 영화에 대해서 어떤 tag들이 있는지.

In [34]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: object

In [36]:
tag_representation.loc[2].dropna()

Robin Williams      2.719331
fantasy             2.418301
game                3.196453
magic board game    3.196453
Name: 2, dtype: object

In [37]:
print(genre_representation.shape)
print(tag_representation.shape)

(9742, 20)
(1572, 1589)


# Final Movie Represenation

- genre와 tag로 만들어진 representation을 합쳐서 각 movie의 vector로 만든다

In [38]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
print(movie_representation.describe())

(10053, 1609)
       (no genres listed)        Action     Adventure     Animation  \
count        10053.000000  10053.000000  10053.000000  10053.000000   
mean             0.008310      0.132135      0.111468      0.073092   
std              0.142663      0.280298      0.294080      0.287344   
min              0.000000      0.000000      0.000000      0.000000   
25%              0.000000      0.000000      0.000000      0.000000   
50%              0.000000      0.000000      0.000000      0.000000   
75%              0.000000      0.000000      0.000000      0.000000   
max              2.457169      0.726672      0.887245      1.202607   

           Children        Comedy         Crime   Documentary         Drama  \
count  10053.000000  10053.000000  10053.000000  10053.000000  10053.000000   
mean       0.077046      0.154650      0.108513      0.058877      0.151423   
std        0.289732      0.200251      0.294893      0.275212      0.173003   
min        0.000000      0.000

# Contents 유사도 평가

- Cosine similarity를 사용


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [40]:
print(movie_representation.head())

   (no genres listed)  Action  Adventure  Animation  Children    Comedy  \
0                 0.0     0.0   0.887245   1.202607   1.16648  0.413923   
1                 0.0     0.0   0.887245   0.000000   1.16648  0.000000   
2                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
3                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
4                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   

   Crime  Documentary     Drama   Fantasy  ...  women  wonderwoman  workplace  \
0    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
1    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
2    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   
3    0.0          0.0  0.349062  0.000000  ...    0.0          0.0        0.0   
4    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   

   writing  wrongful imprisonment  wry  younger men  zither  z

In [41]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10043,10044,10045,10046,10047,10048,10049,10050,10051,10052
0,1.0,0.35465,0.013092,0.016636,0.185686,0.0,0.086555,0.364907,0.0,0.261682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.35465,1.0,0.0,0.0,0.0,0.0,0.0,0.191875,0.0,0.137597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.013092,0.0,1.0,0.029072,0.070506,0.0,0.151256,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.016636,0.0,0.029072,1.0,0.089592,0.0,0.192203,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.185686,0.0,0.070506,0.089592,1.0,0.0,0.466135,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
cs_df.shape

(10053, 10053)

In [43]:
print(cs_df.shape)
print(cs_df[1].sort_values(ascending=False))

(10053, 10053)
1         1.000000
122918    0.589114
3574      0.431835
8719      0.431835
1799      0.431835
            ...   
3711      0.000000
3710      0.000000
3709      0.000000
3708      0.000000
193565    0.000000
Name: 1, Length: 10053, dtype: float64


In [44]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[126142])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

# 추천시스템의 성능 평가


- 학습셋과 테스트셋을 나눈다.
- 테스트셋에서 예측한 평점과 실제 평점의 RMSE를 구한다.

In [45]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [46]:
test_userids = list(set(test_df.userId.values))
test_userids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [50]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]  # (n, 9742); n은 userId가 평점을 매긴 영화 수
    user_rating_df = user_record_df[['rating']]  # (n, 1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)  # (9742, 1)
    # print("user_id=", i, user_record_df.shape, user_sim_df.T.shape, user_rating_df.shape, sim_sum.shape)

    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) # (9742, 1)

    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

In [48]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,1,3.386474,1,4.0,964982703
1,50,3.228582,1,5.0,964982931
2,216,3.570482,1,5.0,964981208
3,223,3.500763,1,3.0,964980985
4,231,4.127854,1,5.0,964981179
5,235,3.925601,1,4.0,964980908
6,316,3.758252,1,3.0,964982310
7,457,3.538283,1,5.0,964981909
8,543,3.600049,1,4.0,964981179
9,592,3.776538,1,4.0,964982271


In [49]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.056850353941454 1.028032272811245
