## 00. 환경설정

In [1]:
import pandas as pd
import numpy  as np
from math import log
from tqdm import tqdm

## 01. TF(TermFrequency)-IDF(InverseDocumentFrequency) 실습

### 토이 데이터를 가지고 직접 TF-IDF를 계산해봅시다

In [2]:
document_list = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요'
]

#### 전체 문서

In [3]:
N = len(document_list)
N

4

#### 전체 단어에 대한 정보 확인

In [4]:
vocab = list(set([word for document in document_list for word in document.split(" ")]))
vocab.sort()

In [5]:
print("전체 단어:", vocab)
print("전체 단어 갯수:", len(vocab))

전체 단어: ['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']
전체 단어 갯수: 9


### TF-IDF를 구하는 함수

In [6]:
def tf(word, document):
    return document.count(word)

def idf(word):
    df = 0
    for document in document_list:
        df += 1 if word in document else 0
        
    return log(N / (df+1))

def tfidf(word, document):
    return tf(word, document) * idf(word)

#### TF 구하기

In [7]:
result = []
for i in range(N):
    doc_tf = []
    d = document_list[i]
    for j in range(len(vocab)):
        t = vocab[j]
        doc_tf.append(tf(t, d))
        
    result.append(doc_tf)
    
tf_df = pd.DataFrame(result, columns = vocab)
tf_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


#### IDF 구하기
- IDF는 전체 문서에 대해서 단어별로 하나의 값을 가집니다.

In [8]:
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))
    
idf_df = pd.DataFrame(result, index=vocab, columns=['idf'])
idf_df.T

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
idf,0.693147,0.693147,0.693147,0.287682,0.287682,0.693147,0.287682,0.693147,0.693147


#### TF-IDF 구하기

In [9]:
result = []

for i in range(N):
    doc_tfidf = []
    d = document_list[i]
    
    # 하나의 문서 내에서 개별 단어에 대해서 TF-IDF 값을 구합니다.
    for j in range(len(vocab)):
        t = vocab[j]
        doc_tfidf.append(tfidf(t, d))
        
    result.append(doc_tfidf)
    
tfidf_df = pd.DataFrame(result, columns=vocab)
tfidf_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


## 02. 무비렌즈 데이터로 TF-IDF 기반 추천시스템 구현
- 추천 아이템인 영화에 대해서 TF-IDF 벡터를 만드는 것이 핵심 목표

In [10]:
path = "../data/ml-latest-small/"

- 평점이 존재하는 모든 영화 정보

In [11]:
ratings_df = pd.read_csv(path + 'ratings.csv', encoding='utf-8')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


- 우리가 주로 사용하게 될 추천 아이템인 영화에 대한 정보

In [12]:
movies_df  = pd.read_csv(path + 'movies.csv', encoding='utf-8', index_col = 'movieId')
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


- 개별 영화에 대한 아이템 프로파일을 구축하려면 영화의 정보를 Feature로 사용해야 합니다.
- 영화에 대한 피쳐를 TF-IDF로 만들기 위해서 영화의 정보를 담고 있는 Document와 Word가 필요합니다.

- **`개별 영화를 Document, 영화를 표현하는 장르들을 Word`**로 설정하고 TF-IDF를 구해봅시다.

#### 전체 영화의 개수

In [13]:
N = len(movies_df)
N

9742

#### 전체 단어의 개수(=영화 장르 개수)

In [14]:
genres = list(set([genre for movie in movies_df['genres'].tolist() for genre in movie.split("|")]))
genres.sort()

print("전체 단어:", genres)
print("전체 단어 길이:", len(genres))

전체 단어: ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
전체 단어 길이: 20


- (no genres listed)도 하나의 genre로 보고 하나의 word로 취급합니다

In [15]:
movies_df[movies_df['genres']=='(no genres listed)'].head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
114335,La cravate (1957),(no genres listed)
122888,Ben-hur (2016),(no genres listed)
122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed)
129250,Superfast! (2015),(no genres listed)
132084,Let It Be Me (1995),(no genres listed)


### 전체 단어에 대한 IDF 구하기

#### DF(Document Frequency): 각 단어가 문서에 등장하는 횟수

In [16]:
# genres를 key로, 기본값이 0인 dictionary 생성
df_dict = dict.fromkeys(genres, 0)

for genre_list in tqdm(movies_df['genres']):
    for genre in genres:
        df_dict[genre] += genre in genre_list.split("|")
        
df_dict

100%|███████████████████████████████████████████████████████████████████████████| 9742/9742 [00:00<00:00, 67162.91it/s]


{'(no genres listed)': 34,
 'Action': 1828,
 'Adventure': 1263,
 'Animation': 611,
 'Children': 664,
 'Comedy': 3756,
 'Crime': 1199,
 'Documentary': 440,
 'Drama': 4361,
 'Fantasy': 779,
 'Film-Noir': 87,
 'Horror': 978,
 'IMAX': 158,
 'Musical': 334,
 'Mystery': 573,
 'Romance': 1596,
 'Sci-Fi': 980,
 'Thriller': 1894,
 'War': 382,
 'Western': 167}

#### 단어별 IDF 구하기

In [17]:
idf_dict = dict.fromkeys(df_dict.keys())
for key, value in df_dict.items():
    idf_dict[key] = log(N / (value+1))
idf_dict

{'(no genres listed)': 5.6288536528770745,
 'Action': 1.6726770659756223,
 'Adventure': 2.0421651396596854,
 'Animation': 2.767469431854162,
 'Children': 2.684414673710634,
 'Comedy': 0.9528256687925191,
 'Crime': 2.0941248785903963,
 'Documentary': 3.095156838919642,
 'Drama': 0.8035157676049136,
 'Fantasy': 2.5249077946828504,
 'Film-Noir': 4.706864899888282,
 'Horror': 2.2976700718359777,
 'IMAX': 4.115297512146256,
 'Musical': 3.3700711825414214,
 'Mystery': 2.8315723180469217,
 'Romance': 1.8083195661514755,
 'Sci-Fi': 2.295629254801125,
 'Thriller': 1.6372275968499612,
 'War': 3.236166725185842,
 'Western': 4.060237734963229}

### 전체 단어에 대한 TF-IDF 구하기

In [18]:
result = []
index_list = []

for movie_id, row in tqdm(movies_df.iterrows()):
    doc_tf = []
    document = row['genres']
    
    for genre in genres:
        doc_tf.append(tf(genre, document) * idf_dict[genre])
        
    result.append(doc_tf)
    index_list.append(movie_id)
    
tfidf_df = pd.DataFrame(result, columns=genres, index=index_list).sort_index()

9742it [00:01, 5246.44it/s]


#### 개별 영화에 대한 TF-IDF 벡터 데이터프레임
- 각 영화(문서)는 단어의 개수만큼의 차원을 가진 벡터로 표현됩니다.

In [19]:
tfidf_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.0,0.000000,2.042165,2.767469,2.684415,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
2,0.0,0.000000,2.042165,0.000000,2.684415,0.000000,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.80832,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.803516,0.000000,0.0,0.0,0.0,0.0,0.0,1.80832,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,1.672677,0.000000,2.767469,0.000000,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,2.767469,0.000000,0.952826,0.0,0.0,0.000000,2.524908,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.803516,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
193587,0.0,1.672677,0.000000,2.767469,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


### 아이템 유사도 기반 추천
- 아이템끼리의 유사도를 구할 수 있습니다.
- 어떤 영화가 주어졌을 때 그 영화와 cosine 유사도가 가장 높은 영화를 추천합니다.

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index], columns=b.index)
    
    return result_df

#### 예시 : 3개 영화 벡터

In [21]:
a = [[0.2, 0.4, 1.2, 1.5],
     [0.4, 0.7, 0.3, 0.5],
     [0.3, 1.2, 1.0, 1.0]]

# 결과: 유저와 3개의 영화의 유사도 = 추천 스코어
cosine_similarity(a, a)

array([[1.        , 0.74907437, 0.87434505],
       [0.74907437, 1.        , 0.94147267],
       [0.87434505, 0.94147267, 1.        ]])

#### 먼저 영화와 영화 사이의 유사도를 구합니다.

In [22]:
movie_sim_df = cos_sim_matrix(tfidf_df, tfidf_df)
movie_sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.821277,0.086580,0.080578,0.185731,0.000000,0.086580,0.657468,0.000000,0.261707,...,0.409432,0.518058,0.141984,0.539452,0.0,0.691516,0.753482,0.000000,0.461676,0.185731
2,0.821277,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.800544,0.000000,0.318658,...,0.000000,0.000000,0.000000,0.000000,0.0,0.359250,0.391443,0.000000,0.000000,0.000000
3,0.086580,0.000000,1.000000,0.930671,0.466160,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.108904,0.000000,0.356361,0.000000,0.0,0.105457,0.114907,0.000000,0.000000,0.466160
4,0.080578,0.000000,0.930671,1.000000,0.433841,0.000000,0.930671,0.000000,0.000000,0.000000,...,0.101354,0.102011,0.567512,0.000000,0.0,0.098145,0.106940,0.365857,0.000000,0.433841
5,0.185731,0.000000,0.466160,0.433841,1.000000,0.000000,0.466160,0.000000,0.000000,0.000000,...,0.233619,0.000000,0.764462,0.000000,0.0,0.226224,0.246496,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.691516,0.359250,0.105457,0.098145,0.226224,0.211509,0.105457,0.000000,0.397135,0.213852,...,0.661569,0.631007,0.172940,0.657066,0.0,1.000000,0.917760,0.000000,0.767757,0.226224
193583,0.753482,0.391443,0.114907,0.106940,0.246496,0.000000,0.114907,0.000000,0.000000,0.000000,...,0.543386,0.687551,0.188437,0.715945,0.0,0.917760,1.000000,0.000000,0.612723,0.246496
193585,0.000000,0.000000,0.000000,0.365857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278828,0.644669,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.461676,0.000000,0.000000,0.000000,0.000000,0.275490,0.000000,0.000000,0.517266,0.278541,...,0.792853,0.821883,0.000000,0.855825,0.0,0.767757,0.612723,0.000000,1.000000,0.000000


#### 아이템 유사도 기반 Top K Recommendation

In [23]:
k = 10

given_movie = 'Black Butler: Book of the Atlantic (2017)'
movie_id = movies_df[movies_df.title == given_movie].index[0]

for index, value in movie_sim_df[movie_id].sort_values(ascending=False)[:k].items():
    recommend_movie_id = index[0]
    print(movies_df.loc[recommend_movie_id]['title'], value)

Black Butler: Book of the Atlantic (2017) 1.0
Superman/Batman: Public Enemies (2009) 0.9740752144041196
Dante's Inferno: An Animated Epic (2010) 0.9740752144041196
Justice League: Doom (2012)  0.9740752144041196
Daddy, I'm A Zombie (2012) 0.9177602335851786
Monkeybone (2001) 0.9177602335851786
Mickey's Once Upon a Christmas (1999) 0.9177602335851786
Anomalisa (2015) 0.9177602335851786
South Park: Imaginationland (2008) 0.9177602335851786
Cool World (1992) 0.9177602335851786


## 03. 유저 유사도 기반 추천시스템 구현
- 데이터를 train/test셋으로 나눕니다.
- 아이템 프로파일인 TF-IDF 벡터를 만듭니다.
- 아이템 벡터를 사용하여 **`유저 프로파일 벡터`**를 만듭니다.
- 유저 프로파일 벡터와 추천 후보 아이템 벡터의 similarity를 계산하여 사용자에게 적합한 Top K 아이템을 추천한다.

- Top K Recommendation의 경우, 추천된 아이템 K에 대해서 사용자가 선호한 아이템이 얼마나 있는지를 평가한다.
- 무비렌즈 데이터를 사용할 경우 사용자가 아이템을 선호한다는 기준이 필요하다. (사용자가 아이템에 내린 절대 평점값만이 존재하기 때문에)
- 평점이 존재하는 영화는 모두 선호한다고 가정

In [24]:
user_id_list = ratings_df['userId'].unique()
movie_id_list = movies_df.index.tolist()
movie_id_set = set(movie_id_list)

In [25]:
len(user_id_list)

610

### Hit-ratio로 성능 평가
- 사용자가 선호도를 표시한 아이템 가운데 한 개만 제거해서 이를 test셋에 넣습니다.
- 남은 데이터를 학습 데이터로 사용하여 추천 모델을 만들고 추천 결과를 생성하여 테스트 데이터에 있는 영화와 일치하는지 평가합니다.

#### 사용자 1명에 대해서 추천을 수행하고 성능을 평가해봅시다.
- 사용할 train/test 데이터는 ratings_df이고, 아이템 프로파일인 tf-idf 벡터는 이미 존재합니다.

In [26]:
user_id = 10
_df = ratings_df[ratings_df.userId == user_id]
_df

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,1455303387
1120,10,356,3.5,1455301685
1121,10,588,4.0,1455306173
1122,10,597,3.5,1455357645
1123,10,912,4.0,1455302254
...,...,...,...,...
1254,10,119145,1.0,1455302650
1255,10,129428,3.5,1455357384
1256,10,136020,5.0,1455302192
1257,10,137595,4.0,1455356898


- 랜덤하게 하나의 영화 제거

In [27]:
drop_indices = np.random.choice(_df.index, 1, replace=False)
_df_train = _df.drop(drop_indices)
_df_train

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,1455303387
1120,10,356,3.5,1455301685
1121,10,588,4.0,1455306173
1122,10,597,3.5,1455357645
1123,10,912,4.0,1455302254
...,...,...,...,...
1254,10,119145,1.0,1455302650
1255,10,129428,3.5,1455357384
1256,10,136020,5.0,1455302192
1257,10,137595,4.0,1455356898


### 유저 벡터 생성
- 유저가 봤던 영화들의 평균이 가장 간단한 유저 벡터를 생성하는 방법

In [28]:
rated_movie_list = _df_train['movieId'].tolist()
user_profile_df = tfidf_df.loc[rated_movie_list, :].mean().to_frame().T
user_profile_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.312875,0.455447,0.298648,0.270373,0.534679,0.195853,0.0,0.410429,0.308802,0.0,0.03306,0.532916,0.218206,0.061113,1.001731,0.082577,0.153122,0.093127,0.0


- train 데이터에 있는 영화 id를 추천 대상에서 제거합니다.
- 이미 선호했던 영화를 제외한 나머지 영화 가운데에서 추천을 수행합니다.

In [29]:
rated_movie_set = set(rated_movie_list)
recommend_movie_list = list(movie_id_set - rated_movie_set)
recommend_movie_tfidf_df = tfidf_df.loc[recommend_movie_list, :]
recommend_movie_tfidf_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.0,0.000000,2.042165,2.767469,2.684415,0.952826,0.000000,0.0,0.000000,2.524908,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
2,0.0,0.000000,2.042165,0.000000,2.684415,0.000000,0.000000,0.0,0.000000,2.524908,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,1.80832,0.0,0.000000,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.803516,0.000000,0.0,0.00000,0.0,0.0,0.0,1.80832,0.0,0.000000,0.000000,0.0
5,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163809,0.0,0.000000,2.042165,2.767469,0.000000,0.000000,0.000000,0.0,0.803516,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
32743,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.803516,0.000000,0.0,2.29767,0.0,0.0,0.0,0.00000,0.0,1.637228,0.000000,0.0
98279,0.0,0.000000,0.000000,0.000000,0.000000,0.952826,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,0.000000,0.0
65514,0.0,1.672677,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.803516,0.000000,0.0,0.00000,0.0,0.0,0.0,0.00000,0.0,0.000000,3.236167,0.0


- 유저 벡터와 나머지 아이템 벡터간의 유사도를 계산합니다.

In [30]:
top_k_result_df = cos_sim_matrix(recommend_movie_tfidf_df, user_profile_df)
top_k_result_df

Unnamed: 0,0
1,0.471176
2,0.370491
3,0.727734
4,0.773519
5,0.342679
...,...
163809,0.378590
32743,0.143418
98279,0.342679
65514,0.198348


#### Top K 추천 결과

In [31]:
k = 10

top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()
top_k_list

[2243, 50685, 5051, 113225, 95309, 50792, 58839, 6268, 58806, 3825]

- Test 영화 선호 데이터

In [32]:
_df_test = _df.loc[drop_indices.tolist(), :]
test_movie_id = _df_test['movieId'].values[0]
test_movie_id

5377

In [33]:
print("test 선호 영화: ", test_movie_id)
print("top k 추천: ", top_k_list)
print("hit 여부: ", test_movie_id in top_k_list)

test 선호 영화:  5377
top k 추천:  [2243, 50685, 5051, 113225, 95309, 50792, 58839, 6268, 58806, 3825]
hit 여부:  False


### Simple user vector 기반 추천시스템 구현 (average)

In [None]:
k = 20
hit = 0

for user_id in tqdm(user_id_list):
    
    # 개별 유저 데이터 로드
    _df = ratings_df[ratings_df.userId == user_id]
    
    # train / test 데이터 나누기
    drop_indices = np.random.choice(_df.index, 1, replace=False)
    _df_train = _df.drop(drop_indices)
    
    # user vector를 생성
    rated_movie_list = _df_train['movieId'].tolist()
    user_profile_df = tfidf_df.loc[rated_movie_list, :].mean().to_frame().T
    
    # 이미 유저가 평가를 한 영화는 추천 대상에서 제거
    rated_movie_set = set(rated_movie_list)
    recommend_movie_list = list(movie_id_set - rated_movie_set)
    recommend_movie_tfidf_df = tfidf_df.loc[recommend_movie_list, :]
    
    # 추천 대상 영화들의 item vector와에 user vector의 유사도를 구한 뒤, top k 영화를 뽑는다.
    top_k_result_df = cos_sim_matrix(recommend_movie_tfidf_df, user_profile_df)
    top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
    top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()
    
    # 유저 u에 대한 평가 (hit 여부)
    _df_test = _df.loc[drop_indices.tolist(), :]
    test_movie_id = _df_test['movieId'].values[0]
    
    hit += test_movie_id in top_k_list
    
hit_ratio = hit / len(user_id_list)

 30%|████████████████████████▏                                                       | 184/610 [00:05<00:12, 33.73it/s]

In [None]:
print('hit_ratio: ', hit_ratio)

### Weighted average user vector 기반 추천시스템 구현 (variant)

In [None]:
k = 20
hit = 0

for user_id in tqdm(user_id_list):
    
    # 개별 유저 데이터 로드
    _df = ratings_df[ratings_df.userId == user_id]
    
    # train / test 데이터 나누기
    drop_indices = np.random.choice(_df.index, 1, replace=False)
    _df_train = _df.drop(drop_indices)
    
    # user vector를 생성
    # 유저가 봤던 영화들의 벡터의 평점을 이용한 가중 평균이 user vector (variant)
    rated_movie_list = _df_train['movieId'].tolist()
    numerator = np.matmul(tfidf_df.loc[rated_movie_list].to_numpy().T, _df_train['rating'].to_numpy())
    denominator = _df_train['rating'].sum()
    user_profile_df = pd.DataFrame([numerator], columns = tfidf_df.columns) / denominator
    
    # 이미 유저가 평가를 한 영화는 추천 대상에서 제거
    rated_movie_set = set(rated_movie_list)
    recommend_movie_list = list(movie_id_set - rated_movie_set)
    recommend_movie_tfidf_df = tfidf_df.loc[recommend_movie_list, :]
    
    # 추천 대상 영화들의 item vector와에 user vector의 유사도를 구한 뒤, top k 영화를 뽑는다.
    top_k_result_df = cos_sim_matrix(recommend_movie_tfidf_df, user_profile_df)
    top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
    top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()
    
    # 유저 u에 대한 평가 (hit 여부)
    _df_test = _df.loc[drop_indices.tolist(), :]
    test_movie_id = _df_test['movieId'].values[0]
    
    hit += test_movie_id in top_k_list
    
hit_ratio = hit / len(user_id_list)

In [None]:
print('hit_ratio: ', hit_ratio)

## 04. 평점 예측 문제
- 평점 예측 문제의 경우 추천 결과를 생성하지 않고 테스트 데이터의 평점을 직접 예측합니다.
- RMSE, MAE 같은 지표로 추천시스템의 성능을 평가합니다.

In [None]:
movie_sim_df.shape

In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=10)

In [None]:
print('train 데이터 개수:', len(train_df))
print('test  데이터 개수:', len(test_df))

In [None]:
test_user_list = test_df.userId.unique()
len(test_user_list)

### RMSE로 추천시스템 성능 평가
- 사용자가 과거에 봤던 아이템을 기반으로 유저 벡터가 만들어지는 원리는 동일합니다.
- 예측 평점을 구하는 방법은 다음과 같습니다.
    1. train/test 데이터로 평점 데이터를 분할합니다.
    2. 사용자 u에 대해서 train 데이터에 있는 영화들(M)과 test 데이터에 있는 예측하려는 영화(m) 사이의 유사도를 구합니다.
    3. m의 평점은 M과 m과의 유사도를 가중치로 사용한 M의 평점들으 가중 평균입니다.

#### 사용자 1명에 대해서 추천을 수행하고 성능 평가
- train/test 데이터는 ratings_df를 사용합니다.
- 아이템 프로파일인 tf-idf 벡터는 이미 위에서 생성

In [None]:
user_id = 10
user_train_df = train_df[train_df.userId == user_id]
user_train_df.index = user_train_df['movieId']
user_train_df = user_train_df.sort_index()
user_train_df

- 유저 10이 평가한 training data안의 115개의 영화를 vector로 펼쳐보면,

In [None]:
user_sim_df = movie_sim_df.loc[user_train_df['movieId']]
user_sim_df

- 사용자가 평가한 115개 영화에 대한 평점

In [None]:
user_rating_df = user_train_df[['rating']]
user_rating_df

#### 유사도를 가중치로 사용하여 예측 평점을 구합니다.

In [None]:
user_sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
predict_rating = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (user_sim_sum + 1)
prediction_df = pd.DataFrame(predict_rating, index=movie_sim_df.index).reset_index()
prediction_df = prediction_df.rename(columns={'level_0': 'movieId', 0: 'predict_rating'})
prediction_df

#### Test 데이터의 true rating과 비교

In [None]:
result_df = prediction_df.merge(test_df[test_df.userId == user_id], on = 'movieId')
result_df

### RMSE, MAE 성능 평가

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# rmse 구하기
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['predict_rating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true=result_df['rating'].values, y_pred=result_df['predict_rating'].values)

print("rmse", rmse)
print("mae", mae)

### 전체 유저에 대해서 평점 예측

In [None]:
result_df = pd.DataFrame()

for user_id in tqdm(user_id_list):
        
    user_train_df = train_df[train_df.userId == user_id]
    user_train_df.index = user_train_df['movieId']
    user_train_df = user_train_df.sort_index()
    user_sim_df = movie_sim_df.loc[user_train_df['movieId']]
    
    user_rating_df = user_train_df[['rating']]
    
    user_sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
    predict_rating = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (user_sim_sum + 1)
    prediction_df = pd.DataFrame(predict_rating, index=movie_sim_df.index).reset_index()
    prediction_df = prediction_df.rename(columns={'level_0': 'movieId', 0: 'predict_rating'})
    
    # test
    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

In [None]:
print(len(test_df))
print(len(result_df))

In [None]:
# rmse
mse = mean_squared_error(y_true = result_df['rating'].values, y_pred = result_df['predict_rating'].values)
rmse = np.sqrt(mse)

# mae
mae = mean_absolute_error(y_true = result_df['rating'].values, y_pred = result_df['predict_rating'].values)

print("mae", mae)
print("rmse", rmse)