In [5]:
import pandas as pd
import numpy as np

movie_data_set = './dataset/ml-10M100K/movies.dat'

m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_data_set, names=m_cols, sep='::', encoding='latin-1', engine='python')

movies['genre'] = movies.genre.apply(lambda x : x.split("|"))
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [6]:
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']

movie_tag_data_set = './dataset/ml-10M100K/tags.dat'
user_tagged_movies = pd.read_csv(movie_tag_data_set, names=t_cols, sep='::', engine='python')

user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

user_tagged_movies.head()


Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [7]:
print(f'태그 종류 = {len(user_tagged_movies.tag.unique())}')
print(f'태그 레코드 수 = {len(user_tagged_movies)}')
print(f'태그가 붙어 있는 영화 수 = {len(user_tagged_movies.movie_id.unique())}')

태그 종류 = 15241
태그 레코드 수 = 95580
태그가 붙어 있는 영화 수 = 7601


In [8]:
# tag 영화별 list 형식으로 저장
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag' : list})

# 태그 정보 겨합
movies = movies.merge(movie_tags, on='movie_id', how='left')

movies.head()

Unnamed: 0,movie_id,title,genre,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[for children, game, animals, joe johnston, ro..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[funniest movies, comedinha de velhinhos engra..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[girl movie]
4,5,Father of the Bride Part II (1995),[Comedy],"[steve martin, pregnancy, remake, steve martin..."


### 평갓값 데이터
movielens에서 가져온 데이터의 평가값 데이터 수는 1000만건에 이르기에 이를 이용하면 알고리즘에 따라 몇 시간 며칠이 걸리기도 합니다.

In [9]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

movie_rating = './dataset/ml-10M100K/ratings.dat'
ratings = pd.read_csv(movie_rating, names=r_cols, sep='::', engine='python')

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [10]:
# 1000 명으로 줄여서 테스트
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings["user_id"].isin(valid_user_ids)]

# 영화 + 평가 데이터
movielens = ratings.merge(movies, on = 'movie_id')
movielens.head()


Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
1,139,122,3.0,974302621,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
2,149,122,2.5,1112342322,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
3,182,122,3.0,943458784,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
4,215,122,4.5,1102493547,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."


In [11]:
movielens.groupby('user_id').agg({'movie_id' : len}).agg({'movie_id' : ['min', 'max', 'mean', len]})

Unnamed: 0,movie_id
min,20.0
max,1668.0
mean,132.83
len,1000.0


In [12]:
movielens.groupby('movie_id').agg({'user_id' : len}).agg({'user_id' : ['min', 'max', 'mean', len]})


Unnamed: 0,user_id
min,1.0
max,496.0
mean,19.719418
len,6736.0


In [13]:
print(f'평가값 수 = {len(movielens)}')

movielens.groupby('rating').agg({'movie_id' : len})

# rating_result = movielens.groupby('rating').agg({'movie_id' : len})

# todo : hitogram

평가값 수 = 132830


Unnamed: 0_level_0,movie_id
rating,Unnamed: 1_level_1
0.5,851
1.0,4847
1.5,1247
2.0,10292
2.5,3729
3.0,31706
3.5,9661
4.0,39917
4.5,6949
5.0,23631


### 평가 방법
추천 알고리즘의 성능을 측정하는 방법

사용할 데이터를 다음 두 가지로 나눕니다.
- 학습용 데이터
- 평가 테스트용 데이터
  - user가 가장 최근에 평가한 5개 영화의 평갓값을 테스트로 준비합니다.  

In [14]:
movielens['timestamp_rank'] = movielens.groupby('user_id')['timestamp'].rank(ascending = False, method = 'first')

movielens_train = movielens[movielens['timestamp_rank'] > 5] # for training
movielens_test = movielens[movielens['timestamp_rank'] <= 5] # for test

학습용 데이터를 통해서 영화의 평갓값을 얼마나 정확하게 예측할 수 있는지 성능을 파악합니다. 평가하는 지표는 다음과 같습니다.
- 예측값
- 실제 평갓값의 RMSE
 - 예측과 실제 평갓값이 일치할 수록 0에 수렴

In [15]:
from typing import List, Dict
from sklearn.metrics import mean_squared_error

def calc_rmse(self, tru_rating: List[float], pred_rating: List[float]) -> float:
    return np.sqrt(mean_squared_error(true_rating, pred_rating))

### 추천 알고리즘 평가 순위 지표

- Precision@K : 유저에게 K개의 아이템 추천 시 실제 선호하는 아이템의 비율이 얼마나 되는가에 대한 지표
- Recall@K : 유저에게 K개의 아이템 추천 시 선호하는 아이템 그룹 중 몇 개 맞았는지 확인하는지를 나타내는 비율

In [16]:
def cal_recall_at_k(
    true_user2items: Dict[int, List[int]],
    pred_user2items: Dict[int, List[int]],
    k : int
) -> float:
    scores = []

    for user_id in true_user2items.keys():
        r_at_k = _recall_at_k(true_user2items[user_id], pred_user2items[user_id], k)
        
        scores.append(r_at_k)
    return np.mean(scores)


def _recall_at_k(self, true_items: List[int], pred_items: List[int], k : int) -> float:
    if len(true_items) == 0 or k == 0:
        return 0.0
    
    r_at_k = (len(set(true_items) & set(pred_items[:k]))) / len(true_items)

    return r_at_k


def cal_precision_at_k(
    true_user2items: Dict[int, List[int]],
    pred_user2items: Dict[int, List[int]],
    k : int
) -> float:
    scores = []

    for user_id in true_user2items.keys():
        p_at_k = _precision_at_k(true_user2items[user_id], pred_user2items[user_id], k)

        scores.append(p_at_k)
    
    return np.mean(scores)



def _precision_at_k(
    true_user2items: Dict[int, List[int]],
    pred_user2items: Dict[int, List[int]],
    k : int
) -> float:
    if k == 0:
        return 0.0
    p_at_k = (len(set(true_items) & set(pred_items[:k]))) / k
    return p_at_k

## 계산



## 통계 정보 또는 특정 규칙에 기반한 추천

Movielens 데이터 사용 

### 영화에 부여된 평갓값 순 나열

In [18]:
movie_stats = movielens_train.groupby(['movie_id', 'title']).agg({'rating': [np.size, np.mean]})

movie_stats.sort_values(by=('rating', 'mean'), ascending = False).head()

  movie_stats = movielens_train.groupby(['movie_id', 'title']).agg({'rating': [np.size, np.mean]})


Unnamed: 0_level_0,Unnamed: 1_level_0,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
movie_id,title,Unnamed: 2_level_2,Unnamed: 3_level_2
4095,Cry Freedom (1987),1,5.0
7227,"Trouble with Angels, The (1966)",1,5.0
27255,"Wind Will Carry Us, The (Bad ma ra khahad bord) (1999)",1,5.0
4453,Michael Jordan to the Max (2000),2,5.0
3415,"Mirror, The (Zerkalo) (1975)",1,5.0


해당 경우는 평가 수가 적어 5점의 평가만이 상위입니다.
평가 신뢰성이 낮기에 임계값을 도입해 일정 이상의 평가 수가 있는 영화로 필터링합니다.

In [20]:
movie_stats = movielens_train.groupby(['movie_id', 'title']).agg({'rating' : [np.size, np.mean]})
atleast_flg = movie_stats['rating']['size'] >= 100

movies_sorted_by_rating = movie_stats[atleast_flg].sort_values(by=('rating', 'mean'), ascending=False)

movies_sorted_by_rating.head()

  movie_stats = movielens_train.groupby(['movie_id', 'title']).agg({'rating' : [np.size, np.mean]})


Unnamed: 0_level_0,Unnamed: 1_level_0,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
movie_id,title,Unnamed: 2_level_2,Unnamed: 3_level_2
318,"Shawshank Redemption, The (1994)",424,4.491745
50,"Usual Suspects, The (1995)",334,4.459581
912,Casablanca (1942),163,4.444785
904,Rear Window (1954),129,4.44186
2019,Seven Samurai (Shichinin no samurai) (1954),104,4.408654
