In [None]:
# Given a gender (F or M) and an age, return a single movieID
# Refactored so that the SVD model is not rebuilt multiple times

In [24]:
import numpy as np
import pandas as pd

import import_ipynb
from cate3_cal_expected_rating import get_unseen_movies, recomm_best_movie_by_svd
from surprise import Reader, Dataset, SVD, accuracy

In [2]:
def createUserMatrix(filename):
    """Read a '::'-delimited users file into a DataFrame.

    Only the first three fields of every record are kept; they are labelled
    ``userID``, ``gender`` and ``age``. The file is read without a header row.

    Parameters
    ----------
    filename : path or buffer accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per record, columns ['userID', 'gender', 'age'].
    """
    column_names = ['userID', 'gender', 'age']
    # The python engine is requested explicitly because the separator is
    # longer than one character.
    users = pd.read_csv(
        filename,
        sep="::",
        engine='python',
        usecols=[0, 1, 2],
        names=column_names,
    )
    return users

In [3]:
def createRatingMatrix(filename):
    """Read a '::'-delimited ratings file into a DataFrame.

    Only the first three fields of every record are kept; they are labelled
    ``userID``, ``movieID`` and ``rating``. The file is read without a header row.

    Parameters
    ----------
    filename : path or buffer accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per record, columns ['userID', 'movieID', 'rating'].
    """
    column_names = ['userID', 'movieID', 'rating']
    # The python engine is requested explicitly because the separator is
    # longer than one character.
    ratings = pd.read_csv(
        filename,
        sep="::",
        engine='python',
        usecols=[0, 1, 2],
        names=column_names,
    )
    return ratings

In [4]:
def createMovieMatrix(filename):
    """Read a '::'-delimited movies file into a DataFrame.

    The first three fields of every record are kept; they are labelled
    ``movieID``, ``title`` and ``genres``. The file is read without a header row.

    Parameters
    ----------
    filename : path or buffer accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per record, columns ['movieID', 'title', 'genres'].
    """
    column_names = ['movieID', 'title', 'genres']
    # The python engine is requested explicitly because the separator is
    # longer than one character.
    movies = pd.read_csv(
        filename,
        sep="::",
        engine='python',
        usecols=[0, 1, 2],
        names=column_names,
    )
    return movies

In [5]:
# Load the users, ratings and movies tables from the ../ml-1m directory
# (paths are relative to the notebook's working directory; presumably this is
# the MovieLens 1M dataset - confirm against the data folder).
user_info = createUserMatrix('../ml-1m/users.dat')
user_ratings = createRatingMatrix('../ml-1m/ratings.dat')
movie_info = createMovieMatrix('../ml-1m/movies.dat')

In [6]:
# Number of users in every (age, gender) group
user_info.groupby(['age','gender']).count()
# The target group is the users whose age value is 1 (ages 1-18)

Unnamed: 0_level_0,Unnamed: 1_level_0,userID
age,gender,Unnamed: 2_level_1
1,F,78
1,M,144
18,F,298
18,M,805
25,F,558
25,M,1538
35,F,338
35,M,855
45,F,189
45,M,361


In [7]:
# Preview the first rows of the ratings table (userID, movieID, rating)
user_ratings.head()

Unnamed: 0,userID,movieID,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [8]:
# Preview the first rows of the movies table (movieID, title, genres)
movie_info.head()

Unnamed: 0,movieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Use only the data of children aged 1-18, i.e. the rows whose age value is 1
user_under_18 = user_info.loc[user_info['age'].eq(1)]

In [10]:
# Attach every rating to the under-18 user who gave it (join on userID);
# ratings from users outside that group are dropped by the default inner join.
target_matrix = user_under_18.merge(user_ratings, on='userID')

In [11]:
# Ratings given by the girls (gender == 'F')
f_target_matrix = target_matrix.loc[target_matrix['gender'].eq('F')]
# Ratings given by the boys (gender == 'M')
m_target_matrix = target_matrix.loc[target_matrix['gender'].eq('M')]

In [12]:
# Preview the merged user/rating rows for the female target group
f_target_matrix.head()

Unnamed: 0,userID,gender,age,movieID,rating
0,1,F,1,1193,5
1,1,F,1,661,3
2,1,F,1,914,3
3,1,F,1,3408,4
4,1,F,1,2355,5


## Movies with between 10 and 100 ratings

In [13]:
# Mean of every rating in f_target_matrix; used as the default C of weighted_rating
total_mean = f_target_matrix['rating'].mean()
# Smallest rating count a movie may have to enter movie_list;
# also the default m of weighted_rating
min_rating_num = 10
# Largest rating count a movie may have to enter movie_list
max_rating_num = 100

In [14]:
# How many ratings each movieID received, most-rated movies first
rating_num_by_movieID = (
    f_target_matrix
    .groupby('movieID')['movieID']
    .count()
    .sort_values(ascending=False)
)

In [15]:
# movieIDs whose rating count lies in [min_rating_num, max_rating_num], both ends
# included. One mask built on the full Series is used, so the filter no longer
# depends on pandas re-aligning a boolean Series whose index is larger than the
# already-filtered Series it is applied to.
movie_list = rating_num_by_movieID[
    rating_num_by_movieID.between(min_rating_num, max_rating_num)
].index

In [16]:
# Assumes the female case.
# For every movie in movie_list: number of ratings and mean rating, highest mean
# first. The Korean column labels mean "rating count" and "rating mean".
rating_per_movie = (
    f_target_matrix[f_target_matrix['movieID'].isin(movie_list)]
    .groupby('movieID')['rating']
    .agg(['count', 'mean'])
    .sort_values(by='mean', ascending=False)
    .rename(columns={'count': '평점 개수', 'mean': '평점 평균'})
)
rating_per_movie.head()

Unnamed: 0_level_0,평점 개수,평점 평균
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1
2431,10,4.8
916,11,4.636364
150,11,4.636364
318,13,4.538462
527,15,4.533333


In [17]:
def weighted_rating(x, m=min_rating_num, C=total_mean):
    """Blend a movie's own mean rating with an overall mean (IMDB formula).

    Parameters
    ----------
    x : row or frame indexable by '평점 개수' (rating count) and
        '평점 평균' (mean rating).
    m : weight given to the overall mean. The default is the value
        ``min_rating_num`` had when this ``def`` was executed; later changes
        to that variable are not picked up unless the cell is re-run.
    C : overall mean to blend with. The default is ``total_mean`` as it was
        when this ``def`` was executed.

    Returns
    -------
    ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is the rating count and ``R`` the
    mean rating read from ``x``.
    """
    votes = x['평점 개수']
    mean_rating = x['평점 평균']
    # Share of the score taken from the movie's own mean; grows with the vote count
    own_weight = votes / (votes + m)
    # Remaining share, taken from the overall mean
    prior_weight = m / (m + votes)
    return (own_weight * mean_rating) + (prior_weight * C)

In [18]:
# weighted_rating only reads two columns and combines them with element-wise
# arithmetic, so it is evaluated once on the whole frame instead of being
# applied row by row; the resulting scores are the same.
rating_per_movie['score'] = weighted_rating(rating_per_movie)
rating_per_movie.head()

Unnamed: 0_level_0,평점 개수,평점 평균,score
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2431,10,4.8,4.208145
916,11,4.636364,4.150615
150,11,4.636364,4.150615
318,13,4.538462,4.137518
527,15,4.533333,4.166516


In [19]:
# Rank the movies by weighted score (best first) and turn the movieID index back
# into an ordinary column so it can be used as a merge key.
# NOTE: re-running this cell resets the index a second time and adds an 'index' column.
rating_per_movie = (
    rating_per_movie
    .sort_values('score', ascending=False)
    .reset_index(level=0)
)
# The five best-scoring movies
top5_result = rating_per_movie.head(5)

In [20]:
# Add title and genres to the five best-scoring movies (join on movieID)
result = top5_result.merge(movie_info, on='movieID')

In [21]:
# Call the helper imported from cate3_cal_expected_rating with the full ratings
# table, the top-5 table and the literal 1.
# NOTE(review): the 1 is presumably the target userID and the helper presumably
# returns the movies in `result` that this user has not rated yet - confirm
# against get_unseen_movies.
unseen_movies = get_unseen_movies(user_ratings, result, 1)

In [22]:
reader = Reader()
# Load the DataFrame as a Surprise dataset with load_from_df.
# The DataFrame passed in must hold the user id, item id and rating columns.
data = Dataset.load_from_df(user_ratings[['userID','movieID','rating']],reader=reader)

# Fixed seed: SVD starts from randomly initialised factors, so without it every
# Restart & Run All may train a different model and recommend a different movie.
algo = SVD(n_factors=20, random_state=42)
# Train on the complete data set (no train/test split)
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc2857f2af0>

In [23]:
# Ask the helper imported from cate3_cal_expected_rating for a recommendation,
# passing the trained SVD model, the literal 1 and the unseen-movie candidates.
# NOTE(review): the 1 is presumably the target userID and the displayed value
# presumably the movieID with the highest predicted rating - confirm against
# recomm_best_movie_by_svd.
recomm_best_movie_by_svd(algo, 1, unseen_movies)

916