# 1. 데이터 읽기

In [1]:
import os
import wget
import zipfile
import pandas as pd
import numpy as np

In [2]:
# Make sure the extracted MovieLens 100k folder ('ml-100k') is available in the
# current working directory. The archive is downloaded only when neither the
# folder nor the zip is present, and it is (re)extracted whenever the folder is
# missing -- e.g. after the folder was deleted but the zip was kept.
if not os.path.isdir('ml-100k'):
    if not os.path.exists('ml-100k.zip'):
        wget.download('http://files.grouplens.org/datasets/movielens/ml-100k.zip')
    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile('ml-100k.zip', 'r') as archive:
        archive.extractall('')

In [3]:
## User information
# u.user is pipe-separated and has no header row, so the column names are supplied here.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = (
    pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
    .set_index('user_id')
)
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [4]:
## Movie information
# u.item is pipe-separated with no header row: five descriptive fields
# followed by one column per genre name listed below.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = (
    pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
    .set_index('movie_id')
)
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
## Movie rating information
# u.data is tab-separated with no header row; the frame is indexed by user_id,
# so a single user_id label selects all rows for that user.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = (
    pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)
    .set_index('user_id')
)
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


# 2. 인기제품 방식

(단순히 영화별 평점을 평균해서 평균값이 높은 것부터 순서대로 추천)

In [6]:
# Mean rating of every movie over all ratings, indexed by movie_id.
movie_mean = ratings.groupby('movie_id')['rating'].mean()


def recom_movie(n_items):
    """Return the titles of the `n_items` movies with the highest mean rating.

    The result is a Series of titles indexed by movie_id, ordered from the
    highest mean rating downwards.
    """
    top_ids = movie_mean.sort_values(ascending=False).index[:n_items]
    return movies.loc[top_ids, 'title']

In [7]:
# Display the ten titles ranked highest by mean rating.
recom_movie(10)

movie_id
1293                                      Star Kid (1997)
1467                 Saint of Fort Washington, The (1993)
1653    Entertaining Angels: The Dorothy Day Story (1996)
814                         Great Day in Harlem, A (1994)
1122                       They Made Me a Criminal (1939)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1189                                   Prefontaine (1997)
1500                            Santa with Muscles (1996)
1536                                 Aiqing wansui (1994)
Name: title, dtype: object

# 3. 추천 시스템의 정확도 측정

```
y_true = ratings.loc[user,'rating']   

    해당 유저가 매긴 모든 영화의 평점

    각 사용자가 평가한 모든 영화의 평점을 y_true에 저장한다.

-------------------------
y_pred = movie_mean[ratings.loc[user,'movie_id']]   

    해당 유저가 평가한 각 영화의 전체 평균 평점 (영화별로 모든 사용자의 평점을 평균한 값)

    해당 사용자가 평가한 각 영화의 평균 평점을 y_pred에 저장한다. 현재 사용하는 추천 알고리즘이 best_seller 방식이므로 영화별 평균 평점이
    해당 영화의 예측값이라고 할 수 있다.
```

In [8]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally shaped sequences.

    Both arguments are converted to NumPy arrays first, so values are compared
    position by position (any pandas index on the inputs is ignored).
    """
    residual = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

# Collect one RMSE per user: the user's own ratings versus the overall mean
# rating of each movie that user rated (the popularity-based prediction).
rmse = []
for user in set(ratings.index):
    actual = ratings.loc[user, 'rating']        # every rating this user gave
    rated_ids = ratings.loc[user, 'movie_id']   # the movies those ratings belong to
    predicted = movie_mean[rated_ids]           # mean rating of each of those movies
    rmse.append(RMSE(actual, predicted))
print(f'평균적으로 평점이 {np.mean(rmse):.3f}점 차이가 난다')

평균적으로 평점이 0.996점 차이가 난다


# 4. 사용자 집단별 추천

In [9]:
# Re-read the raw files so that user_id / movie_id are ordinary columns here
# (no set_index is applied), keeping only the fields used from this point on.
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)

movies = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
movies = movies.loc[:, ['movie_id', 'title']]

ratings = (
    pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)
    .drop(columns='timestamp')
)

In [10]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore every score computed from it,
# is the same on each Restart & Run All.
SPLIT_SEED = 42

x = ratings.copy()
# The user ids are used only as the stratification label, so each user's
# ratings are divided between train and test in the same 75/25 proportion.
y = ratings.user_id

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=SPLIT_SEED
)

In [11]:
def score(model):
    """Return the RMSE of `model` over the rows of `x_test`.

    `model` is called as ``model(user_id, movie_id)`` once per test row and
    must return the predicted rating for that pair.
    """
    predictions = np.array([
        model(uid, mid)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ])
    actuals = np.array(x_test['rating'])
    return RMSE(predictions, actuals)

## User x movie relation matrix built from the training split:
## one row per user_id, one column per movie_id, each cell holding that rating.
## (user, movie) pairs that do not occur in x_train have no value (shown as NaN).
rating_matrix = x_train.pivot(index='user_id',columns='movie_id',values='rating')

In [12]:
# Display the user x movie matrix (the rendered table is truncated by pandas).
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1669,1671,1672,1673,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,3.0,5.0,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [13]:
def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training data.

    Parameters
    ----------
    user_id : unused; accepted so the function matches the
        ``model(user_id, movie_id)`` call made by ``score``.
    movie_id : label looked up in the module-level ``train_mean`` Series.

    Returns
    -------
    ``train_mean[movie_id]``, or 3.0 when ``movie_id`` is not present in
    ``train_mean`` (the movie has no rating in the training data).
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Only a missing movie falls back to the neutral default. Any other
        # failure (for example `train_mean` not being defined yet) now raises
        # instead of silently turning every prediction into 3.0.
        rating = 3.0
    return rating

# Per-movie mean rating computed from the training split only;
# best_seller reads its predictions from this Series.
train_mean = x_train.groupby('movie_id')['rating'].mean()

# RMSE of the best_seller model on the test split.
score(best_seller)

1.026710045111254

In [14]:
# Index `users` by user_id for the `users.loc[user_id, ...]` lookups made later.
# The guard keeps this cell re-runnable: once the index has been set, the
# 'user_id' column no longer exists and a second set_index would fail.
if users.index.name != 'user_id':
    users = users.set_index('user_id')

# Attach each rater's attributes to every training rating. The key is given
# explicitly, and user_id is restored as a column just for the merge.
merged_ratings = pd.merge(x_train, users.reset_index(), on='user_id')

# Mean training rating per (movie_id, sex) pair, as a Series with a two-level index.
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()

In [15]:
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean training rating among users of the same sex.

    Returns 3.0 when `movie_id` is not a column of `rating_matrix`, or when
    `g_mean` holds no entry for this movie and the user's sex.
    """
    # Movie not present in the training matrix: nothing to average.
    if movie_id not in rating_matrix:
        return 3.0

    gender = users.loc[user_id, 'sex']
    by_gender = g_mean[movie_id]  # mean ratings of this movie, indexed by sex

    # No training rating of this movie from users of the same sex.
    if gender not in by_gender:
        return 3.0

    return by_gender[gender]
# RMSE of the gender-based model over x_test.
score(cf_gender)

1.0347054718495552