<a href="https://colab.research.google.com/github/HwangHanJae/recommender_system/blob/main/inflearn_recsys/UBCF_VS_IBCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 사용자 기반 CF(UBCF)와 아이템 기반 CF(IBCF)

## 사용자 기반 CF

- 데이터가 풍부한 경우 정확한 추천
- 결과에 대한 위험성이 존재

## 아이템 기반 CF
- 계산이 빠름
- 데이터 업데이트가 결과에 미치는 영향이 적음

데이터 크기가 작고, 사용자에 대한 정보가 있는 경우 사용자 기반 CF가 적절하지만  
데이터의 크기가 크고, 충분한 정보가 없는 경우 아이템 기반 CF가 적절합니다.

# 데이터 읽기

무비렌즈의 유저의 정보(u.user) 읽기

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Base directory holding the MovieLens data files (Google Drive path under /content)
base = '/content/drive/MyDrive/RecoSys/Data'

# Full path of the user metadata file
u_user_path = os.path.join(base, 'u.user')

# Column names to assign; because names= is passed, the first line of the
# file is read as data rather than as a header
u_cols = ['user_id','age','sex','occupation','zip_code']

# Load the pipe-separated file and index the frame by user_id in one step
users = pd.read_csv(
    u_user_path,
    sep='|',
    names=u_cols,
    encoding='latin-1',
).set_index('user_id')

# Preview the first five rows
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


무비렌즈의 영화의 정보(u.item) 읽기

In [2]:
# Full path of the movie metadata file
u_item_path = os.path.join(base, 'u.item')

# Column names to assign: five descriptive fields followed by one flag
# column per genre
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Load the pipe-separated file and index the frame by movie_id in one step
movies = pd.read_csv(
    u_item_path,
    sep='|',
    names=i_cols,
    encoding='latin-1',
).set_index('movie_id')

# Preview the first five rows
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


무비렌즈의 평점 정보(u.data)읽기

In [3]:
# Full path of the ratings file
u_data_path = os.path.join(base, 'u.data')

# Column names to assign to the tab-separated fields
r_cols = ['user_id', 'movie_id','rating','timestamp']

# Load the ratings; the default integer row index is kept
ratings = pd.read_csv(
    u_data_path,
    names=r_cols,
    sep='\t',
    encoding='latin-1',
)

# Preview the first five rows
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# Build the train/test split.
# The whole ratings frame is the feature set; user_id doubles as the
# stratification label so every user's ratings are divided in roughly the
# same 75/25 proportion between train and test.
x = ratings.copy()
y = ratings['user_id']

# A fixed random_state makes the split, and every score computed from it,
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

# User x movie rating matrix built from the training rows only; user/movie
# pairs that have no training rating are NaN.
ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

# 평가지표


In [7]:
from sklearn.metrics import mean_squared_error
def RMSE(y_true, y_pred):
  """Return the root mean squared error between y_true and y_pred."""
  mse = mean_squared_error(y_true, y_pred)
  return np.sqrt(mse)

In [8]:
def score(model):
  """Return the RMSE of `model` on the held-out test ratings.

  `model` is called as model(user_id, movie_id) once for every row of the
  global x_test frame; its outputs are compared with the true ratings.
  """
  # One prediction per (user, movie) pair, in x_test row order
  predictions = [
      model(user, movie)
      for user, movie in zip(x_test['user_id'], x_test['movie_id'])
  ]

  return RMSE(np.array(x_test['rating']), np.array(predictions))

## 아이템 기반 CF 구현

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Movie x user view of the training rating matrix
rating_matrix_t = ratings_matrix.T

# Cosine similarity cannot take NaN, so missing ratings are replaced with 0;
# fillna returns a new frame and leaves rating_matrix_t untouched
matrix_dummy = rating_matrix_t.fillna(0)

# Pairwise cosine similarity between movie rows, labelled with movie_id on
# both axes
item_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix_t.index,
    columns=rating_matrix_t.index,
)

In [10]:
# Display the movie-by-movie similarity matrix
item_similarity

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1672,1673,1674,1675,1676,1677,1679,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.321559,0.207659,0.341282,0.214615,0.094610,0.455550,0.349909,0.351890,0.200432,...,0.0,0.055395,0.041547,0.0,0.000000,0.000000,0.041547,0.0,0.055395,0.055395
2,0.321559,1.000000,0.200109,0.405963,0.256370,0.058655,0.311750,0.257172,0.191364,0.081870,...,0.0,0.086639,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.086639,0.086639
3,0.207659,0.200109,1.000000,0.290287,0.173950,0.139895,0.302292,0.132430,0.193968,0.148786,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.109985
4,0.341282,0.405963,0.290287,1.000000,0.201705,0.070761,0.352049,0.362468,0.286003,0.196143,...,0.0,0.065079,0.000000,0.0,0.108465,0.108465,0.043386,0.0,0.065079,0.086772
5,0.214615,0.256370,0.173950,0.201705,1.000000,0.054165,0.240808,0.214348,0.188687,0.043389,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.114792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1676,0.000000,0.000000,0.000000,0.108465,0.000000,0.000000,0.000000,0.077267,0.068489,0.089522,...,0.0,0.000000,0.000000,0.0,1.000000,1.000000,0.000000,0.0,0.000000,0.000000
1677,0.041547,0.000000,0.000000,0.043386,0.000000,0.000000,0.058272,0.000000,0.085611,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.0,0.000000,0.000000
1681,0.055395,0.086639,0.000000,0.065079,0.000000,0.000000,0.058272,0.096583,0.000000,0.000000,...,0.0,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.000000,0.000000


movie_id(item) 간의 유사도 행렬이 생성된 것을 볼 수 있습니다.

In [13]:
def IBCF(user_id, movie_id):
  """Predict the rating of `movie_id` by `user_id` with item-based CF.

  The prediction is the average of the ratings the user has already given,
  weighted by the similarity between each rated movie and `movie_id`.

  Args:
    user_id: user label, looked up in the columns of rating_matrix_t.
    movie_id: movie label, looked up in the columns of item_similarity.

  Returns:
    The similarity-weighted mean rating, or 3.0 when no prediction can be
    made: the movie or the user is missing from the training matrices, or
    the similarities to the user's rated movies sum to zero.
  """
  default_rating = 3.0

  # Movie or user missing from the training data: nothing to base a
  # prediction on.
  if movie_id not in item_similarity.columns:
    return default_rating
  if user_id not in rating_matrix_t.columns:
    return default_rating

  # Ratings this user actually gave (unrated movies dropped)
  user_rating = rating_matrix_t[user_id].dropna()
  # Similarity of the target movie to exactly those rated movies
  sim_scores = item_similarity[movie_id].loc[user_rating.index]

  # A zero total weight would make the weighted mean 0/0 = NaN, so fall back
  # to the default instead.
  sim_total = sim_scores.sum()
  if sim_total == 0:
    return default_rating

  # Weighted average
  return np.dot(sim_scores, user_rating) / sim_total

In [14]:
# Evaluate the item-based CF model: RMSE of its predictions on x_test
score(IBCF)

1.0145484705850583