# 아이템 기반 CF

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Directory that holds the raw data files (u.user, u.item, u.data).
base_src = '../data/drive-download-20240102T142504Z-001'

# User table: pipe-separated, no header row, so column names are supplied
# explicitly. The frame is indexed by user_id.
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    u_user_src, sep='|', names=u_cols, encoding='latin-1'
).set_index('user_id')

# Movie table: pipe-separated, no header row. The first five columns are
# metadata; the remaining ones are per-genre flag columns.
u_item_src = os.path.join(base_src, 'u.item')
# NOTE: the genre labels previously contained stray whitespace
# ('Animat ion', 'Documentary ', 'Film- Noir', 'Romance '), which made
# lookups such as movies['Animation'] fail with KeyError. They are now
# spelled without the stray spaces.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date',
    'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
    "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    u_item_src,
    sep='|',
    names=i_cols,
    encoding='latin-1'
)
movies = movies.set_index('movie_id')

# Ratings table: tab-separated, no header row; one row per
# (user_id, movie_id, rating, timestamp) record.
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')


def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equal-length sequences.

    Args:
        y_true: Observed values (any array-like).
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        The square root of the mean squared difference.
    """
    errors = np.asarray(y_true) - np.asarray(y_pred)
    return np.sqrt(np.square(errors).mean())


def score(model):
    """Return the RMSE of ``model`` over the module-level ``x_test`` frame.

    Args:
        model: Callable taking ``(user_id, movie_id)`` and returning a
            predicted rating.

    Returns:
        RMSE between the ratings stored in ``x_test`` and the predictions
        produced for each (user_id, movie_id) pair in ``x_test``.
    """
    # One prediction per test row, in row order.
    predictions = np.array([
        model(user, movie)
        for user, movie in zip(x_test['user_id'], x_test['movie_id'])
    ])
    # Ground-truth ratings for the same rows.
    actual = np.array(x_test['rating'])
    return RMSE(actual, predictions)

# Build the train/test split. The split is stratified on user_id, using the
# user_id column itself as the "label" vector.
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.25
)
# User x movie matrix of training ratings; unrated cells are NaN.
rating_matrix = x_train.pivot(values='rating', index='user_id', columns='movie_id')
 

In [3]:
# Movie x user view of the training ratings (rows are movies).
rating_matrix_t = rating_matrix.T

# Unrated cells are NaN; replace them with 0 before computing similarities.
matrix_dummy = rating_matrix_t.fillna(0)

# Pairwise cosine similarity between movie rows, labelled by movie_id on
# both axes.
item_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix_t.index,
    columns=rating_matrix_t.index,
)

In [4]:
def CF_IBCF(user_id, movie_id, default_rating=3.0):
    """Predict a rating with item-based collaborative filtering.

    The prediction is the similarity-weighted average of the ratings the
    user gave to other movies, weighted by each movie's similarity to
    ``movie_id``. Reads the module-level ``item_similarity`` (movie x movie)
    and ``rating_matrix_t`` (movie x user) frames.

    Args:
        user_id: Column label of the user in ``rating_matrix_t``.
        movie_id: Label of the target movie in ``item_similarity``.
        default_rating: Value returned when no prediction can be computed.

    Returns:
        The weighted-average rating, or ``default_rating`` when the movie or
        the user is absent from the training matrices, the user has no
        training ratings, or the similarities sum to zero.
    """
    # Cold start: the movie or the user never appeared in the training data.
    if (movie_id not in item_similarity.columns
            or user_id not in rating_matrix_t.columns):
        return default_rating

    # Keep only the movies this user actually rated.
    user_rating = rating_matrix_t[user_id].dropna()
    # Similarity of the target movie to each of those rated movies.
    sim_scores = item_similarity.loc[user_rating.index, movie_id]

    # A zero total (including the empty case) would give 0/0 = NaN, which
    # then poisons any aggregate metric such as RMSE.
    sim_total = sim_scores.sum()
    if sim_total == 0:
        return default_rating

    return np.dot(sim_scores, user_rating) / sim_total


In [5]:
# Evaluate the item-based CF model: RMSE over the held-out test pairs.
score(CF_IBCF)

1.0091608086502504