# CF 예제 실습

In [2]:
import pandas as pd
import numpy as np
import hashlib
from tqdm import tqdm
from datasets import load_dataset
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [3]:
# Build the toy rating table: one column per movie, one row per user.
# None marks a movie the user has not rated; pandas stores it as NaN,
# which makes every column float64.
data = {
    "영화 1": [4, 5, None, 2, 4],
    "영화 2": [None, 4, 3, 3, 4],
    "영화 3": [5, None, 4, 5, 4],
    "영화 4": [3, 3, 5, None, 4],
    "영화 5": [4, 3, 4, 2, None],
}

# Create the frame first, then attach the user labels as the row index.
df = pd.DataFrame(data)
df.index = ["사용자 1", "사용자 2", "사용자 3", "사용자 4", "사용자 5"]

In [5]:
# Display the rating table; NaN cells are the movies a user has not rated.
df

Unnamed: 0,영화 1,영화 2,영화 3,영화 4,영화 5
사용자 1,4.0,,5.0,3.0,4.0
사용자 2,5.0,4.0,,3.0,3.0
사용자 3,,3.0,4.0,5.0,4.0
사용자 4,2.0,3.0,5.0,,2.0
사용자 5,4.0,4.0,4.0,4.0,


In [7]:
# Build the user-user similarity table.
# Unrated movies are treated as a rating of 0 so each user becomes a dense
# vector; fillna already returns a new frame, so `df` itself is untouched.
user_df = df.fillna(0)

# With a single argument, cosine_similarity compares every row of user_df
# against every other row of user_df.
user_similarity = cosine_similarity(user_df)

# Label both axes with the user ids so similarities can be looked up by name.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_df.index,
    columns=user_df.index,
)
user_similarity_df

Unnamed: 0,사용자 1,사용자 2,사용자 3,사용자 4,사용자 5
사용자 1,1.0,0.657031,0.772727,0.778731,0.738549
사용자 2,0.657031,1.0,0.624981,0.56248,0.781133
사용자 3,0.772727,0.624981,1.0,0.702757,0.738549
사용자 4,0.778731,0.56248,0.702757,1.0,0.771517
사용자 5,0.738549,0.781133,0.738549,0.771517,1.0


In [8]:
# Build the item-item similarity table.
# Transpose so that each row is a movie described by the ratings it received,
# and treat missing ratings as 0; `df` itself is left unchanged.
item_df = df.T.fillna(0)

# Pairwise cosine similarity between all movie rows.
item_similarity = cosine_similarity(item_df)

# Label both axes with the movie titles.
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_df.index,
    columns=item_df.index,
)
item_similarity_df

Unnamed: 0,영화 1,영화 2,영화 3,영화 4,영화 5
영화 1,1.0,0.7605,0.650408,0.716766,0.668031
영화 2,0.7605,1.0,0.671547,0.791694,0.632456
영화 3,0.650408,0.671547,1.0,0.733225,0.757259
영화 4,0.716766,0.791694,0.733225,1.0,0.795704
영화 5,0.668031,0.632456,0.757259,0.795704,1.0


In [9]:
# User-based CF: fill every missing rating with the similarity-weighted
# average of the ratings that other users gave to the same movie.
full_df = df.copy()
for user_id in df.index:
    for movie_id in df.columns:
        # Only cells without an observed rating need a prediction.
        if not pd.isna(df.loc[user_id, movie_id]):
            continue

        # Ratings actually observed for this movie. Reading from the original
        # `df` (not `full_df`) keeps earlier predictions from being reused as
        # if they were real ratings when a movie has several missing cells.
        movie_ratings = df[movie_id].dropna()

        # Similarity between the target user and each user who rated the
        # movie, in the same row order as `movie_ratings`.
        similarities = user_similarity_df.loc[movie_ratings.index, user_id]

        similarity_sum = similarities.sum()
        if similarity_sum == 0:
            # Nobody rated the movie, or all neighbours have zero similarity:
            # leave the cell as NaN rather than dividing by zero.
            continue

        mean_rating = np.dot(similarities, movie_ratings) / similarity_sum

        # Write with a single .loc[row, col] call. Chained indexing
        # (full_df[col][row] = ...) may assign into a temporary copy and
        # leave full_df unchanged.
        full_df.loc[user_id, movie_id] = mean_rating

full_df
        
        

Unnamed: 0,영화 1,영화 2,영화 3,영화 4,영화 5
사용자 1,4.0,3.473553,5.0,3.0,4.0
사용자 2,5.0,4.0,4.464465,3.0,3.0
사용자 3,3.725069,3.0,4.0,5.0,4.0
사용자 4,2.0,3.0,5.0,3.773235,2.0
사용자 5,4.0,4.0,4.0,4.0,3.232884


In [10]:
# Item-based recommendation: take the first watched movie as the seed, drop
# every movie already watched, and show the 3 most similar remaining movies.
watched_movies = ["영화 4", "영화 3"]
(
    item_similarity_df
    .loc[~item_similarity_df.index.isin(watched_movies), watched_movies[0]]
    .sort_values(ascending=False)
    .head(3)
)

영화 5    0.795704
영화 2    0.791694
영화 1    0.716766
Name: 영화 4, dtype: float64