# 공통 평가 아이템의 수가 일정값 이상인 사용자만 집단 사용자로 고려

In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Directory that holds the u.user / u.item / u.data files.
base_src = '../data/drive-download-20240102T142504Z-001'

# --- Users: '|'-separated, column names supplied explicitly, indexed by user_id.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
u_user_src = os.path.join(base_src, 'u.user')
users = pd.read_csv(
    u_user_src, sep='|', names=u_cols, encoding='latin-1'
).set_index('user_id')

# --- Movies: '|'-separated, column names supplied explicitly, indexed by movie_id.
# NOTE(review): several labels below contain embedded or trailing spaces
# ('Animat ion', 'Documentary ', 'Film- Noir', 'Romance '). They are kept
# verbatim because they are runtime column labels -- confirm whether intended.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animat ion', 'Children\'s',
    'Comedy', 'Crime', 'Documentary ', 'Drama', 'Fantasy',
    'Film- Noir', 'Horror', 'Musical', 'Mystery', 'Romance ',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
u_item_src = os.path.join(base_src, 'u.item')
movies = pd.read_csv(
    u_item_src, sep='|', names=i_cols, encoding='latin-1'
).set_index('movie_id')

# --- Ratings: tab-separated, column names supplied explicitly, default index kept.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
u_data_src = os.path.join(base_src, 'u.data')
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')


# Accuracy metric: root mean squared error (RMSE).
def RMSE(y_true, y_pred):
    """Return the root mean squared error between two equally sized sequences.

    Both arguments are converted to NumPy arrays before the element-wise
    difference is taken.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)


# Compute the RMSE of a given prediction model on the test set.
def score(model, neighbor_size=0):
    """Evaluate ``model`` on every (user_id, movie_id) pair of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id, neighbor_size)`` for each
    test row; the predictions are compared with the ``rating`` column of
    ``x_test`` using ``RMSE``.
    """
    # One prediction per test row, in the row order of x_test.
    predictions = np.array([
        model(uid, mid, neighbor_size)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ])
    # Ground-truth ratings in the same row order.
    actual = np.array(x_test['rating'])
    return RMSE(actual, predictions)


# Build the dataset: the full ratings table is the feature frame and the
# user ids are the labels used for stratification.
x = ratings.copy()
y = ratings['user_id']

# NOTE(review): no random_state is passed, so the split (and every number
# derived from it) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# user_id x movie_id table of the training ratings.
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

# Zero-filled version of the rating matrix, used only as input to the cosine
# similarity so that missing ratings do not produce NaN errors.
matrix_dummy = rating_matrix.fillna(0)

# Cosine similarity between all pairs of users, labelled with user ids on
# both axes so that values can be looked up by user id.
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

# Account for each user's rating tendency:
# per-user mean rating over the (non-filled) rating matrix ...
rating_mean = rating_matrix.mean(axis=1)
# ... and each rating's deviation from its user's mean.
rating_bias = rating_matrix.sub(rating_mean, axis=0)

In [14]:
# 1.0 where the rating matrix holds a value greater than 0, otherwise 0.0.
rating_binary_1 = (rating_matrix > 0).to_numpy().astype(float)
rating_binary_2 = rating_binary_1.T

# Number of commonly rated items for every pair of users: the product of the
# indicator matrix with its transpose, labelled with user ids on both axes.
counts = pd.DataFrame(
    rating_binary_1 @ rating_binary_2,
    index=rating_matrix.index,
    columns=rating_matrix.index,
).fillna(0)
counts

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,12.0,5.0,3.0,50.0,58.0,76.0,16.0,1.0,43.0,...,44.0,8.0,25.0,10.0,18.0,9.0,28.0,7.0,14.0,43.0
2,12.0,47.0,7.0,5.0,4.0,24.0,14.0,5.0,4.0,10.0,...,6.0,9.0,20.0,14.0,18.0,7.0,9.0,5.0,6.0,4.0
3,5.0,7.0,41.0,8.0,1.0,4.0,10.0,7.0,2.0,5.0,...,1.0,1.0,10.0,5.0,6.0,1.0,10.0,3.0,4.0,1.0
4,3.0,5.0,8.0,18.0,2.0,4.0,6.0,5.0,2.0,3.0,...,2.0,1.0,6.0,4.0,4.0,0.0,6.0,2.0,7.0,2.0
5,50.0,4.0,1.0,2.0,131.0,27.0,53.0,13.0,3.0,18.0,...,34.0,3.0,10.0,4.0,11.0,5.0,13.0,4.0,8.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,9.0,7.0,1.0,0.0,5.0,9.0,9.0,3.0,1.0,1.0,...,6.0,12.0,16.0,4.0,17.0,37.0,2.0,3.0,3.0,8.0
940,28.0,9.0,10.0,6.0,13.0,29.0,35.0,9.0,2.0,28.0,...,23.0,4.0,13.0,7.0,9.0,2.0,80.0,5.0,12.0,17.0
941,7.0,5.0,3.0,2.0,4.0,6.0,5.0,5.0,1.0,3.0,...,2.0,3.0,9.0,4.0,6.0,3.0,5.0,17.0,3.0,2.0
942,14.0,6.0,4.0,7.0,8.0,22.0,25.0,8.0,4.0,14.0,...,13.0,4.0,6.0,5.0,7.0,3.0,12.0,3.0,59.0,9.0


In [15]:
def CF_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    """Predict a rating with bias-corrected, significance-filtered user-based CF.

    A user is an eligible neighbor only if they have a value for ``movie_id``
    in ``rating_bias`` and share at least ``SIG_LEVEL`` commonly rated items
    with ``user_id`` according to ``counts``. The prediction is the
    similarity-weighted average of the eligible neighbors' rating deviations,
    added to ``rating_mean[user_id]``.

    Args:
        user_id: Label of the target user in ``user_similarity``, ``counts``
            and ``rating_mean``.
        movie_id: Column label looked up in ``rating_bias``.
        neighbor_size: Number of most-similar eligible neighbors to use.
            ``0`` uses every eligible neighbor.

    Returns:
        The predicted rating, limited to the range [1, 5]. The user's mean
        rating is returned when ``movie_id`` is not a column of
        ``rating_bias``, when ``neighbor_size`` is non-zero and no more than
        ``MIN_RATINGS`` eligible neighbors remain, or when the eligible
        neighbors' similarities sum to zero (including the case where no
        neighbor is eligible at all).
    """
    user_mean = rating_mean[user_id]
    prediction = user_mean  # fallback used whenever no weighted average exists

    if movie_id in rating_bias:
        movie_ratings = rating_bias[movie_id]

        # Keep users who rated the movie AND share enough items with user_id.
        eligible = movie_ratings.notnull() & (counts[user_id] >= SIG_LEVEL)
        sim_scores = user_similarity[user_id][eligible].to_numpy()
        movie_ratings = movie_ratings[eligible].to_numpy()

        use_neighbors = True
        if neighbor_size != 0:
            if len(sim_scores) > MIN_RATINGS:
                # Restrict to the `neighbor_size` most similar eligible users.
                top_k = min(neighbor_size, len(sim_scores))
                nearest = np.argsort(sim_scores)[-top_k:]
                sim_scores = sim_scores[nearest]
                movie_ratings = movie_ratings[nearest]
            else:
                use_neighbors = False

        if use_neighbors:
            weight_sum = sim_scores.sum()
            # With no eligible neighbor (or all-zero similarities) the weighted
            # average would be 0/0 = NaN, which slips through the range check
            # below and turns the overall RMSE into NaN. Keep the mean instead.
            if weight_sum != 0:
                prediction = np.dot(sim_scores, movie_ratings) / weight_sum + user_mean

    # Limit the prediction to the range [1, 5].
    if prediction <= 1:
        prediction = 1
    elif prediction > 5:
        prediction = 5
    return prediction

In [16]:
# Users sharing fewer than this many commonly rated items with the target
# user are excluded as neighbors in CF_knn_bias_sig.
SIG_LEVEL = 3
# CF_knn_bias_sig attempts a k-NN prediction only when more than this many
# eligible neighbors remain; otherwise it returns the user's mean rating.
MIN_RATINGS = 3
# Test-set RMSE of the significance-filtered model with 30 neighbors.
score(CF_knn_bias_sig, neighbor_size=30)

0.9468712478482603