## Ref
- https://velog.io/@ie8907/%EC%B6%94%EC%B2%9C-%EC%8B%9C%EC%8A%A4%ED%85%9C-Recommender-System

In [15]:
from collections import defaultdict
from tqdm import tqdm

In [1]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv('./ml-32m/movies.csv')
ratings_df = pd.read_csv('./ml-32m/ratings.csv')

In [2]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [4]:
movies_df.shape, ratings_df.shape

((87585, 3), (32000204, 4))

In [6]:
ratings_df['userId'].nunique()

200948

In [118]:
ratings_dict = {}

for idx, row in tqdm(ratings_df.iterrows()):
    
    user_id = str(int(row['userId']))
    movie_id = str(int(row['movieId']))
    rating = row['rating']
    
    if user_id not in ratings_dict:
        
        ratings_dict[user_id] = {}
        
    ratings_dict[user_id][movie_id] = rating

32000204it [07:04, 75462.41it/s]


In [145]:
from sklearn.metrics.pairwise import cosine_similarity

def user_based_collaborative_filtering(ratings_dict, target_user, k=5):
    # 1. 특정 사용자와 다른 사용자 간의 유사도 계산
    similarities = {}
    target_user_ratings = np.array(list(ratings_dict[target_user].values())).reshape(1, -1)

    for user in tqdm(ratings_dict):
        if user == target_user:
            continue
        
        targets, others = [], []
        if len(set(ratings_dict[user].values())) >= len(set(ratings_dict[target_user].values())):
            for key in ratings_dict[target_user].keys():
                if key in ratings_dict[user]:
                    targets.append(ratings_dict[target_user][key])
                    others.append(ratings_dict[user][key])
        else:
            for key in ratings_dict[user].keys():
                if key in ratings_dict[target_user]:
                    targets.append(ratings_dict[target_user][key])
                    others.append(ratings_dict[user][key])
        
        if len(others) <= 0:
            continue
        
        other_user_ratings = np.array(others).reshape(1, -1)
        target_user_ratings = np.array(targets).reshape(1, -1)
        similarity = cosine_similarity(target_user_ratings, other_user_ratings)[0, 0]
        similarities[user] = similarity
        
        # 기존 코드
        # other_user_ratings = np.array(list(ratings_dict[user].values())).reshape(1, -1)
        # similarity = cosine_similarity(target_user_ratings, other_user_ratings)[0, 0]
        # similarities[user] = similarity

    # 유사도를 기준으로 내림차순 정렬
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    # 상위 k명의 이웃 선택
    top_k_neighbors = sorted_similarities[:k]

    # 평가하지 않은 영화 추출
    not_rated_movies = set(movies_df['movieId'].astype('str')) - set(ratings_dict[target_user].keys())

    # 2. 추천 영화 평점 예측
    movie_recommendations = {}
    for movie_id in not_rated_movies:
        weighted_sum = 0
        similarity_sum = 0

        for neighbor, similarity in top_k_neighbors:
            if movie_id in ratings_dict[neighbor]:
                weighted_sum += ratings_dict[neighbor][movie_id] * similarity
                similarity_sum += similarity
        
        if similarity_sum > 0:
            predicted_rating = weighted_sum / similarity_sum
            movie_recommendations[movie_id] = predicted_rating

    # 예측 평점을 기준으로 상위 영화 선택
    top_movies = sorted(movie_recommendations.items(), key=lambda x: x[1], reverse=True)[:10]

    return top_movies, similarities

In [147]:
# 예시
target_user_id = '2'
recommendations, similarities = user_based_collaborative_filtering(ratings_dict, target_user_id)

result = pd.DataFrame(recommendations, columns=['Movie ID', 'Predicted Rating'])
print(result)

100%|█████████████████████████████████| 200948/200948 [00:23<00:00, 8379.46it/s]


  Movie ID  Predicted Rating
0    33615               5.0
1     6889               5.0
2   133219               5.0
3    71264               5.0
4     2571               5.0
5    30707               5.0
6    62999               5.0
7     1566               5.0
8     2987               5.0
9      745               5.0
