In [1]:
path = 'C:/Users/gch05/data_sc/RecommendationSystem/Preprocessing/preprocess/'

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# 영화 데이터 로드
data = pd.read_csv(path+'sample_data.csv')

# 'story' 열을 NumPy 배열로 변환
def convert_to_array(s):
    try:
        return np.fromstring(s[1:-1], sep=' ')
    except:
        return np.nan

data['story'] = data['story'].apply(convert_to_array)
data = data.dropna(subset=['story'])
story_array = np.vstack(data['story'].values)

# 코사인 유사도 행렬 계산
cosine_sim_story = cosine_similarity(story_array, story_array)

# 평점 데이터 로드
ratings_data  = pd.read_csv(path + "../../dataset/ratings_refined.csv", usecols=['userId', 'movieId', 'rating']) # 사용할 컬럼만 적용

# 영화 ID를 인덱스로 매핑
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(data['movieId'])}

# 평점 데이터에서 코사인 유사도 행렬에 있는 영화만 필터링
filtered_ratings = ratings_data[ratings_data['movieId'].isin(movie_id_to_index.keys())]
filtered_ratings['movieIndex'] = filtered_ratings['movieId'].apply(lambda x: movie_id_to_index[x])

# 평점 데이터를 피벗 테이블로 변환
ratings_matrix = filtered_ratings.pivot_table(index='userId', columns='movieIndex', values='rating')
ratings_matrix = ratings_matrix.fillna(0)

# 필터링된 코사인 유사도 행렬
common_movie_indices = [movie_id_to_index[movie_id] for movie_id in filtered_ratings['movieId'].unique()]
filtered_cosine_sim = cosine_sim_story[common_movie_indices, :][:, common_movie_indices]

# # 점수 예측
# predicted_ratings = np.dot(filtered_cosine_sim, ratings_matrix.T)
# predicted_ratings = predicted_ratings.T

# # 결과 출력
# predicted_ratings


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ratings['movieIndex'] = filtered_ratings['movieId'].apply(lambda x: movie_id_to_index[x])


In [4]:
from tqdm.notebook import tqdm

def modeling_item_story_title(cosine_sim, ratings_data, movie_id_to_index, common_movie_indices, movie_id_to_title):
    # 필터링된 코사인 유사도 행렬을 DataFrame으로 변환
    movie_ids = [movie_id for movie_id, idx in movie_id_to_index.items() if idx in common_movie_indices]
    similarity_matrix = pd.DataFrame(cosine_sim, index=movie_ids, columns=movie_ids)

    # 예측 평점을 저장할 DataFrame 초기화
    df_pred_all = pd.DataFrame()
    all_users = sorted(ratings_data['userId'].unique())

    for user_id in tqdm(all_users):
        user_data = ratings_data[ratings_data['userId'] == user_id]

        # 사용자가 시청한 영화의 유사도 가져오기
        watched_movies = user_data['movieId'].tolist()
        watched_movies_indices = [movie_id for movie_id in watched_movies if movie_id in movie_ids]
        watched_movies_titles = [movie_id_to_title[movie_id] for movie_id in watched_movies_indices]
        sub_sim_mat = similarity_matrix.loc[watched_movies_indices, watched_movies_indices].to_numpy()

        # 유사도 정규화
        sim_N = np.sum(sub_sim_mat, axis=1) + 1

        # 평점 예측
        watched_movie_ratings = user_data['rating'].tolist()
        pred_ratings = np.matmul(sub_sim_mat, watched_movie_ratings) / sim_N

        # 제목을 포함한 결과 DataFrame 생성
        user_list = [user_id] * len(watched_movies_titles)
        cur_pred = pd.DataFrame(list(zip(watched_movies_titles, user_list, pred_ratings)),
                                columns=['title', 'userId', 'pred_rating'])
        df_pred_all = pd.concat([df_pred_all, cur_pred], axis=0)

    return df_pred_all

# movieId에서 title로의 매핑 생성
movie_id_to_title = dict(zip(data['movieId'], data['title']))

# 모델 실행
df_pred_all = modeling_item_story_title(filtered_cosine_sim, filtered_ratings, movie_id_to_index, common_movie_indices, movie_id_to_title)

# 결과의 첫 부분 출력
df_pred_all.head()


  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,title,userId,pred_rating
0,Toy Story,1,9.928277
1,Grumpier Old Men,1,-12.99776
2,Seven (a.k.a. Se7en),1,2.618573
3,"Usual Suspects, The",1,2.33716
4,Bottle Rocket,1,4.074539
