In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [3]:
# CSV 파일 읽기
ratings = pd.read_csv('/content/ratings.csv')


In [4]:
# userId', 'movieId', 'rating' 열 선택
ratings = ratings[['userId', 'movieId', 'rating']]


In [16]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# 평점 행렬 생성
ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
  # 피벗 테이블로 변환 각 행이 사용자를, 각 열이 영화를 나타내고, 각 셀의 값이 평점이 되도록 변환
ratings_matrix.fillna(0, inplace=True)  # 평가되지 않은 항목은 0으로 채움


In [7]:
ratings_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# 입력 데이터 정규화 (0과 1 사이의 값으로)
ratings_matrix = ratings_matrix.values / 5.0


In [12]:
ratings_matrix[0]

array([0.8, 0. , 0.8, ..., 0. , 0. , 0. ])

In [8]:
ratings_matrix.shape[1]

9724

In [17]:
# 오토인코더 모델 생성
input_layer = Input(shape=(ratings_matrix.shape[1],)) # 입력층 # 9724
encoded = Dense(512, activation='relu')(input_layer)  # 인코더 차원 축소 9724 -> 512
encoded = Dense(256, activation='relu')(encoded) # 인코더 차원 축소 512 -> 256 code = latency space (vector)
  # 각 영화 평점 데이터(평점 행렬의 행)를 입력 받아 저차원의 표현(잠재 공간)으로 압축
  # 평점 패턴, 선호도 등 주요 특성 학습

decoded = Dense(512, activation='relu')(encoded)      # 디코더: 차원 확장
output_layer = Dense(ratings_matrix.shape[1], activation='sigmoid')(decoded) # 9724
       # sigmoid 활성화 함수를 사용하여 예측된 평점이 0과 1 사이의 값이 되도록
  # 압축된 저차원 데이터를 원래의 차원으로 복원 -> 사용자의 영화 평점 재구성
  # 복원된 데이터에 사용자가 평가하지 않은 항목들에 대한 예측값 포함 <-- 사용자의 평가 패턴과 선호도 학습(일반적인 패턴, 일반화를 기반으로 한 예측)
                                                                     # + 해당 ID의 다른 영화 평점 정보를 통해 추론된 사용자 선호도
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer=Adam(lr=0.001), loss='mse')
       # 손실 함수 = 평균 제곱 오차(MSE)
       # 원본 평점과 예측 평점 간의 차이를 최소화하도록 학습 유도



In [14]:
# 훈련 및 검증 데이터 분리
x_train, x_val = train_test_split(ratings_matrix, test_size=0.2, random_state=42)


In [None]:
# 모델 학습
# 입력 평점을 재구성하는 방법을 학습
autoencoder.fit(x_train, x_train,epochs=50,batch_size=256,validation_data=(x_val, x_val))


In [18]:
# 모델 평가
val_loss = autoencoder.evaluate(x_val, x_val)
print("Validation loss:", val_loss)


Validation loss: 0.24594268202781677


In [19]:
# 평점 예측
# 학습된 모델을 사용하여 사용자 평점 예측
# 사용자가 아직 평가하지 않은 영화에 대한 추천 생성
predicted_ratings = autoencoder.predict(ratings_matrix) * 5.0  # 스케일을 원래대로 복구




In [None]:
predicted_ratings[1]

In [20]:
# 예측된 평점 확인
print(predicted_ratings)

[[2.506697  2.5162697 2.5039923 ... 2.4893878 2.4757094 2.4618464]
 [2.495845  2.5022173 2.4985962 ... 2.4973748 2.49706   2.4986887]
 [2.5123217 2.5093546 2.50561   ... 2.4930308 2.503399  2.4966176]
 ...
 [2.5264664 2.5030258 2.4581382 ... 2.4809673 2.5006566 2.446776 ]
 [2.5048664 2.500473  2.504087  ... 2.500212  2.4986925 2.4907746]
 [2.4966624 2.5486386 2.4475288 ... 2.596511  2.383711  2.4756725]]


In [27]:
# 추천 생성을 위한 코드 추가
# 사용자가 아직 평가하지 않은 영화에 대한 추천 생성
def generate_recommendations(user_id, predicted_ratings, ratings_matrix):
    user_ratings = ratings_matrix[user_id - 1]  # 사용자의 평점 데이터 가져오기 (인덱스는 0부터 시작하므로 -1)
    unseen_movies_idx = user_ratings == 0  # 사용자가 아직 평가하지 않은 영화 찾기
    predicted_ratings_for_user = predicted_ratings[user_id - 1]  # 사용자에 대한 예측 평점 가져오기
    recommended_movies = predicted_ratings_for_user[unseen_movies_idx]  # 아직 보지 않은 영화에 대한 예측 평점 가져오기
    top_recommended_movie_ids = recommended_movies.argsort()[::-1][:10]  # 상위 10개 추천 영화 선택
                                      # argsort()[::-1]는 평점을 내림차순으로 정렬,
                                      # 가장 높은 평점을 가진 10개의 영화 인덱스 반환

    # print(unseen_movies_idx) # 평점 0 True, 0이 아닌(평점이 있는 영화) index는 False
    return top_recommended_movie_ids

# 예시: 사용자 1에 대한 영화 추천 생성
user_id = 1
recommendations = generate_recommendations(user_id, predicted_ratings, ratings_matrix)
        # user_id: 사용자 ID
        # predicted_ratings: 모델을 통해 생성된 예측 평점
        # ratings_matrix: 사용자-영화 평점 matrix(원 데이터) -평가하지 않은 영화(0값) 정보 포함
print("Recommended movie IDs for user {}: {}".format(user_id, recommendations))

[False  True False ...  True  True  True]
Recommended movie IDs for user 1: [9206 6588 1395 3846 6327 9426 3551 2442 8601 2145]
