In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

from keras.models import Sequential, Model
from keras.layers import Embedding, Flatten, Dense, Dropout, concatenate, multiply, Input
from tensorflow.keras.optimizers import Adam

In [55]:
# Data preprocessing
# Load the ratings CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='movieRating.csv')

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,TrainDataID,UserID,MovieID,Rating
0,1,796,1193,5
1,2,796,661,3
2,3,796,914,3
3,4,796,3408,4
4,5,796,2355,5
5,6,796,1197,3
6,7,796,1287,5
7,8,796,2804,5
8,9,796,919,4
9,10,796,595,5


In [57]:
# Remove the TrainDataID column; it is only a row identifier and is not fed to the model
df = df.drop(columns=['TrainDataID'])

In [59]:
# Shuffle the rows and rebuild a clean 0..n-1 index.
# A fixed random_state makes the shuffle (and everything downstream of it)
# reproducible under Restart Kernel -> Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [61]:
# Inspect the shuffled frame (remaining columns: UserID, MovieID, Rating)
df

Unnamed: 0,UserID,MovieID,Rating
0,668,2097,3
1,3057,1375,2
2,3568,3579,2
3,1910,549,5
4,219,1219,5
...,...,...,...
899868,511,1017,4
899869,3382,1199,5
899870,1628,538,4
899871,4995,590,2


In [63]:
# Largest user ID and largest movie ID in the data.
# These are maximum IDs rather than distinct counts; the embedding tables below
# are sized as (max ID + 1) because the raw IDs are used directly as indices.
# Series.max() is a vectorized reduction, unlike the builtin max(), which walks
# the column element by element in Python.
n_users, n_movies = df['UserID'].max(), df['MovieID'].max()
print(n_users, n_movies)

6040 3952


In [65]:
# Randomly split into a training set (80%) and a test set (20%).
# random_state pins the split so the reported test metrics are reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [67]:
# Build the matrix-factorisation model
# Width of the user/movie embedding vectors, and width of each bias embedding
dim_embedddings, bias = 30, 1

In [69]:
# Movie branch: an embedding vector and a bias term, both looked up by movie ID
movie_input = Input(shape=(1,), name='Movie')
movie_embedding = Embedding(
    input_dim=n_movies + 1,  # raw IDs are the indices, so the table needs max ID + 1 rows
    output_dim=dim_embedddings,
    name="Movie-Embedding",
)(movie_input)
movie_bias = Embedding(
    input_dim=n_movies + 1,
    output_dim=bias,
    name="Movie-Bias",
)(movie_input)

In [71]:
# User branch: an embedding vector and a bias term, both looked up by user ID
user_input = Input(shape=(1,), name='User')
user_embedding = Embedding(
    input_dim=n_users + 1,  # raw IDs are the indices, so the table needs max ID + 1 rows
    output_dim=dim_embedddings,
    name="User-Embedding",
)(user_input)
user_bias = Embedding(
    input_dim=n_users + 1,
    output_dim=bias,
    name="User-Bias",
)(user_input)

In [73]:
# Combine the movie and user embeddings, then apply dropout.
# Note: `multiply` is an element-wise product of the two embedding vectors
# (the result keeps the embedding width), not a matrix/dot product.
matrix_product = multiply([movie_embedding, user_embedding])
matrix_product = Dropout(rate=0.2)(matrix_product)

In [75]:
# Join the embedding interaction with the user and movie bias terms along the
# last axis, then flatten to one feature vector per sample for the dense layers
input_terms = Flatten()(
    concatenate([matrix_product, user_bias, movie_bias], axis=-1)
)

In [77]:
# Fully connected head: two ReLU hidden layers, each followed by dropout
dense_1 = Dense(units=50, activation="relu", name="Dense1")(input_terms)
dense_1 = Dropout(rate=0.2)(dense_1)
dense_2 = Dense(units=20, activation="relu", name="Dense2")(dense_1)
dense_2 = Dropout(rate=0.2)(dense_2)
# Single output unit producing the predicted rating.
# NOTE(review): a ReLU on a regression output has zero gradient whenever its
# pre-activation is negative; consider a linear output here — confirm intent.
result = Dense(units=1, activation='relu', name='Activation')(dense_2)

In [79]:
# Assemble the model. The input order is [movie, user]; the fit / predict /
# evaluate calls must pass their arrays in that same order.
model_mf = Model(
    inputs=[movie_input, user_input],
    outputs=result,
)

In [81]:
# Print the model summary
model_mf.summary()

In [83]:
# Adam optimizer with an explicit learning rate
opt_adam = Adam(learning_rate=2e-3)

In [85]:
# Compile: optimise mean squared error and additionally track mean absolute error
model_mf.compile(
    optimizer=opt_adam,
    loss=['mse'],
    metrics=['mean_absolute_error'],
)

In [88]:
# Train the model; inputs are passed in [movie, user] order to match the model definition.
# 0.5% of the training rows are held out for per-epoch validation, and verbose=0
# silences the progress output.
# NOTE(review): the recorded history starts at loss ~0.56 while val_loss is ~0.85,
# which looks like this cell was run on an already-trained model — presumably
# re-running it continues from the current weights; confirm on a fresh kernel.
history_mf = model_mf.fit(
    x=[train['MovieID'], train['UserID']],
    y=train['Rating'],
    batch_size=256,
    epochs=10,
    validation_split=0.005,
    verbose=0,
)

In [90]:
# Training history: one row per epoch with the loss / MAE on the training
# and validation data
pd.DataFrame(data=history_mf.history)

Unnamed: 0,loss,mean_absolute_error,val_loss,val_mean_absolute_error
0,0.561271,0.585682,0.84855,0.717256
1,0.554383,0.581934,0.847938,0.718727
2,0.547905,0.578471,0.849066,0.720694
3,0.544224,0.576494,0.844546,0.718088
4,0.539018,0.573562,0.835036,0.720163
5,0.535844,0.571542,0.847561,0.720393
6,0.532514,0.569557,0.84633,0.716258
7,0.52957,0.568141,0.841922,0.721973
8,0.526711,0.566069,0.843209,0.715522
9,0.523389,0.564677,0.845606,0.717748


In [92]:
# Predict ratings for the held-out test pairs; inputs are in [movie, user]
# order to match the model definition.
predictions = model_mf.predict([test['MovieID'], test['UserID']])

# Round to whole ratings, then clamp to the rating range observed in training.
# The output unit is a ReLU, so a raw prediction can round to 0 or to a value
# above the highest rating; without the clamp such values would be written out.
# Plain Python floats are used for the bounds so the array dtype is unchanged.
rating_min = float(train['Rating'].min())
rating_max = float(train['Rating'].max())
predictions = np.clip(np.round(predictions), rating_min, rating_max)

# Keep the test index and the MovieID / UserID keys; the true Rating column is
# left out and replaced by the predicted one.
results = test[['MovieID', 'UserID']].copy()
results['Rating'] = predictions.flatten()

# Show the first rows (bare expression -> rich DataFrame display)
results.head(10)



[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step
        MovieID  UserID  Rating
6206       2458    1978     2.0
537228     2872    2496     4.0
569466     1855    1703     3.0
183961     1608    2638     4.0
476482      912    3155     5.0
800555      778    3955     4.0
435880     1488    4814     3.0
220417     1758    4435     3.0
503703      357    2434     4.0
311766      909    1910     5.0


In [94]:
# Write the predictions to disk without the DataFrame index column
results.to_csv(path_or_buf='predicted_results.csv', index=False)

In [95]:
# Evaluate on the held-out test set.
# evaluate() returns the loss followed by the compiled metrics; this model was
# compiled with loss 'mse' and metric 'mean_absolute_error', so the first value
# is the test MSE — it is not an accuracy.
test_mse, MAE = model_mf.evaluate([test['MovieID'], test['UserID']], test['Rating'])
# Legacy name kept for any later cell that reads it; the value is the MSE loss.
accuracy = test_mse
print(f'Test MSE (loss): {test_mse}')
print(f'Test MAE: {MAE}')

[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 0.8207 - mean_absolute_error: 0.7056
Test Accuracy: 0.8175610303878784
Test MAE: 0.704255998134613
