In [None]:
# Mount Google Drive into the Colab filesystem so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import time
import random
from tqdm import tqdm
import os

# 데이터 분석 라이브러리
import numpy as np
import pandas as pd

# 케라스 라이브러리
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Multiply, Concatenate, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD

# 데이터 분할 및 인코더
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# 시각화 라이브러리
import matplotlib.pyplot as plt

# 성능 평가 라이브러리
from sklearn.metrics import mean_absolute_error, mean_squared_error



In [None]:
# Seed configuration
def seed_everything(seed: int=42):
  """Seed every random number generator this notebook relies on.

  Args:
    seed: Integer seed applied to Python's ``random`` module, NumPy's global
      RNG, the ``PYTHONHASHSEED`` environment variable and, when TensorFlow
      is installed, TensorFlow's global RNG.
  """
  random.seed(seed)      # Python's built-in random module
  np.random.seed(seed)      # NumPy's global RNG
  # The variable name must be spelled PYTHONHASHSEED; a misspelled key is silently ignored.
  # NOTE(review): Python reads this variable at interpreter start-up, so setting it here
  # should only influence processes launched afterwards - confirm if hash stability matters.
  os.environ['PYTHONHASHSEED'] = str(seed)
  # Keras weight initialisation and shuffling draw from TensorFlow's own RNG, so seed it too.
  # The import is local and guarded so the helper still works where TensorFlow is absent.
  try:
    import tensorflow as tf
  except ImportError:
    pass
  else:
    tf.random.set_seed(seed)

my_seed = 42
seed_everything(my_seed)

In [None]:
import gzip
import json

def parse(path):
  """Lazily yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Path to a gzip file containing one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each line, in file order.
  """
  # The context manager closes the gzip handle once the generator is exhausted
  # or garbage-collected, instead of leaving the file open.
  with gzip.open(path, 'rb') as g:
    for line in g:
      yield json.loads(line)

def getDF(path):
  """Load a gzip-compressed JSON-lines file into a DataFrame.

  Each record yielded by ``parse(path)`` becomes one row; rows are indexed
  0, 1, 2, ... in the order they are read.
  """
  # Number the records as they stream in, then let pandas build the rows.
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

In [None]:
# Base directory on the mounted Google Drive; the dataset file is read from here.
path = '/content/drive/MyDrive/paper_1/data'

# 데이터 불러오기

In [None]:
# Load the pickled dataset from the data directory.
# NOTE(review): read_pickle unpickles the file, so only load files from a trusted source.
df = pd.read_pickle(f'{path}/dataset_1.pkl')
# Preview the first rows.
df.head()

Unnamed: 0,user,item,reviewText,rating
0,107401,11,I oredered this for a daughter who is now 33 a...,5.0
1,47153,11,Well I thought since this idem didn't have any...,5.0
2,1446031,11,"Though this game is older, I absolutely love i...",5.0
3,200275,40,"This game is a bit hard to get the hang of, bu...",5.0
4,220353,40,I played it a while but it was alright. The st...,4.0


In [None]:
# Keep only the ID and rating columns. The explicit .copy() makes `df` an independent
# frame, so assigning to its columns afterwards does not raise SettingWithCopyWarning.
df = df[['user', 'item', 'rating']].copy()
df.head()

Unnamed: 0,user,item,rating
0,107401,11,5.0
1,47153,11,5.0
2,1446031,11,5.0
3,200275,40,5.0
4,220353,40,4.0


NCF는 User와 Item의 ID만 사용

In [None]:
# One independent encoder per ID column (raw IDs -> integer labels).
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()

In [None]:
# Fit each encoder on its own column and overwrite the column with the encoded labels.
for column, encoder in (('user', user_encoder), ('item', item_encoder)):
    df[column] = encoder.fit_transform(df[column])

In [None]:
# Number of distinct users / items; these become the embedding table sizes.
user_num = df['user'].nunique(dropna=False)
item_num = df['item'].nunique(dropna=False)

In [None]:
# Report how many distinct users and items the dataset contains.
print('user num :', user_num, 'item num : ',item_num)

user num : 59833 item num :  47061


In [None]:
# Preview the frame again, now with encoded user/item IDs.
df.head()

Unnamed: 0,user,item,rating
0,4248,3,5.0
1,1890,3,5.0
2,56163,3,5.0
3,7842,18,5.0
4,8637,18,4.0


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Training set: IDs as 2-D single-column arrays, ratings as a 1-D target vector
train_users = train_data[['user']].to_numpy()
train_items = train_data[['item']].to_numpy()
train_y = train_data['rating'].to_numpy()

# Test set, laid out the same way
test_users = test_data[['user']].to_numpy()
test_items = test_data[['item']].to_numpy()
test_y = test_data['rating'].to_numpy()

In [None]:
def Modeling(user_num, item_num, output_dim):
    """Build the NeuMF model: a GMF branch and an MLP branch fused into one output.

    Args:
        user_num: Number of rows in each user embedding table.
        item_num: Number of rows in each item embedding table.
        output_dim: Width of every embedding vector.

    Returns:
        An uncompiled Keras ``Model`` taking ``[user, item]`` int64 inputs of
        shape ``(batch, 1)`` and producing one linear (no activation) value
        per user/item pair.
    """

    # Input: one integer ID per sample for each side
    user = Input(shape=(1,), dtype='int64')
    item = Input(shape=(1,), dtype='int64')

    # The deprecated `input_length` argument is intentionally not passed to Embedding:
    # the sequence length is already fixed by the Input shape above.

    # GMF
    # GMF user vector
    gmf_user = Embedding(user_num, output_dim, name='gmf_user')(user)
    gmf_user = Flatten()(gmf_user)

    # GMF item vector
    gmf_item = Embedding(item_num, output_dim, name='gmf_item')(item)
    gmf_item = Flatten()(gmf_item)

    # MLP
    # MLP user vector (separate table from the GMF branch)
    mlp_user = Embedding(user_num, output_dim, name='mlp_user')(user)
    mlp_user = Flatten()(mlp_user)

    # MLP item vector (separate table from the GMF branch)
    mlp_item = Embedding(item_num, output_dim, name='mlp_item')(item)
    mlp_item = Flatten()(mlp_item)

    # GMF layer: element-wise product of the two GMF vectors
    gmf_mul = Multiply()([gmf_user, gmf_item])

    # MLP layer 1: concatenate the two MLP vectors
    mlp_concat = Concatenate()([mlp_user, mlp_item])

    # MLP layer 2: tower of ReLU layers narrowing 32 -> 16 -> 8
    dense1 = Dense(32, activation='relu')(mlp_concat)
    dense2 = Dense(16, activation='relu')(dense1)
    dense3 = Dense(8, activation='relu')(dense2)

    # NeuMF layer: fuse the GMF product with the MLP tower output
    neumf_concat = Concatenate()([gmf_mul, dense3])

    # Output layer: single unit without activation
    output_layer = Dense(1, name='output_layer')(neumf_concat)

    model = Model(inputs=[user, item], outputs=output_layer)
    return model

In [None]:
# Instantiate the network with 16-dimensional embeddings.
model = Modeling(user_num=user_num, item_num=item_num, output_dim=16)



In [None]:
# Plain SGD optimizer with a fixed learning rate.
sgd = SGD(learning_rate=0.0001) # learning rate
# MSE is the training loss; "mse" and "mae" are additionally reported as metrics.
model.compile(optimizer=sgd, loss='MSE', metrics=["mse", "mae"])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Configuration of EarlyStopping: stop training once the validation loss stops improving.
early_stopping = EarlyStopping(
    monitor ='val_loss',          # quantity watched after every epoch
    min_delta = 0.001,            # smaller changes do not count as an improvement
    patience = 5,                 # epochs without improvement tolerated before stopping
    verbose = 1,
    mode = 'min',          # lower val_loss is better
    restore_best_weights = True   # roll back to the weights of the best epoch
)

In [None]:
# Train the model
history = model.fit(
    x=[train_users, train_items],
    y=train_y,
    validation_split=0.125,       # fraction of the training arrays held out for validation
    batch_size=128,
    callbacks=[early_stopping],
    epochs=50                     # upper bound; early_stopping may end training sooner
)

Epoch 1/50
[1m2981/2981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 10.7282 - mae: 2.8776 - mse: 10.7282 - val_loss: 1.4472 - val_mae: 0.9549 - val_mse: 1.4472
Epoch 2/50
[1m2981/2981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.4536 - mae: 0.9563 - mse: 1.4536 - val_loss: 1.4470 - val_mae: 0.9538 - val_mse: 1.4470
Epoch 3/50
[1m2981/2981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.4502 - mae: 0.9554 - mse: 1.4502 - val_loss: 1.4469 - val_mae: 0.9546 - val_mse: 1.4469
Epoch 4/50
[1m2981/2981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.4465 - mae: 0.9547 - mse: 1.4465 - val_loss: 1.4467 - val_mae: 0.9544 - val_mse: 1.4467
Epoch 5/50
[1m2981/2981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.4543 - mae: 0.9565 - mse: 1.4543 - val_loss: 1.4465 - val_mae: 0.9535 - val_mse: 1.4465
Epoch 6/50
[1m2981/2981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
# Predict ratings for the held-out test pairs
predictions = model.predict([test_users, test_items])

# Compute every error metric first, then report them in a fixed order
MAE = mean_absolute_error(test_y, predictions)
MSE = mean_squared_error(test_y, predictions)
RMSE = np.sqrt(MSE)  # square root of the MSE
MAPE = mean_absolute_percentage_error(test_y, predictions) * 100  # expressed as a percentage

print(f"MAE: {MAE:.3f}")
print(f"MSE: {MSE:.3f}")
print(f"RMSE: {RMSE:.3f}")
print(f"MAPE: {MAPE:.3f}%")

[1m3407/3407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
MAE: 0.955
MSE: 1.447
RMSE: 1.203
MAPE: 40.917%
