In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
path = '/content/drive/MyDrive/종프/data/trained_1/final_merge_outer.csv'

In [4]:
df = pd.read_csv(path)

# 전처리

In [5]:
# 3. 다중 선택 컬럼(Multi-hot 변환)
def multi_label_binarize(df, column, separator=';', prefix=''):
    all_labels = set()
    for entry in df[column].dropna():
        all_labels.update(entry.split(separator))
    all_labels = sorted(all_labels)

    for label in all_labels:
        df[f"{prefix}{label}"] = df[column].apply(
            lambda x: 1 if pd.notna(x) and label in x.split(separator) else 0
        )
    return df

In [6]:
for col, prefix in zip(
    ['TRAVEL_MOTIVES', 'TRAVEL_STYLES', 'TRAVEL_PURPOSE'],
    ['motive_', 'style_', 'purpose_']
):
    df = multi_label_binarize(df, col, separator=';', prefix=prefix)

In [7]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user'] = user_encoder.fit_transform(df['TRAVELER_ID'])
df['item'] = item_encoder.fit_transform(df['VISIT_AREA_TYPE_CD'])

In [8]:
df

Unnamed: 0,TRAVELER_ID,TRAVEL_ID,GENDER,AGE_GRP,TRAVEL_COMPANIONS_NUM,TRAVEL_MOTIVES,TRAVEL_STYLES,VISIT_AREA_NM,VISIT_AREA_TYPE_CD,VISIT_CHC_REASON_CD,...,purpose_28,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,user,item
0,e000003,e_e000003,남,40,3,2;3;6,1;2;3;6,안성휴게소 부산방향,9,6.0,...,0,0,0,1,0,0,0,0,0,8
1,e000003,e_e000003,남,40,3,2;3;6,1;2;3;6,금강휴게소,9,6.0,...,0,0,0,1,0,0,0,0,0,8
2,e000003,e_e000003,남,40,3,2;3;6,1;2;3;6,김천종합운동장,5,11.0,...,0,0,0,1,0,0,0,0,0,4
3,e000003,e_e000003,남,40,3,2;3;6,1;2;3;6,입장 거봉포도 휴게소 서울 방향,9,9.0,...,0,0,0,1,0,0,0,0,0,8
4,e000004,e_e000004,남,40,2,1;3,2;3;4;5,화성 관광열차 안내소 연무대 매표소,2,10.0,...,0,1,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102952,h006872,h_h006872,여,30,2,1;5;8,3;4;6;7,제주시민복지타운 광장,8,10.0,...,0,1,0,0,0,0,0,1,10238,7
102953,h006874,h_h006874,여,50,0,1;6;7,1;2;3;4;6,꼬스뗀뇨,3,1.0,...,0,0,0,1,1,0,0,0,10239,2
102954,h006874,h_h006874,여,50,0,1;6;7,1;2;3;4;6,마이 피기 팬트리,10,2.0,...,0,0,0,1,1,0,0,0,10239,9
102955,h006874,h_h006874,여,50,0,1;6;7,1;2;3;4;6,월정리 해수욕장,1,11.0,...,0,0,0,1,1,0,0,0,10239,0


In [9]:
drop_cols = ['TRAVEL_ID', 'TRAVEL_MOTIVES', 'TRAVEL_STYLES', 'TRAVEL_PURPOSE', 'VISIT_AREA_TYPE_CD', 'TRAVELER_ID', 'VISIT_AREA_NM']
df = df.drop(columns=drop_cols)

In [10]:
X = df.drop(columns=['DGSTFN'])
y = df['DGSTFN']

In [12]:
df['GENDER'] = df['GENDER'].map({'남': 0, '여': 1})

In [13]:
df.columns

Index(['GENDER', 'AGE_GRP', 'TRAVEL_COMPANIONS_NUM', 'VISIT_CHC_REASON_CD',
       'DGSTFN', 'motive_1', 'motive_10', 'motive_2', 'motive_3', 'motive_4',
       'motive_5', 'motive_6', 'motive_7', 'motive_8', 'motive_9', 'style_1',
       'style_2', 'style_3', 'style_4', 'style_5', 'style_6', 'style_7',
       'purpose_1', 'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13',
       'purpose_2', 'purpose_21', 'purpose_22', 'purpose_23', 'purpose_24',
       'purpose_25', 'purpose_26', 'purpose_27', 'purpose_28', 'purpose_3',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'user', 'item'],
      dtype='object')

In [14]:
df = df[['user', 'GENDER', 'AGE_GRP', 'TRAVEL_COMPANIONS_NUM', 'VISIT_CHC_REASON_CD', 'DGSTFN',
       'motive_1', 'motive_2', 'motive_3', 'motive_4', 'motive_5', 'motive_6', 'motive_7', 'motive_8','motive_9', 'motive_10',
       'style_1', 'style_2', 'style_3', 'style_4', 'style_5', 'style_6', 'style_7',
       'purpose_1', 'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10', 'purpose_11',
       'purpose_12', 'purpose_13', 'purpose_21', 'purpose_22', 'purpose_23', 'purpose_24', 'purpose_25', 'purpose_26', 'purpose_27',
       'purpose_28', 'item']]

In [15]:
df

Unnamed: 0,user,GENDER,AGE_GRP,TRAVEL_COMPANIONS_NUM,VISIT_CHC_REASON_CD,DGSTFN,motive_1,motive_2,motive_3,motive_4,...,purpose_13,purpose_21,purpose_22,purpose_23,purpose_24,purpose_25,purpose_26,purpose_27,purpose_28,item
0,0,0,40,3,6.0,4.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,8
1,0,0,40,3,6.0,4.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,8
2,0,0,40,3,11.0,5.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,40,3,9.0,4.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,8
4,1,0,40,2,10.0,4.0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102952,10238,1,30,2,10.0,5.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,7
102953,10239,1,50,0,1.0,3.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
102954,10239,1,50,0,2.0,4.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,9
102955,10239,1,50,0,11.0,4.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


 # 기본 CF 모델

In [16]:
# 유저 수, 아이템 수 확인
num_users = df['user'].nunique()
num_items = df['item'].nunique()

In [17]:
print(f"최종 user 수: {num_users}, item 수: {num_items}")

최종 user 수: 10240, item 수: 17


In [18]:
# 모델 정의
class MatrixFactorizationModel(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_embedding = layers.Embedding(num_users, embedding_dim)
        self.item_embedding = layers.Embedding(num_items, embedding_dim)
        self.dot = layers.Dot(axes=1)

    def call(self, inputs):
        user_input, item_input = inputs
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)
        output = self.dot([user_vec, item_vec])
        return output

In [19]:
df = df.dropna(subset=['DGSTFN'])

In [20]:
user_input = df['user'].values
item_input = df['item'].values
labels = df['DGSTFN'].values

In [21]:
# 2. 모델 인스턴스 만들기
embedding_dim = 32
cf_model = MatrixFactorizationModel(num_users, num_items, embedding_dim)

# 3. 모델 컴파일
cf_model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

# 4. 학습하기
history = cf_model.fit(
    x=(user_input, item_input),
    y=labels,
    batch_size=256,
    epochs=10,
    validation_split=0.1,
    verbose=1
)

Epoch 1/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 18.1840 - mae: 4.1645 - val_loss: 19.8912 - val_mae: 4.3813
Epoch 2/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 3.3852 - mae: 1.4663 - val_loss: 19.9047 - val_mae: 4.3803
Epoch 3/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.8449 - mae: 0.6727 - val_loss: 19.9057 - val_mae: 4.3803
Epoch 4/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.6121 - mae: 0.5659 - val_loss: 19.9058 - val_mae: 4.3803
Epoch 5/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.5701 - mae: 0.5421 - val_loss: 19.9057 - val_mae: 4.3803
Epoch 6/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.5587 - mae: 0.5338 - val_loss: 19.9056 - val_mae: 4.3804
Epoch 7/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/

# FEATURE 모두 사용

In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class FeatureTowerModel(tf.keras.Model):
    def __init__(self, num_users, num_items, num_features, embedding_dim=32):
        super().__init__()

        # 1. User, Item 임베딩
        self.user_embedding = layers.Embedding(num_users, embedding_dim)
        self.item_embedding = layers.Embedding(num_items, embedding_dim)

        # 2. 나머지 feature 개별 처리 Dense Layer
        self.feature_dense_layers = [
            layers.Dense(16, activation='relu') for _ in range(num_features)
        ]

        # 3. 마지막 합치기 후 Dense Layer
        self.concat_dense = layers.Dense(64, activation='relu')
        self.output_layer = layers.Dense(1)  # 회귀 문제 (DGSTFN 예측)

    def call(self, inputs):
        user_input, item_input, feature_inputs = inputs

        # 1. user, item 임베딩
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)

        # shape 정리 (batch_size, embedding_dim)
        user_vec = tf.squeeze(user_vec, axis=1)
        item_vec = tf.squeeze(item_vec, axis=1)

        # 2. feature별로 따로 처리
        feature_vecs = []
        for i, dense_layer in enumerate(self.feature_dense_layers):
            single_feature = feature_inputs[:, i:i+1]  # (batch_size, 1)
            feature_vec = dense_layer(single_feature) # (batch_size, 16)
            feature_vecs.append(feature_vec)

        # 3. 모든 벡터 합치기
        x = tf.concat([user_vec, item_vec] + feature_vecs, axis=1)

        # 4. 최종 Dense
        x = self.concat_dense(x)
        output = self.output_layer(x)

        return output


In [25]:
df

Unnamed: 0,user,GENDER,AGE_GRP,TRAVEL_COMPANIONS_NUM,VISIT_CHC_REASON_CD,DGSTFN,motive_1,motive_2,motive_3,motive_4,...,purpose_13,purpose_21,purpose_22,purpose_23,purpose_24,purpose_25,purpose_26,purpose_27,purpose_28,item
0,0,0,40,3,6.0,4.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,8
1,0,0,40,3,6.0,4.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,8
2,0,0,40,3,11.0,5.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,40,3,9.0,4.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,8
4,1,0,40,2,10.0,4.0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102952,10238,1,30,2,10.0,5.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,7
102953,10239,1,50,0,1.0,3.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
102954,10239,1,50,0,2.0,4.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,9
102955,10239,1,50,0,11.0,4.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
encoder = OneHotEncoder(sparse_output=False)
age_encoded = encoder.fit_transform(df[['AGE_GRP']])  # 나이대 칼럼 이름

X_feature = df.drop(columns=['DGSTFN', 'user', 'item']).values #'Traveler_ID', 'VISIT_AREA_TYPE_CD',
X_feature = np.hstack([X_feature, age_encoded])
X_feature = np.nan_to_num(X_feature, nan=0.0)

user_input = df['user'].values.reshape(-1, 1)
item_input = df['item'].values.reshape(-1, 1)
labels = df['DGSTFN'].values

num_users = df['user'].nunique()
num_items = df['item'].nunique()
num_features = X_feature.shape[1]

In [46]:
model = FeatureTowerModel(num_users, num_items, num_features, embedding_dim=32)

In [47]:
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

In [48]:
history = model.fit(
    x=(user_input, item_input, X_feature),
    y=labels,
    batch_size=256,
    epochs=10,
    validation_split=0.1,
    verbose=1
)

Epoch 1/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 16ms/step - loss: 3.6314 - mae: 1.2414 - val_loss: 0.7208 - val_mae: 0.7009
Epoch 2/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - loss: 0.5362 - mae: 0.5285 - val_loss: 0.7177 - val_mae: 0.7037
Epoch 3/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 0.5044 - mae: 0.5123 - val_loss: 0.7503 - val_mae: 0.7210
Epoch 4/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 0.5041 - mae: 0.5098 - val_loss: 0.7194 - val_mae: 0.7059
Epoch 5/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 0.5001 - mae: 0.5094 - val_loss: 0.7134 - val_mae: 0.7030
Epoch 6/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - loss: 0.5047 - mae: 0.5128 - val_loss: 0.7346 - val_mae: 0.7132
Epoch 7/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13m

# NCF모델 적용

In [49]:
user_emb_weights = model.user_embedding.get_weights()

In [50]:
item_emb_weights = model.item_embedding.get_weights()

In [51]:
class NCFModel(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_embedding = layers.Embedding(num_users, embedding_dim)
        self.item_embedding = layers.Embedding(num_items, embedding_dim)

        # MLP (Multi-Layer Perceptron) 부분
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(64, activation='relu')
        self.output_layer = layers.Dense(1)  # 회귀문제니까

    def call(self, inputs):
        user_input, item_input = inputs

        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)

        # (batch_size, embedding_dim) 로 squeeze
        user_vec = tf.squeeze(user_vec, axis=1)
        item_vec = tf.squeeze(item_vec, axis=1)

        # user와 item 벡터를 합치기
        x = tf.concat([user_vec, item_vec], axis=1)

        # MLP 통과
        x = self.dense1(x)
        x = self.dense2(x)

        # 최종 output
        output = self.output_layer(x)

        return output

In [55]:
ncf_model = NCFModel(num_users, num_items, embedding_dim=32)

dummy_user = tf.constant(np.zeros((1,1)), dtype=tf.int32)
dummy_item = tf.constant(np.zeros((1,1)), dtype=tf.int32)
ncf_model((dummy_user, dummy_item))


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.01257577]], dtype=float32)>

In [56]:
ncf_model.user_embedding.set_weights(cf_model.user_embedding.get_weights())
ncf_model.item_embedding.set_weights(cf_model.item_embedding.get_weights())

In [57]:
ncf_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = ncf_model.fit(
    x=(user_input, item_input),
    y=labels,
    batch_size=256,
    epochs=10,
    validation_split=0.1,
    verbose=1
)

Epoch 1/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 1.4351 - mae: 0.8660 - val_loss: 1.5281 - val_mae: 1.1193
Epoch 2/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.5185 - mae: 0.5163 - val_loss: 1.4821 - val_mae: 1.1017
Epoch 3/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.4975 - mae: 0.5049 - val_loss: 1.7123 - val_mae: 1.1876
Epoch 4/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.4677 - mae: 0.4874 - val_loss: 1.6367 - val_mae: 1.1610
Epoch 5/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.4242 - mae: 0.4570 - val_loss: 1.6930 - val_mae: 1.1789
Epoch 6/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.4073 - mae: 0.4457 - val_loss: 1.7106 - val_mae: 1.1846
Epoch 7/10
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step -