In [48]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error  # 수정: mean_squared_error 대신 root_mean_squared_error 사용

In [29]:
df = pd.read_csv('../csv/visit_all_in_one.csv')

df_filter = df[~df['TRAVEL_MISSION_CHECK'].isnull()].copy()  # 'TRAVEL_MISSION_CHECK' 컬럼에 결측치가 없는 행만을 복사

  df = pd.read_csv('../csv/visit_all_in_one.csv')


In [30]:

# 미션 = 여행목적으로 보여짐 한 컬럼에 최대 3개가 있지만 일단 맨 처음 1개만 사용
df_filter.loc[:, 'TRAVEL_MISSION_INT'] = df_filter['TRAVEL_MISSION_CHECK'].str.split(';').str[0].astype(int)

df_learning = df_filter[[
    'GENDER',  # 성별
    'AGE_GRP',  # 연령대
    # 여행스타일
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6',
    'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',  # 여행동기 (3까지 있음)
    'TRAVEL_COMPANIONS_NUM',  # 동반자 수
    'TRAVEL_MISSION_INT',  # 여행 목적 최우선순위
    'VISIT_AREA_NM',  # 방문지명
    'VISIT_AREA_TYPE_CD', # 방문지 유형 코드
    'DGSTFN',  # 만족도
]]

df_learning = df_learning.dropna()

df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')
df_learning = df_learning[df_learning['VISIT_AREA_TYPE_CD'].isin(range(1, 9))]

  df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')


In [31]:
# 범주형 변수 목록 (CatBoost에 전달)
cat_features = ['GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
                'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
                'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'VISIT_AREA_TYPE_CD',
                'TRAVEL_COMPANIONS_NUM', 'TRAVEL_MISSION_INT']
df_learning[cat_features] = df_learning[cat_features].astype('int32')
X = df_learning[cat_features]
y = df_learning['DGSTFN']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

param_grid = {
    'iterations': [500, 1000],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1]
}

In [44]:
cat_boost = CatBoostRegressor(cat_features=cat_features, verbose=0)
# cat_boost.fit(X_train, y_train)

grid_search = GridSearchCV(cat_boost, param_grid, cv=5, scoring='r2')
grid_search.fit(X, y, sample_weight=np.where(y <= 3, 10, 1))
print("최적 파라미터:", grid_search.best_params_)

최적 파라미터: {'depth': 10, 'iterations': 1000, 'learning_rate': 0.1}


In [37]:
# 평가
y_pred_train = cat_boost.predict(X_train)
y_pred_test = cat_boost.predict(X_test)

In [38]:
print("훈련 점수 (R²):", r2_score(y_train, y_pred_train))
print("테스트 점수 (R²):", r2_score(y_test, y_pred_test))
print("훈련 RMSE:", root_mean_squared_error(y_train, y_pred_train, squared=False))
print("테스트 RMSE:", root_mean_squared_error(y_test, y_pred_test, squared=False))

훈련 점수 (R²): 0.27851779254813
테스트 점수 (R²): 0.05502653981425354
훈련 RMSE: 0.6913850974606907
테스트 RMSE: 0.8025361827270289




In [41]:
# K-Fold 설정 (K=5)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# 결과 저장용 리스트
train_r2_scores = []
test_r2_scores = []
train_rmse_scores = []
test_rmse_scores = []

# K-Fold 학습 및 평가
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 샘플 가중치
    sample_weights = np.where(y_train <= 3, 10, 1)

    # CatBoost 모델
    cat_model = CatBoostRegressor(
        iterations=1000,
        depth=6,
        learning_rate=0.1,
        random_seed=42,
        cat_features=cat_features,
        verbose=0  # 출력 최소화
    )

    # 학습
    cat_model.fit(X_train, y_train, sample_weight=sample_weights)

    # 예측
    y_train_pred = cat_model.predict(X_train)
    y_test_pred = cat_model.predict(X_test)

    # 평가
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = root_mean_squared_error(y_train, y_train_pred, squared=False)
    test_rmse = root_mean_squared_error(y_test, y_test_pred, squared=False)

    train_r2_scores.append(train_r2)
    test_r2_scores.append(test_r2)
    train_rmse_scores.append(train_rmse)
    test_rmse_scores.append(test_rmse)

# 결과 출력
print("훈련 R² 평균:", np.mean(train_r2_scores), "표준편차:", np.std(train_r2_scores))
print("테스트 R² 평균:", np.mean(test_r2_scores), "표준편차:", np.std(test_r2_scores))
print("훈련 RMSE 평균:", np.mean(train_rmse_scores), "표준편차:", np.std(train_rmse_scores))
print("테스트 RMSE 평균:", np.mean(test_rmse_scores), "표준편차:", np.std(test_rmse_scores))



훈련 R² 평균: -0.5162281210233252 표준편차: 0.027407033185964703
테스트 R² 평균: -0.7155182458402003 표준편차: 0.07186657890452681
훈련 RMSE 평균: 1.0058385083830332 표준편차: 0.010234000305692393
테스트 RMSE 평균: 1.0691336903109563 표준편차: 0.010544343926380735




In [46]:
X.shape

(30772, 14)

In [None]:
best_model = grid_search.best_estimator_

# 전체 데이터 예측
y_pred = best_model.predict(X)
print("전체 데이터 R²:", r2_score(y, y_pred))
print("전체 데이터 RMSE:", root_mean_squared_error(y, y_pred))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best_model.fit(X_train, y_train, sample_weight=np.where(y_train <= 3, 10, 1))
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
print("훈련 R²:", r2_score(y_train, y_train_pred))
print("테스트 R²:", r2_score(y_test, y_test_pred))
print("훈련 RMSE:", root_mean_squared_error(y_train, y_train_pred))
print("테스트 RMSE:", root_mean_squared_error(y_test, y_test_pred))

전체 데이터 R²: 0.21874904142807305
전체 데이터 RMSE: 0.7220314910684418
