In [7]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error, \
    mean_squared_error  # 수정: mean_squared_error 대신 root_mean_squared_error 사용

In [2]:
df = pd.read_csv('../../csv/visit_all_in_one.csv')

df_filter = df[~df['TRAVEL_MISSION_CHECK'].isnull()].copy()  # 'TRAVEL_MISSION_CHECK' 컬럼에 결측치가 없는 행만을 복사

  df = pd.read_csv('../csv/visit_all_in_one.csv')


In [3]:
# 미션 = 여행목적으로 보여짐 한 컬럼에 최대 3개가 있지만 일단 맨 처음 1개만 사용
df_filter.loc[:, 'TRAVEL_MISSION_INT'] = df_filter['TRAVEL_MISSION_CHECK'].str.split(';').str[0].astype(int)

df_learning = df_filter[[
    'GENDER',  # 성별
    'AGE_GRP',  # 연령대
    # 여행스타일
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6',
    'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',  # 여행동기 (3까지 있음)
    'TRAVEL_COMPANIONS_NUM',  # 동반자 수
    'TRAVEL_MISSION_INT',  # 여행 목적 최우선순위
    'VISIT_AREA_NM',  # 방문지명
    'VISIT_AREA_TYPE_CD',  # 방문지 유형 코드
    'DGSTFN',  # 만족도
]]

df_learning = df_learning.dropna()

df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')
df_learning = df_learning[df_learning['VISIT_AREA_TYPE_CD'].isin(range(1, 9))]

  df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')


In [4]:
# 범주형 변수 목록 (CatBoost에 전달)
cat_features = ['GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
                'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
                'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'VISIT_AREA_TYPE_CD',
                'TRAVEL_COMPANIONS_NUM', 'TRAVEL_MISSION_INT']
df_learning[cat_features] = df_learning[cat_features].astype('int32')
X = df_learning[cat_features]
y = df_learning['DGSTFN']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
# 데이터 비율 역수 기반 가중치
class_counts = {1: 985, 2: 1862, 3: 11622, 4: 32560, 5: 39879}
total = sum(class_counts.values())
weights = np.array([total / class_counts[y] if y in class_counts else 1 for y in y_train])

param_grid = {
    'iterations': [500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1]
}
train_pool = Pool(X_train, y_train, cat_features=cat_features, weight=weights)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

cat_boost = CatBoostRegressor(
    task_type="GPU",
    devices='0',
    verbose=0,
    early_stopping_rounds=50,
    loss_function='MAE'
)

grid_search_result = cat_boost.grid_search(
    param_grid,
    X=train_pool,
    cv=5,
    verbose=False
)

print("최적 파라미터:", grid_search_result['params'])
print("최적 모델의 CV 점수 (최소 RMSE):", min(grid_search_result['cv_results']['test-RMSE-mean']))

Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 1.509412702
bestIteration = 499


Default metric period is 5 because MAE is/are not implemented for GPU


KeyboardInterrupt: 

In [28]:
final_model = CatBoostRegressor(
    task_type="GPU",
    devices='0',
    verbose=0,
    **grid_search_result['params']
)
final_model.fit(train_pool)
y_pred = final_model.predict(test_pool)

for score in [1, 2, 3, 4, 5]:
    mask = y_test == score
    rmse = mean_squared_error(y_test[mask], y_pred[mask], squared=False)
    print(f"{score}점 RMSE: {rmse}")

TypeError: CatBoost.grid_search() missing 1 required positional argument: 'param_grid'

In [25]:
y_pred = cat_boost.predict(test_pool)
for score in [1, 2, 3, 4, 5]:
    mask = y_test == score
    rmse = mean_squared_error(y_test[mask], y_pred[mask], squared=False)
    print(f"{score}점 RMSE: {rmse}")

1점 RMSE: 2.4548208165749226
2점 RMSE: 1.433672484215067
3점 RMSE: 0.4655178833965014
4점 RMSE: 0.5893905103025215
5점 RMSE: 1.4698931558182577




In [17]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(task_type="GPU", devices='0')
print(model.get_param('task_type'))

GPU


In [37]:
# 평가
y_pred_train = cat_boost.predict(X_train)
y_pred_test = cat_boost.predict(X_test)

In [38]:
print("훈련 점수 (R²):", r2_score(y_train, y_pred_train))
print("테스트 점수 (R²):", r2_score(y_test, y_pred_test))
print("훈련 RMSE:", root_mean_squared_error(y_train, y_pred_train, squared=False))
print("테스트 RMSE:", root_mean_squared_error(y_test, y_pred_test, squared=False))

훈련 점수 (R²): 0.27851779254813
테스트 점수 (R²): 0.05502653981425354
훈련 RMSE: 0.6913850974606907
테스트 RMSE: 0.8025361827270289




In [41]:
# K-Fold 설정 (K=5)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# 결과 저장용 리스트
train_r2_scores = []
test_r2_scores = []
train_rmse_scores = []
test_rmse_scores = []

# K-Fold 학습 및 평가
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 샘플 가중치
    sample_weights = np.where(y_train <= 3, 10, 1)

    # CatBoost 모델
    cat_model = CatBoostRegressor(
        iterations=1000,
        depth=6,
        learning_rate=0.1,
        random_seed=42,
        cat_features=cat_features,
        verbose=0  # 출력 최소화
    )

    # 학습
    cat_model.fit(X_train, y_train, sample_weight=sample_weights)

    # 예측
    y_train_pred = cat_model.predict(X_train)
    y_test_pred = cat_model.predict(X_test)

    # 평가
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = root_mean_squared_error(y_train, y_train_pred, squared=False)
    test_rmse = root_mean_squared_error(y_test, y_test_pred, squared=False)

    train_r2_scores.append(train_r2)
    test_r2_scores.append(test_r2)
    train_rmse_scores.append(train_rmse)
    test_rmse_scores.append(test_rmse)

# 결과 출력
print("훈련 R² 평균:", np.mean(train_r2_scores), "표준편차:", np.std(train_r2_scores))
print("테스트 R² 평균:", np.mean(test_r2_scores), "표준편차:", np.std(test_r2_scores))
print("훈련 RMSE 평균:", np.mean(train_rmse_scores), "표준편차:", np.std(train_rmse_scores))
print("테스트 RMSE 평균:", np.mean(test_rmse_scores), "표준편차:", np.std(test_rmse_scores))



훈련 R² 평균: -0.5162281210233252 표준편차: 0.027407033185964703
테스트 R² 평균: -0.7155182458402003 표준편차: 0.07186657890452681
훈련 RMSE 평균: 1.0058385083830332 표준편차: 0.010234000305692393
테스트 RMSE 평균: 1.0691336903109563 표준편차: 0.010544343926380735




In [46]:
X.shape

(30772, 14)

In [49]:
best_model = grid_search.best_estimator_

# 전체 데이터 예측
y_pred = best_model.predict(X)
print("전체 데이터 R²:", r2_score(y, y_pred))
print("전체 데이터 RMSE:", root_mean_squared_error(y, y_pred))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best_model.fit(X_train, y_train, sample_weight=np.where(y_train <= 3, 10, 1))
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
print("훈련 R²:", r2_score(y_train, y_train_pred))
print("테스트 R²:", r2_score(y_test, y_test_pred))
print("훈련 RMSE:", root_mean_squared_error(y_train, y_train_pred))
print("테스트 RMSE:", root_mean_squared_error(y_test, y_test_pred))

전체 데이터 R²: 0.21874904142807305
전체 데이터 RMSE: 0.7220314910684418
훈련 R²: 0.2732779444814256
테스트 R²: -0.09269269993083662
훈련 RMSE: 0.6942567680792459
테스트 RMSE: 0.864182679123528


In [30]:
from catboost import CatBoostClassifier, Pool
import numpy as np

# 이진 분류로 변환 (4~5점: 1, 1~3점: 0)
y_train_binary = np.where(y_train >= 4, 1, 0)
y_test_binary = np.where(y_test >= 4, 1, 0)

train_pool = Pool(X_train, y_train_binary, cat_features=cat_features)
test_pool = Pool(X_test, y_test_binary, cat_features=cat_features)

cat_boost = CatBoostClassifier(
    task_type="GPU",
    devices='0',
    verbose=0,
    early_stopping_rounds=50,
    eval_metric='F1'  # 소수 클래스(0)와 다수 클래스(1) 균형 고려
)

param_grid = {
    'iterations': [500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1]
}

grid_search_result = cat_boost.grid_search(
    param_grid,
    X=train_pool,
    cv=5,
    verbose=False
)

print("최적 파라미터:", grid_search_result['params'])
print("최적 모델의 CV 점수 (최소 Logloss):", min(grid_search_result['cv_results']['test-Logloss-mean']))

bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
bestTest = 0.9201076149
bestIteration = 0
Training on fold [0/5]
bestTest = 0.9209911173
bestIteration = 0
Training on fold [1/5]
bestTest = 0.9209911173
bestIteration = 0
Training on fold [2/5]
bestTest = 0.9209911173
bestIteration = 0
Training on fold [3/5]
bestTest = 0.9209911173
be

In [10]:
# 이진 분류로 변환 (4~5점: 1, 1~3점: 0)
y_train_binary = np.where(y_train >= 4, 1, 0)
y_test_binary = np.where(y_test >= 4, 1, 0)

train_pool = Pool(X_train, y_train_binary, cat_features=cat_features)
test_pool = Pool(X_test, y_test_binary, cat_features=cat_features)

# 최적 모델로 재학습
final_model = CatBoostClassifier(
    devices='0',
    verbose=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    iterations=500,  # grid_search_result['params']['iterations']
    depth=6,  # grid_search_result['params']['depth']
    learning_rate=0.01  # grid_search_result['params']['learning_rate']
)
final_model.fit(train_pool)

# 사용자 입력 예시 (X_input: 새로운 사용자 데이터)
X_input = X_test.iloc[0:1]  # 예시로 첫 번째 테스트 데이터 사용
input_pool = Pool(X_input, cat_features=cat_features)

# 모든 여행지에 대해 예측 (가정: 여행지별로 데이터를 반복 생성)
travel_areas = range(1, 9)  # VISIT_AREA_TYPE_CD 범위
probs = []
for area in travel_areas:
    X_input['VISIT_AREA_TYPE_CD'] = area
    input_pool = Pool(X_input, cat_features=cat_features)
    prob = final_model.predict_proba(input_pool)[0][1]  # 4~5점일 확률
    probs.append((area, prob))

# 상위 10개 추천
top_10 = sorted(probs, key=lambda x: x[1], reverse=True)[:10]
print("추천 여행지 (VISIT_AREA_TYPE_CD):", [x[0] for x in top_10])

추천 여행지 (VISIT_AREA_TYPE_CD): [1, 5, 3, 6, 7, 2, 8, 4]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_input['VISIT_AREA_TYPE_CD'] = area


In [11]:
y_pred = final_model.predict(test_pool)
from sklearn.metrics import classification_report

print(classification_report(y_test_binary, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1134
           1       0.85      1.00      0.92      6559

    accuracy                           0.85      7693
   macro avg       0.43      0.50      0.46      7693
weighted avg       0.73      0.85      0.78      7693



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
