In [1]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier

In [2]:
# 데이터 로드 및 전처리
df = pd.read_csv('../../csv/visit_all_in_one.csv', low_memory=False)
df_filter = df[~df['TRAVEL_MISSION_CHECK'].isnull()].copy()
df_filter['TRAVEL_MISSION_INT'] = df_filter['TRAVEL_MISSION_CHECK'].str.split(';').str[0].astype(int)

In [8]:
# 학습 데이터 준비
df_learning = df_filter[[
    'GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
    'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
    'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'TRAVEL_COMPANIONS_NUM',
    'TRAVEL_MISSION_INT', 'VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD', 'DGSTFN'
]].dropna()

df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')
df_learning = df_learning[df_learning['VISIT_AREA_TYPE_CD'].isin(range(1, 9))]

# 범주형 변수 목록
cat_features = [
    'GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
    'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
    'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'VISIT_AREA_TYPE_CD',
    'TRAVEL_COMPANIONS_NUM', 'TRAVEL_MISSION_INT'
]

df_learning['TRAVEL_COMPANIONS_NUM'].unique()

  df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')


array([ 0.,  1.,  3.,  2.,  4.,  5.,  7.,  6., 11.,  9., 20.,  8., 14.,
       19., 10., 12., 13., 16., 15., 17.])

In [11]:
# VISIT_AREA_NM을 숫자 코드로 변환
le = LabelEncoder()
df_learning['VISIT_AREA_NM_CODE'] = le.fit_transform(df_learning['VISIT_AREA_NM'])

# 학습에 사용할 특징
cat_features_extended = cat_features + ['VISIT_AREA_NM_CODE']
X = df_learning[cat_features_extended].astype('int32')
y = df_learning['DGSTFN']

# SMOTE 적용
# 3.0 샘플 늘려 클래스 간 경계 명확화:
smote = SMOTE(sampling_strategy={1.0: 2000, 2.0: 2000, 3.0: 6000}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled = np.round(X_resampled).astype('int32')

# 학습/테스트 데이터 분리
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# 학습 데이터에서 검증 셋 분리 (early_stopping_rounds용)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

In [13]:
# 하이퍼파라미터 그리드 정의 (l2_leaf_reg 증가, 범위 축소)
param_grid = {
    'iterations': [400, 500, 600],  # 500 근처 탐색
    'learning_rate': [0.05, 0.1],   # 0.1 근처 탐색
    'depth': [8],                   # 최적값 고정
    'l2_leaf_reg': [5, 7, 10]      # 5~10으로 증가
}

class_weights = {1.0: 1, 2.0: 1, 3.0: 2, 4.0: 1, 5.0: 1.5}  # 5.0에 높은 가중치
# CatBoost 모델 초기화 (early_stopping_rounds 추가)
base_model = CatBoostClassifier(
    loss_function='MultiClass', # MultiClass 대신 MultiClassOneVsAll 사용 → 5.0을 이진 분류처럼 강화:
    eval_metric='Accuracy',
    cat_features=cat_features_extended,
    random_seed=42,
    task_type='GPU',
    early_stopping_rounds=50,  # 50번 동안 개선 없으면 중단
    class_weights=class_weights,
    verbose=100
)

# GridSearchCV 설정
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=1,  # GPU 사용 시 병렬 처리는 단일 작업으로 (설명 아래 참고)
    verbose=2
)

# GridSearchCV 실행 (검증 셋 전달)
print("Starting GridSearchCV with early stopping and increased l2_leaf_reg...")
grid_search.fit(X_train, y_train, eval_set=(X_val, y_val))

# 최적 파라미터 및 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# 테스트 셋으로 최종 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Starting GridSearchCV with early stopping and increased l2_leaf_reg...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
0:	learn: 0.4973564	test: 0.4927918	best: 0.4927918 (0)	total: 22.8ms	remaining: 9.08s
100:	learn: 0.5316291	test: 0.5055785	best: 0.5055785 (100)	total: 1.86s	remaining: 5.5s
200:	learn: 0.5690412	test: 0.5137270	best: 0.5140404 (199)	total: 3.58s	remaining: 3.54s
300:	learn: 0.6065477	test: 0.5184907	best: 0.5194309 (273)	total: 5.32s	remaining: 1.75s
399:	learn: 0.6447151	test: 0.5208098	best: 0.5219381 (380)	total: 7.03s	remaining: 0us
bestTest = 0.521938072
bestIteration = 380
Shrink model to first 381 iterations.
[CV] END depth=8, iterations=400, l2_leaf_reg=5, learning_rate=0.05; total time=   7.8s
0:	learn: 0.4932257	test: 0.4880281	best: 0.4880281 (0)	total: 20.3ms	remaining: 8.12s
100:	learn: 0.5250673	test: 0.5039489	best: 0.5041369 (98)	total: 1.82s	remaining: 5.39s
200:	learn: 0.5621253	test: 0.5092140	best: 0.5103422 (179)	total: 3.74s	remain

In [15]:
def recommend_top_destinations(user_input, model, df_learning, le, top_n=10, threshold=0.7):
    destinations = df_learning[['VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD']].drop_duplicates()
    destinations['VISIT_AREA_NM_CODE'] = le.transform(destinations['VISIT_AREA_NM'])

    prediction_data = []
    for _, row in destinations.iterrows():
        combined_input = user_input.copy()
        combined_input['VISIT_AREA_TYPE_CD'] = row['VISIT_AREA_TYPE_CD']
        combined_input['VISIT_AREA_NM_CODE'] = row['VISIT_AREA_NM_CODE']
        prediction_data.append(combined_input)

    prediction_df = pd.DataFrame(prediction_data, columns=cat_features_extended)
    proba = model.predict_proba(prediction_df)
    prob_5 = proba[:, 4]  # 5.0 확률

    recommendations = pd.DataFrame({
        'VISIT_AREA_NM': destinations['VISIT_AREA_NM'],
        'Probability_5.0': prob_5
    })
    return recommendations[recommendations['Probability_5.0'] >= threshold].nlargest(top_n, 'Probability_5.0')

In [23]:
proba = best_model.predict_proba(X_test)
test_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': best_model.predict(X_test).flatten(),
    'Prob_5.0': proba[:, 4]
})
test_5 = test_df[test_df['Actual'] == 5.0]
print("5.0 샘플 통계:")
print(test_5['Prob_5.0'].describe())
print("5.0 예측 비율:", (test_5['Predicted'] == 5.0).mean())

# 추가: 전체 테스트 셋의 5.0 확률 분포
print("\n전체 테스트 셋 5.0 확률 통계:")
print(test_df['Prob_5.0'].describe())

5.0 샘플 통계:
count    2921.000000
mean        0.542599
std         0.135862
min         0.156269
25%         0.442283
50%         0.538692
75%         0.639121
max         0.919522
Name: Prob_5.0, dtype: float64
5.0 예측 비율: 0.9024306744265662

전체 테스트 셋 5.0 확률 통계:
count    7252.000000
mean        0.451783
std         0.191623
min         0.001103
25%         0.360545
50%         0.471795
75%         0.578717
max         0.919522
Name: Prob_5.0, dtype: float64


In [32]:
# 추천 예시
user_input = {
    'GENDER': 1, 'AGE_GRP': 30, 'TRAVEL_STYL_1': 1, 'TRAVEL_STYL_2': 2,
    'TRAVEL_STYL_3': 1, 'TRAVEL_STYL_4': 4, 'TRAVEL_STYL_5': 5,
    'TRAVEL_STYL_6': 3, 'TRAVEL_STYL_7': 6, 'TRAVEL_STYL_8': 2,
    'TRAVEL_MOTIVE_1': 1, 'TRAVEL_COMPANIONS_NUM': 2, 'TRAVEL_MISSION_INT': 1
}
for feature in cat_features_extended:
    print(f"{feature} 범위: {X[feature].min()} ~ {X[feature].max()}, 입력값: {user_input.get(feature, '없음')}")

recs = recommend_top_destinations(user_input, best_model, df_learning, le, threshold=0.0)

print("Top Recommendations for 5.0 (threshold=0.7):")
print(recs)

GENDER 범위: 0 ~ 1, 입력값: 1
AGE_GRP 범위: 20 ~ 60, 입력값: 30
TRAVEL_STYL_1 범위: 1 ~ 7, 입력값: 1
TRAVEL_STYL_2 범위: 1 ~ 7, 입력값: 2
TRAVEL_STYL_3 범위: 1 ~ 7, 입력값: 1
TRAVEL_STYL_4 범위: 1 ~ 7, 입력값: 4
TRAVEL_STYL_5 범위: 1 ~ 7, 입력값: 5
TRAVEL_STYL_6 범위: 1 ~ 7, 입력값: 3
TRAVEL_STYL_7 범위: 1 ~ 7, 입력값: 6
TRAVEL_STYL_8 범위: 1 ~ 7, 입력값: 2
TRAVEL_MOTIVE_1 범위: 1 ~ 10, 입력값: 1
VISIT_AREA_TYPE_CD 범위: 1 ~ 8, 입력값: 없음
TRAVEL_COMPANIONS_NUM 범위: 0 ~ 20, 입력값: 2
TRAVEL_MISSION_INT 범위: 1 ~ 28, 입력값: 1
VISIT_AREA_NM_CODE 범위: 0 ~ 10099, 입력값: 없음
Top Recommendations for 5.0 (threshold=0.7):
        VISIT_AREA_NM  Probability_5.0
45705         만리포해수욕장         0.660161
109419           서빈백사         0.657755
5610            서울대공원         0.657419
102161          따라비오름         0.657419
103810      거제식물원 정글돔         0.657419
113300  한라산국립공원 영실탐방로         0.657419
10994           서울대공원         0.652171
68067       유월드루지테마파크         0.652171
146362      거제식물원 정글돔         0.652171
760               덕수궁         0.651033


In [33]:
best_model.save_model('catboost_model.cbm')

In [34]:
import pickle
# LabelEncoder 저장
with open('../label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
# df_learning 저장
df_learning.to_csv('df_learning.csv', index=False)

In [5]:

# CatBoost 모델 학습
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)
model.fit(X_train, y_train, cat_features=cat_features_extended, eval_set=(X_test, y_test))

# 전체 테스트 셋 성능 평가
y_pred = model.predict(X_test)
print("Full Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.4598460	test: 0.4600117	best: 0.4600117 (0)	total: 205ms	remaining: 3m 25s
100:	learn: 0.5368701	test: 0.5255400	best: 0.5255400 (100)	total: 12.6s	remaining: 1m 51s
200:	learn: 0.5619002	test: 0.5407180	best: 0.5411559 (186)	total: 25s	remaining: 1m 39s
300:	learn: 0.5821505	test: 0.5477233	best: 0.5477233 (287)	total: 37.6s	remaining: 1m 27s
400:	learn: 0.6032400	test: 0.5537069	best: 0.5537069 (400)	total: 50.4s	remaining: 1m 15s
500:	learn: 0.6205714	test: 0.5560420	best: 0.5564799 (498)	total: 1m 3s	remaining: 1m 2s
600:	learn: 0.6391068	test: 0.5564799	best: 0.5576474 (529)	total: 1m 15s	remaining: 50.4s
700:	learn: 0.6558543	test: 0.5580852	best: 0.5583771 (619)	total: 1m 28s	remaining: 37.9s
800:	learn: 0.6712884	test: 0.5576474	best: 0.5605663 (758)	total: 1m 41s	remaining: 25.3s
900:	learn: 0.6889116	test: 0.5608581	best: 0.5623176 (885)	total: 1m 54s	remaining: 12.6s
999:	learn: 0.7052213	test: 0.5631932	best: 0.5645067 (960)	total: 2m 7s	remaining: 0us

bestTest

In [6]:

# 추천 함수
def recommend_top_travel_destinations(user_input, model, df_learning, top_n=10):
    destinations = df_learning[['VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD']].drop_duplicates()
    destinations['VISIT_AREA_NM_CODE'] = le.transform(destinations['VISIT_AREA_NM'])

    prediction_data = []
    for _, row in destinations.iterrows():
        combined_input = user_input.copy()
        combined_input['VISIT_AREA_TYPE_CD'] = row['VISIT_AREA_TYPE_CD']
        combined_input['VISIT_AREA_NM_CODE'] = row['VISIT_AREA_NM_CODE']
        prediction_data.append(combined_input)

    prediction_df = pd.DataFrame(prediction_data, columns=cat_features_extended)
    prediction_df[cat_features_extended] = prediction_df[cat_features_extended].astype('int32')

    proba = model.predict_proba(prediction_df)
    prob_5 = proba[:, 4]  # 5.0 확률 (인덱스 4)

    recommendations = pd.DataFrame({
        'VISIT_AREA_NM': destinations['VISIT_AREA_NM'],
        'VISIT_AREA_TYPE_CD': destinations['VISIT_AREA_TYPE_CD'],
        'Probability_5.0': prob_5
    })
    return recommendations.nlargest(top_n, 'Probability_5.0')

In [7]:

# 예시 사용자 입력
user_input = {
    'GENDER': 1, 'AGE_GRP': 3, 'TRAVEL_STYL_1': 1, 'TRAVEL_STYL_2': 2,
    'TRAVEL_STYL_3': 1, 'TRAVEL_STYL_4': 2, 'TRAVEL_STYL_5': 1,
    'TRAVEL_STYL_6': 1, 'TRAVEL_STYL_7': 1, 'TRAVEL_STYL_8': 2,
    'TRAVEL_MOTIVE_1': 1, 'TRAVEL_COMPANIONS_NUM': 2, 'TRAVEL_MISSION_INT': 1
}

# 추천 실행
recommendations = recommend_top_travel_destinations(user_input, model, df_learning)
print("\nTop 10 Recommended Destinations:")
print(recommendations)

# 테스트 셋 샘플 예측
n_samples = 10
sample_indices = np.random.choice(X_test.index, size=n_samples, replace=False)
X_test_sample = X_test.loc[sample_indices]
y_test_sample = y_test.loc[sample_indices]

y_pred_sample = model.predict(X_test_sample)
proba_sample = model.predict_proba(X_test_sample)
prob_5_sample = proba_sample[:, 4]

comparison_df = pd.DataFrame({
    'Actual_Satisfaction': y_test_sample,
    'Predicted_Satisfaction': y_pred_sample.flatten(),
    'Probability_5.0': prob_5_sample
}, index=sample_indices)


Top 10 Recommended Destinations:
       VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
31651   태화강국가정원 십리대숲                   7         0.408533
103015       한라산국립공원                   1         0.403646
760              덕수궁                   2         0.391664
70438    전북대학교 전주캠퍼스                   7         0.381596
106101       서우봉 산책로                   7         0.379590
115855            우도                   7         0.378461
110989         스누피가든                   7         0.371141
102878        쏠비치 진도                   7         0.367684
30773   태화강국가정원 십리대숲                   1         0.363457
68313         담양관방제림                   7         0.361022


In [8]:

print("\nTest Set Sample Predictions vs Actual:")
print(comparison_df)

print("\nRecommendations for Each Sample:")
for idx in sample_indices:
    user_input = X_test_sample.loc[idx].to_dict()
    user_input_clean = {k: v for k, v in user_input.items() if k != 'VISIT_AREA_TYPE_CD'}
    recommendations = recommend_top_travel_destinations(user_input_clean, model, df_learning, top_n=3)
    print(f"\nSample {idx}:")
    print(f"Actual: {y_test_sample.loc[idx]}, Predicted: {y_pred_sample[sample_indices.tolist().index(idx)]}, Prob 5.0: {prob_5_sample[sample_indices.tolist().index(idx)]:.4f}")
    print("Top 3 Recommended Destinations:")
    print(recommendations)


Test Set Sample Predictions vs Actual:
       Actual_Satisfaction  Predicted_Satisfaction  Probability_5.0
11227                  4.0                     5.0         0.667212
25756                  3.0                     4.0         0.343412
13022                  4.0                     5.0         0.458636
26540                  2.0                     4.0         0.163053
7212                   5.0                     5.0         0.850229
13722                  5.0                     5.0         0.603735
22926                  5.0                     5.0         0.797065
30408                  4.0                     4.0         0.228389
8087                   4.0                     5.0         0.579189
10917                  5.0                     5.0         0.675757

Recommendations for Each Sample:

Sample 11227:
Actual: 4.0, Predicted: [5.], Prob 5.0: 0.6672
Top 3 Recommended Destinations:
       VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
113270       한라산국립공원      