In [11]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier

In [2]:
# 데이터 로드 및 전처리
df = pd.read_csv('../csv/visit_all_in_one.csv', low_memory=False)
df_filter = df[~df['TRAVEL_MISSION_CHECK'].isnull()].copy()
df_filter['TRAVEL_MISSION_INT'] = df_filter['TRAVEL_MISSION_CHECK'].str.split(';').str[0].astype(int)

In [3]:
# 학습 데이터 준비
df_learning = df_filter[[
    'GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
    'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
    'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'TRAVEL_COMPANIONS_NUM',
    'TRAVEL_MISSION_INT', 'VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD', 'DGSTFN'
]].dropna()

df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')
df_learning = df_learning[df_learning['VISIT_AREA_TYPE_CD'].isin(range(1, 9))]

# 범주형 변수 목록
cat_features = [
    'GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
    'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
    'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'VISIT_AREA_TYPE_CD',
    'TRAVEL_COMPANIONS_NUM', 'TRAVEL_MISSION_INT'
]

  df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')


In [4]:
# VISIT_AREA_NM을 숫자 코드로 변환
le = LabelEncoder()
df_learning['VISIT_AREA_NM_CODE'] = le.fit_transform(df_learning['VISIT_AREA_NM'])

# 학습에 사용할 특징
cat_features_extended = cat_features + ['VISIT_AREA_NM_CODE']
X = df_learning[cat_features_extended].astype('int32')
y = df_learning['DGSTFN']

# SMOTE 적용
smote = SMOTE(sampling_strategy={1.0: 2000, 2.0: 2000, 3.0: 4000}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled = np.round(X_resampled).astype('int32')

# 학습/테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

In [12]:
# 하이퍼파라미터 그리드 정의
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.1, 0.3],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

# CatBoost 모델 초기화
base_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    cat_features=cat_features_extended,
    random_seed=42,
    verbose=0
)

# GridSearchCV 설정
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# GridSearchCV 실행
print("Starting GridSearchCV (Es timated time: 24~27 minutes)...")
grid_search.fit(X_train, y_train)

# 최적 파라미터 및 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Starting GridSearchCV (Es timated time: 24~27 minutes)...
Fitting 3 folds for each of 54 candidates, totalling 162 fits


KeyboardInterrupt: 

In [5]:

# CatBoost 모델 학습
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)
model.fit(X_train, y_train, cat_features=cat_features_extended, eval_set=(X_test, y_test))

# 전체 테스트 셋 성능 평가
y_pred = model.predict(X_test)
print("Full Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.4598460	test: 0.4600117	best: 0.4600117 (0)	total: 205ms	remaining: 3m 25s
100:	learn: 0.5368701	test: 0.5255400	best: 0.5255400 (100)	total: 12.6s	remaining: 1m 51s
200:	learn: 0.5619002	test: 0.5407180	best: 0.5411559 (186)	total: 25s	remaining: 1m 39s
300:	learn: 0.5821505	test: 0.5477233	best: 0.5477233 (287)	total: 37.6s	remaining: 1m 27s
400:	learn: 0.6032400	test: 0.5537069	best: 0.5537069 (400)	total: 50.4s	remaining: 1m 15s
500:	learn: 0.6205714	test: 0.5560420	best: 0.5564799 (498)	total: 1m 3s	remaining: 1m 2s
600:	learn: 0.6391068	test: 0.5564799	best: 0.5576474 (529)	total: 1m 15s	remaining: 50.4s
700:	learn: 0.6558543	test: 0.5580852	best: 0.5583771 (619)	total: 1m 28s	remaining: 37.9s
800:	learn: 0.6712884	test: 0.5576474	best: 0.5605663 (758)	total: 1m 41s	remaining: 25.3s
900:	learn: 0.6889116	test: 0.5608581	best: 0.5623176 (885)	total: 1m 54s	remaining: 12.6s
999:	learn: 0.7052213	test: 0.5631932	best: 0.5645067 (960)	total: 2m 7s	remaining: 0us

bestTest

In [6]:

# 추천 함수
def recommend_top_travel_destinations(user_input, model, df_learning, top_n=10):
    destinations = df_learning[['VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD']].drop_duplicates()
    destinations['VISIT_AREA_NM_CODE'] = le.transform(destinations['VISIT_AREA_NM'])

    prediction_data = []
    for _, row in destinations.iterrows():
        combined_input = user_input.copy()
        combined_input['VISIT_AREA_TYPE_CD'] = row['VISIT_AREA_TYPE_CD']
        combined_input['VISIT_AREA_NM_CODE'] = row['VISIT_AREA_NM_CODE']
        prediction_data.append(combined_input)

    prediction_df = pd.DataFrame(prediction_data, columns=cat_features_extended)
    prediction_df[cat_features_extended] = prediction_df[cat_features_extended].astype('int32')

    proba = model.predict_proba(prediction_df)
    prob_5 = proba[:, 4]  # 5.0 확률 (인덱스 4)

    recommendations = pd.DataFrame({
        'VISIT_AREA_NM': destinations['VISIT_AREA_NM'],
        'VISIT_AREA_TYPE_CD': destinations['VISIT_AREA_TYPE_CD'],
        'Probability_5.0': prob_5
    })
    return recommendations.nlargest(top_n, 'Probability_5.0')

In [7]:

# 예시 사용자 입력
user_input = {
    'GENDER': 1, 'AGE_GRP': 3, 'TRAVEL_STYL_1': 1, 'TRAVEL_STYL_2': 2,
    'TRAVEL_STYL_3': 1, 'TRAVEL_STYL_4': 2, 'TRAVEL_STYL_5': 1,
    'TRAVEL_STYL_6': 1, 'TRAVEL_STYL_7': 1, 'TRAVEL_STYL_8': 2,
    'TRAVEL_MOTIVE_1': 1, 'TRAVEL_COMPANIONS_NUM': 2, 'TRAVEL_MISSION_INT': 1
}

# 추천 실행
recommendations = recommend_top_travel_destinations(user_input, model, df_learning)
print("\nTop 10 Recommended Destinations:")
print(recommendations)

# 테스트 셋 샘플 예측
n_samples = 10
sample_indices = np.random.choice(X_test.index, size=n_samples, replace=False)
X_test_sample = X_test.loc[sample_indices]
y_test_sample = y_test.loc[sample_indices]

y_pred_sample = model.predict(X_test_sample)
proba_sample = model.predict_proba(X_test_sample)
prob_5_sample = proba_sample[:, 4]

comparison_df = pd.DataFrame({
    'Actual_Satisfaction': y_test_sample,
    'Predicted_Satisfaction': y_pred_sample.flatten(),
    'Probability_5.0': prob_5_sample
}, index=sample_indices)


Top 10 Recommended Destinations:
       VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
31651   태화강국가정원 십리대숲                   7         0.408533
103015       한라산국립공원                   1         0.403646
760              덕수궁                   2         0.391664
70438    전북대학교 전주캠퍼스                   7         0.381596
106101       서우봉 산책로                   7         0.379590
115855            우도                   7         0.378461
110989         스누피가든                   7         0.371141
102878        쏠비치 진도                   7         0.367684
30773   태화강국가정원 십리대숲                   1         0.363457
68313         담양관방제림                   7         0.361022


In [8]:

print("\nTest Set Sample Predictions vs Actual:")
print(comparison_df)

print("\nRecommendations for Each Sample:")
for idx in sample_indices:
    user_input = X_test_sample.loc[idx].to_dict()
    user_input_clean = {k: v for k, v in user_input.items() if k != 'VISIT_AREA_TYPE_CD'}
    recommendations = recommend_top_travel_destinations(user_input_clean, model, df_learning, top_n=3)
    print(f"\nSample {idx}:")
    print(f"Actual: {y_test_sample.loc[idx]}, Predicted: {y_pred_sample[sample_indices.tolist().index(idx)]}, Prob 5.0: {prob_5_sample[sample_indices.tolist().index(idx)]:.4f}")
    print("Top 3 Recommended Destinations:")
    print(recommendations)


Test Set Sample Predictions vs Actual:
       Actual_Satisfaction  Predicted_Satisfaction  Probability_5.0
11227                  4.0                     5.0         0.667212
25756                  3.0                     4.0         0.343412
13022                  4.0                     5.0         0.458636
26540                  2.0                     4.0         0.163053
7212                   5.0                     5.0         0.850229
13722                  5.0                     5.0         0.603735
22926                  5.0                     5.0         0.797065
30408                  4.0                     4.0         0.228389
8087                   4.0                     5.0         0.579189
10917                  5.0                     5.0         0.675757

Recommendations for Each Sample:

Sample 11227:
Actual: 4.0, Predicted: [5.], Prob 5.0: 0.6672
Top 3 Recommended Destinations:
       VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
113270       한라산국립공원      