In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error, \
    mean_squared_error  # 수정: mean_squared_error 대신 root_mean_squared_error 사용
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, SMOTENC

In [3]:
df = pd.read_csv('../../csv/visit_all_in_one.csv', low_memory=False)

df_filter = df[~df['TRAVEL_MISSION_CHECK'].isnull()].copy()  # 'TRAVEL_MISSION_CHECK' 컬럼에 결측치가 없는 행만을 복사

In [4]:
# 미션 = 여행목적으로 보여짐 한 컬럼에 최대 3개가 있지만 일단 맨 처음 1개만 사용
df_filter.loc[:, 'TRAVEL_MISSION_INT'] = df_filter['TRAVEL_MISSION_CHECK'].str.split(';').str[0].astype(int)

df_learning = df_filter[[
    'GENDER',  # 성별
    'AGE_GRP',  # 연령대
    # 여행스타일
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6',
    'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',  # 여행동기 (3까지 있음)
    'TRAVEL_COMPANIONS_NUM',  # 동반자 수
    'TRAVEL_MISSION_INT',  # 여행 목적 최우선순위
    'VISIT_AREA_NM',  # 방문지명
    'VISIT_AREA_TYPE_CD',  # 방문지 유형 코드
    'DGSTFN',  # 만족도
]]

df_learning = df_learning.dropna()

df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')
df_learning = df_learning[df_learning['VISIT_AREA_TYPE_CD'].isin(range(1, 9))]

  df_learning['GENDER'] = df_learning['GENDER'].replace({'남': 1, '여': 0}).astype('int32')


In [19]:
# 범주형 변수 목록 (CatBoost에 전달)
cat_features = ['GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
                'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
                'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'VISIT_AREA_TYPE_CD',
                'TRAVEL_COMPANIONS_NUM', 'TRAVEL_MISSION_INT']
df_learning[cat_features] = df_learning[cat_features].astype('int32')
X = df_learning[cat_features]
y = df_learning['DGSTFN']

In [13]:
# # 학습 데이터와 테스트 데이터 분리 (80% 학습, 20% 테스트)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# class_weights = {1.0: 10, 2.0: 5, 3.0: 2, 4.0: 1, 5.0: 1}
#
# # CatBoost 분류기 초기화
# model = CatBoostClassifier(
#     class_weights=class_weights,
#     iterations=1000,        # 반복 횟수
#     learning_rate=0.1,      # 학습률
#     depth=6,                # 트리 깊이
#     loss_function='MultiClass',  # 다중 클래스 분류
#     eval_metric='Accuracy',      # 평가 지표
#     random_seed=42,
#     verbose=100             # 학습 과정 출력 간격
# )
#
# # 모델 학습
# model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_test, y_test))
#
# # 테스트 데이터로 예측
# y_pred = model.predict(X_test)
#
# # 모델 성능 평가
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.2331735	test: 0.2276168	best: 0.2276168 (0)	total: 102ms	remaining: 1m 41s
100:	learn: 0.3015309	test: 0.2512058	best: 0.2512058 (100)	total: 9.09s	remaining: 1m 20s
200:	learn: 0.3174321	test: 0.2515357	best: 0.2558182 (117)	total: 15.5s	remaining: 1m 1s
300:	learn: 0.3267609	test: 0.2561265	best: 0.2585661 (298)	total: 22.5s	remaining: 52.3s
400:	learn: 0.3367221	test: 0.2541472	best: 0.2594715 (334)	total: 29.2s	remaining: 43.6s
500:	learn: 0.3438879	test: 0.2555484	best: 0.2594715 (334)	total: 35.5s	remaining: 35.4s
600:	learn: 0.3539931	test: 0.2578886	best: 0.2594715 (334)	total: 42.2s	remaining: 28s
700:	learn: 0.3617042	test: 0.2529266	best: 0.2594715 (334)	total: 49.6s	remaining: 21.2s
800:	learn: 0.3678490	test: 0.2497489	best: 0.2594715 (334)	total: 57.1s	remaining: 14.2s
900:	learn: 0.3807880	test: 0.2523764	best: 0.2594715 (334)	total: 1m 4s	remaining: 7.05s
999:	learn: 0.4021071	test: 0.2476696	best: 0.2594715 (334)	total: 1m 13s	remaining: 0us

bestTest = 0.2

In [24]:
# SMOTE 적용
smote = SMOTE(sampling_strategy={1.0: 2000, 2.0: 2000, 3.0: 4000}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 학습/테스트 데이터 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# CatBoost 분류기 초기화
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)

# 모델 학습
model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_test, y_test))

# 테스트 데이터로 예측
y_pred = model.predict(X_test)

# 모델 성능 평가
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.4305105	test: 0.4308231	best: 0.4308231 (0)	total: 31.8ms	remaining: 31.7s
100:	learn: 0.4879410	test: 0.4632224	best: 0.4649737 (97)	total: 13.2s	remaining: 1m 57s
200:	learn: 0.5275660	test: 0.4953298	best: 0.4956217 (196)	total: 25.2s	remaining: 1m 40s
300:	learn: 0.5545664	test: 0.5089025	best: 0.5096322 (293)	total: 37.3s	remaining: 1m 26s
400:	learn: 0.5778816	test: 0.5090485	best: 0.5126970 (354)	total: 49.6s	remaining: 1m 14s
500:	learn: 0.6033860	test: 0.5140105	best: 0.5154699 (493)	total: 1m 2s	remaining: 1m 1s
600:	learn: 0.6253877	test: 0.5179510	best: 0.5188266 (597)	total: 1m 14s	remaining: 49.3s
700:	learn: 0.6439961	test: 0.5248103	best: 0.5255400 (687)	total: 1m 26s	remaining: 36.9s
800:	learn: 0.6619477	test: 0.5296264	best: 0.5302102 (793)	total: 1m 38s	remaining: 24.6s
900:	learn: 0.6808115	test: 0.5350263	best: 0.5360479 (892)	total: 1m 51s	remaining: 12.2s
999:	learn: 0.6983617	test: 0.5375073	best: 0.5375073 (998)	total: 2m 3s	remaining: 0us

bestTes

In [25]:
X['TRAVEL_STYL_1'].unique()

array([4, 7, 6, 3, 2, 5, 1])

In [None]:
def recommend_top_travel_destinations(user_input, model, df_learning, top_n=10):
    # 사용자 입력에서 VISIT_AREA_TYPE_CD 제외
    input_features = [f for f in cat_features if f != 'VISIT_AREA_TYPE_CD']
    input_df = pd.DataFrame([user_input], columns=input_features)
    input_df[input_features] = input_df[input_features].astype('int32')

    # 고유한 VISIT_AREA_NM과 VISIT_AREA_TYPE_CD 쌍 추출
    destinations = df_learning[['VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD']].drop_duplicates()

    # 각 여행지에 대해 사용자 입력값과 결합
    prediction_data = []
    for _, row in destinations.iterrows():
        combined_input = user_input.copy()
        combined_input['VISIT_AREA_TYPE_CD'] = row['VISIT_AREA_TYPE_CD']
        prediction_data.append(combined_input)

    # 예측용 DataFrame 생성
    prediction_df = pd.DataFrame(prediction_data, columns=cat_features)
    prediction_df[cat_features] = prediction_df[cat_features].astype('int32')

    # 5.0 확률 예측
    proba = model.predict_proba(prediction_df)
    prob_5 = proba[:, 4]  # 5.0은 인덱스 4 (1.0, 2.0, 3.0, 4.0, 5.0 순)

    # 추천 결과
    recommendations = pd.DataFrame({
        'VISIT_AREA_NM': destinations['VISIT_AREA_NM'],
        'VISIT_AREA_TYPE_CD': destinations['VISIT_AREA_TYPE_CD'],
        'Probability_5.0': prob_5
    })
    return recommendations.nlargest(top_n, 'Probability_5.0')

In [30]:

# 예시 사용자 입력
user_input = {
    'GENDER': 1,
    'AGE_GRP': 3,
    'TRAVEL_STYL_1': 1,
    'TRAVEL_STYL_2': 2,
    'TRAVEL_STYL_3': 1,
    'TRAVEL_STYL_4': 6,
    'TRAVEL_STYL_5': 4,
    'TRAVEL_STYL_6': 2,
    'TRAVEL_STYL_7': 3,
    'TRAVEL_STYL_8': 1,
    'TRAVEL_MOTIVE_1': 1,
    'TRAVEL_COMPANIONS_NUM': 2,
    'TRAVEL_MISSION_INT': 1
}

# 추천 실행
recommendations = recommend_top_travel_destinations(user_input, model, df_learning)
print("Top 10 Recommended Destinations:")
print(recommendations)

Top 10 Recommended Destinations:
                VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
0                      프로방스마을                   7         0.386938
5                         청계천                   7         0.386938
12                    삼송역 3호선                   7         0.386938
145                  오이도 빨강등대                   7         0.386938
186                    오이도 바다                   7         0.386938
195                      물의정원                   7         0.386938
318                     금융감독원                   7         0.386938
375  서울특별시 광진구 광장동 워커힐 가는 산책로                   7         0.386938
500              마포 걷고싶은길 4코스                   7         0.386938
586               남한산성둘레길 2코스                   7         0.386938


In [31]:
# 테스트 셋에서 샘플 추출 (예: 10개)
n_samples = 10
sample_indices = np.random.choice(X_test.index, size=n_samples, replace=False)
X_test_sample = X_test.loc[sample_indices]
y_test_sample = y_test.loc[sample_indices]

# 예측 수행
y_pred_sample = model.predict(X_test_sample)
proba_sample = model.predict_proba(X_test_sample)
prob_5_sample = proba_sample[:, 4]  # 5.0 확률 (인덱스 4)

# 결과 비교를 위한 DataFrame 생성
comparison_df = pd.DataFrame({
    'Actual_Satisfaction': y_test_sample,
    'Predicted_Satisfaction': y_pred_sample.flatten(),  # 1D 배열로 변환
    'Probability_5.0': prob_5_sample
}, index=sample_indices)

# 추천 함수 (VISIT_AREA_TYPE_CD를 사용자 입력에서 제외)
def recommend_top_travel_destinations(user_input, model, df_learning, top_n=5):
    input_features = [f for f in cat_features if f != 'VISIT_AREA_TYPE_CD']
    input_df = pd.DataFrame([user_input], columns=input_features)
    input_df[input_features] = input_df[input_features].astype('int32')

    # 고유한 VISIT_AREA_NM과 VISIT_AREA_TYPE_CD 쌍 추출
    destinations = df_learning[['VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD']].drop_duplicates()

    # 각 여행지에 대해 사용자 입력값과 결합
    prediction_data = []
    for _, row in destinations.iterrows():
        combined_input = user_input.copy()
        combined_input['VISIT_AREA_TYPE_CD'] = row['VISIT_AREA_TYPE_CD']
        prediction_data.append(combined_input)

    prediction_df = pd.DataFrame(prediction_data, columns=cat_features)
    prediction_df[cat_features] = prediction_df[cat_features].astype('int32')

    # 5.0 확률 예측
    proba = model.predict_proba(prediction_df)
    prob_5 = proba[:, 4]

    # 추천 결과
    recommendations = pd.DataFrame({
        'VISIT_AREA_NM': destinations['VISIT_AREA_NM'],
        'VISIT_AREA_TYPE_CD': destinations['VISIT_AREA_TYPE_CD'],
        'Probability_5.0': prob_5
    })
    return recommendations.nlargest(top_n, 'Probability_5.0')

# 샘플별 예측 및 추천 출력
print("Test Set Sample Predictions vs Actual:")
print(comparison_df)
print("\nRecommendations for Each Sample:")
for idx in sample_indices:
    user_input = X_test_sample.loc[idx].to_dict()
    # VISIT_AREA_TYPE_CD는 추천 시 동적으로 추가되므로 입력에서 제외
    user_input_clean = {k: v for k, v in user_input.items() if k != 'VISIT_AREA_TYPE_CD'}
    recommendations = recommend_top_travel_destinations(user_input_clean, model, df_learning, top_n=3)
    print(f"\nSample {idx}:")
    print(f"Actual: {y_test_sample.loc[idx]}, Predicted: {y_pred_sample[sample_indices.tolist().index(idx)]}, Prob 5.0: {prob_5_sample[sample_indices.tolist().index(idx)]:.4f}")
    print("Top 3 Recommended Destinations:")
    print(recommendations)

# 전체 테스트 셋 성능 (참고용)
y_pred_full = model.predict(X_test)
print("\nFull Test Set Accuracy:", accuracy_score(y_test, y_pred_full))
print("Full Test Set Classification Report:\n", classification_report(y_test, y_pred_full))

Test Set Sample Predictions vs Actual:
       Actual_Satisfaction  Predicted_Satisfaction  Probability_5.0
29612                  4.0                     4.0         0.208904
30944                  1.0                     1.0         0.023610
3268                   4.0                     4.0         0.385208
4209                   5.0                     5.0         0.546451
27450                  5.0                     5.0         0.441858
31032                  1.0                     1.0         0.011386
22524                  4.0                     5.0         0.409238
29082                  4.0                     4.0         0.275620
27261                  5.0                     5.0         0.759519
27592                  5.0                     5.0         0.348882

Recommendations for Each Sample:

Sample 29612:
Actual: 4.0, Predicted: [4.], Prob 5.0: 0.2089
Top 3 Recommended Destinations:
     VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
285     워터플레이 수상레저           

In [33]:
# 학습 데이터에 VISIT_AREA_NM 추가
le = LabelEncoder()
X_extended = df_learning[cat_features + ['VISIT_AREA_NM']]
X_extended['VISIT_AREA_NM'] = le.fit_transform(df_learning['VISIT_AREA_NM'])
X_extended = X_extended.astype('int32')  # VISIT_AREA_NM은 인덱스나 코드로 변환 필요
cat_features_extended = cat_features + ['VISIT_AREA_NM']

# SMOTE 적용 및 학습
smote = SMOTE(sampling_strategy={1.0: 2000, 2.0: 2000, 3.0: 4000}, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_extended, y)
X_resampled = np.round(X_resampled).astype('int32')
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)
model.fit(X_train, y_train, cat_features=cat_features_extended, eval_set=(X_test, y_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_extended['VISIT_AREA_NM'] = le.fit_transform(df_learning['VISIT_AREA_NM'])


0:	learn: 0.4598460	test: 0.4600117	best: 0.4600117 (0)	total: 72.6ms	remaining: 1m 12s
100:	learn: 0.5368701	test: 0.5255400	best: 0.5255400 (100)	total: 15.1s	remaining: 2m 14s
200:	learn: 0.5619002	test: 0.5407180	best: 0.5411559 (186)	total: 28.1s	remaining: 1m 51s
300:	learn: 0.5821505	test: 0.5477233	best: 0.5477233 (287)	total: 41s	remaining: 1m 35s
400:	learn: 0.6032400	test: 0.5537069	best: 0.5537069 (400)	total: 54.3s	remaining: 1m 21s
500:	learn: 0.6205714	test: 0.5560420	best: 0.5564799 (498)	total: 1m 7s	remaining: 1m 7s
600:	learn: 0.6391068	test: 0.5564799	best: 0.5576474 (529)	total: 1m 20s	remaining: 53.5s
700:	learn: 0.6558543	test: 0.5580852	best: 0.5583771 (619)	total: 1m 33s	remaining: 39.9s
800:	learn: 0.6712884	test: 0.5576474	best: 0.5605663 (758)	total: 1m 46s	remaining: 26.5s
900:	learn: 0.6889116	test: 0.5608581	best: 0.5623176 (885)	total: 2m	remaining: 13.2s
999:	learn: 0.7052213	test: 0.5631932	best: 0.5645067 (960)	total: 2m 13s	remaining: 0us

bestTest =

<catboost.core.CatBoostClassifier at 0x1f73e12d4f0>

In [35]:
def recommend_top_travel_destinations(user_input, model, df_learning, top_n=10):
    # 고유 VISIT_AREA_NM과 VISIT_AREA_TYPE_CD
    destinations = df_learning[['VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD']].drop_duplicates()
    destinations['VISIT_AREA_NM_CODE'] = le.transform(destinations['VISIT_AREA_NM'])

    # 예측 데이터 생성
    prediction_data = []
    for _, row in destinations.iterrows():
        combined_input = user_input.copy()
        combined_input['VISIT_AREA_TYPE_CD'] = row['VISIT_AREA_TYPE_CD']
        combined_input['VISIT_AREA_NM'] = row['VISIT_AREA_NM_CODE']
        prediction_data.append(combined_input)

    prediction_df = pd.DataFrame(prediction_data, columns=cat_features_extended)
    prediction_df[cat_features_extended] = prediction_df[cat_features_extended].astype('int32')

    # 5.0 확률 예측
    proba = model.predict_proba(prediction_df)
    prob_5 = proba[:, 4]

    # 추천 결과
    recommendations = pd.DataFrame({
        'VISIT_AREA_NM': destinations['VISIT_AREA_NM'],
        'VISIT_AREA_TYPE_CD': destinations['VISIT_AREA_TYPE_CD'],
        'Probability_5.0': prob_5
    })
    return recommendations.nlargest(top_n, 'Probability_5.0')

# 예시 사용자 입력
user_input = {
    'GENDER': 1, 'AGE_GRP': 3, 'TRAVEL_STYL_1': 1, 'TRAVEL_STYL_2': 2,
    'TRAVEL_STYL_3': 1, 'TRAVEL_STYL_4': 2, 'TRAVEL_STYL_5': 1,
    'TRAVEL_STYL_6': 1, 'TRAVEL_STYL_7': 1, 'TRAVEL_STYL_8': 2,
    'TRAVEL_MOTIVE_1': 1, 'TRAVEL_COMPANIONS_NUM': 2, 'TRAVEL_MISSION_INT': 1
}
recommendations = recommend_top_travel_destinations(user_input, model, df_learning)
print(recommendations)

       VISIT_AREA_NM  VISIT_AREA_TYPE_CD  Probability_5.0
31651   태화강국가정원 십리대숲                   7         0.408533
103015       한라산국립공원                   1         0.403646
760              덕수궁                   2         0.391664
70438    전북대학교 전주캠퍼스                   7         0.381596
106101       서우봉 산책로                   7         0.379590
115855            우도                   7         0.378461
110989         스누피가든                   7         0.371141
102878        쏠비치 진도                   7         0.367684
30773   태화강국가정원 십리대숲                   1         0.363457
68313         담양관방제림                   7         0.361022
