In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest
import lightgbm as lgb
import numpy as np

# 데이터 읽기
museum_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/1. 문화시설/박물관전처리.csv'
culture_place_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/1. 문화시설/실내문화공간전처리.csv'
forest_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/2. 자연환경/forest_count_data.csv'
beach_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/2. 자연환경/modified_beach_data.csv'
accommodation_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/3. 숙박 시설/Final_Region_Facility_Counts_with_Renamed_Columns.csv'
shop_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/4. 편의 시설/아이 동반 쇼핑시설 지역별 시설 개수.csv'
park_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/4. 편의 시설/주차장 지역별 개수.csv'
play_fille_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/4. 편의 시설/아이 동반 놀이 시설 지역별 개수.csv'
trip_file_path = '/content/drive/MyDrive/문화 공모전/전처리 완료 데이터/국민여행조사 지역별 방문횟수 데이터.csv'

museum_data = pd.read_csv(museum_file_path)
culture_place_data = pd.read_csv(culture_place_file_path)
forest_data = pd.read_csv(forest_file_path)
beach_data = pd.read_csv(beach_file_path)
accommodation_data = pd.read_csv(accommodation_file_path)
shop_data = pd.read_csv(shop_file_path)
park_data = pd.read_csv(park_file_path)
play_data = pd.read_csv(play_fille_path)
data = pd.read_csv(trip_file_path)

# 데이터프레임 리스트 생성
dataframes = [museum_data, culture_place_data, forest_data, beach_data,
              accommodation_data, shop_data, park_data, play_data, data]

# 병합할 때 사용하는 함수
def merge_dataframes_on_region(df_list):
    # 가장 많은 지역명을 가진 데이터프레임을 기준으로 병합 시작
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(merged_df, df, on='지역', how='outer', suffixes=('', '_drop'))
        print(merged_df.head())
    return merged_df.fillna(0)  # 결측값을 0으로 채우기

# 데이터 병합 (지역을 기준으로)
facilities_data = merge_dataframes_on_region(dataframes)

# 중간 결과 확인
print(facilities_data.head())
print(f"Number of columns after merging: {len(facilities_data.columns)}")

# 불필요한 열 제거
if 'Unnamed: 0' in facilities_data.columns:
    facilities_data.drop(columns=['Unnamed: 0'], inplace=True)

# 컬럼명 변경
facilities_data.columns = ['지역', '박물관 개수', '실내 문화공간 개수',  '휴양림 개수', '해변 개수', '숙소 개수',
                           '아이 동반 쇼핑 시설 개수', '주차장 개수', '아이 동반 놀이 시설 개수', '여행건 개수']

# 독립변수(X) 설정
X = facilities_data[['박물관 개수', '실내 문화공간 개수',  '휴양림 개수', '해변 개수', '숙소 개수',
                     '아이 동반 쇼핑 시설 개수', '주차장 개수', '아이 동반 놀이 시설 개수', ]]

X

       지역    개수  개수_drop
0  강원 강릉시  41.0     39.0
1  강원 고성군  11.0      6.0
2  강원 동해시   3.0     11.0
3  강원 삼척시   5.0     15.0
4  강원 속초시   4.0     17.0
       지역    개수  개수_drop  개수_drop
0  강원 강릉시  41.0     39.0      1.0
1  강원 고성군  11.0      6.0      NaN
2  강원 동해시   3.0     11.0      NaN
3  강원 삼척시   5.0     15.0      1.0
4  강원 속초시   4.0     17.0      NaN
       지역    개수  개수_drop  개수_drop  개수_drop
0  강원 강릉시  41.0     39.0      1.0     20.0
1  강원 고성군  11.0      6.0      NaN     17.0
2  강원 동해시   3.0     11.0      NaN      5.0
3  강원 삼척시   5.0     15.0      1.0     28.0
4  강원 속초시   4.0     17.0      NaN      2.0
       지역    개수  개수_drop  개수_drop  개수_drop  개수_drop
0  강원 강릉시  41.0     39.0      1.0     20.0    734.0
1  강원 고성군  11.0      6.0      NaN     17.0    235.0
2  강원 동해시   3.0     11.0      NaN      5.0    114.0
3  강원 삼척시   5.0     15.0      1.0     28.0    181.0
4  강원 속초시   4.0     17.0      NaN      2.0    296.0
       지역    개수  개수_drop  개수_drop  개수_drop  개수_drop  개수_drop
0  강원 강릉시  41.0

Unnamed: 0,박물관 개수,실내 문화공간 개수,휴양림 개수,해변 개수,숙소 개수,아이 동반 쇼핑 시설 개수,주차장 개수,아이 동반 놀이 시설 개수
0,41.0,39.0,1.0,20.0,734.0,98.0,120.0,42.0
1,11.0,6.0,0.0,17.0,235.0,3.0,23.0,8.0
2,3.0,11.0,0.0,5.0,114.0,38.0,24.0,31.0
3,5.0,15.0,1.0,28.0,181.0,18.0,12.0,21.0
4,4.0,17.0,0.0,2.0,296.0,44.0,24.0,25.0
...,...,...,...,...,...,...,...,...
305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# 4

In [5]:
# 종속변수(y) 설정
y = facilities_data['여행건 개수']

# 데이터 전처리 (결측치 처리, 스케일링 등)
X.fillna(X.mean(), inplace=True)

# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# LightGBM 회귀 모델 생성 및 학습
lgbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgbm.fit(X_train, y_train)

# 모델 예측
y_pred = lgbm.predict(X_test)

# 성능 평가
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 결과 출력
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 248, number of used features: 8
[LightGBM] [Info] Start training from score 44.560484
MAE: 43.88748100878881
MSE: 12314.160614694913
RMSE: 110.96918768151325
R2 Score: 0.5251873967243771


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# 각 독립변수의 중요도 추출
feature_importances = lgbm.feature_importances_

# 중요도를 데이터프레임으로 변환
importance_df = pd.DataFrame({'Variable': X.columns, 'Importance': feature_importances})

# 중요도 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 결과 출력
print(importance_df)

         Variable  Importance
7  아이 동반 놀이 시설 개수         216
4           숙소 개수         179
6          주차장 개수         117
0          박물관 개수         112
5  아이 동반 쇼핑 시설 개수         106
1      실내 문화공간 개수          58
3           해변 개수          33
2          휴양림 개수          15


# 5

In [6]:
# 종속변수(y) 설정
y = facilities_data['여행건 개수']

# 데이터 전처리 (결측치 처리, 스케일링 등)
X.fillna(X.mean(), inplace=True)

# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 튜닝을 위한 그리드 설정
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 50, 70],
    'boosting_type' : ['gbdt', 'dart', 'goss'],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm = lgb.LGBMRegressor(objective='regression')

# 그리드 서치
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# 최적의 하이퍼파라미터로 모델 재학습
lgbm = lgb.LGBMRegressor(**best_params)
lgbm.fit(X_train, y_train)
# 모델 예측
y_pred = lgbm.predict(X_test)

# 성능 평가
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 결과 출력
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 248, number of used features: 8
[LightGBM] [Info] Start training from score 44.560484
Best parameters found:  {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'learning_rate': 0.01, 'min_child_samples': 20, 'n_estimators': 300, 'num_leaves': 31, 'subsample': 0.8}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 248, number of used features: 8
[LightGBM] [Info] Start training from score 44.560484
MAE: 44.22743365635882
MSE: 15093.507120036911
RMSE: 122.85563527993705
R2 Score: 0.4180206323058886


In [7]:
# 각 독립변수의 중요도 추출
feature_importances = lgbm.feature_importances_

# 중요도를 데이터프레임으로 변환
importance_df = pd.DataFrame({'Variable': X.columns, 'Importance': feature_importances})

# 중요도 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 결과 출력
print(importance_df)

         Variable  Importance
4           숙소 개수         598
7  아이 동반 놀이 시설 개수         515
6          주차장 개수         371
0          박물관 개수         312
1      실내 문화공간 개수         247
5  아이 동반 쇼핑 시설 개수         231
3           해변 개수          71
2          휴양림 개수          64


# 1

In [9]:
# 종속변수(y) 설정
y = facilities_data['여행건 개수']

# 이상치 제거
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X)
mask = yhat != -1
X, y = X[mask], y[mask]

# 데이터 전처리 (결측치 처리, 스케일링 등)
X.fillna(X.mean(), inplace=True)

# 데이터 스케일링
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 튜닝을 위한 그리드 설정
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 50, 70],
    'boosting_type' : ['gbdt', 'dart', 'goss'],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm = lgb.LGBMRegressor(objective='regression')

# 그리드 서치
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# 최적의 하이퍼파라미터로 모델 재학습
lgbm = lgb.LGBMRegressor(**best_params)
lgbm.fit(X_train, y_train)
# 모델 예측
y_pred = lgbm.predict(X_test)

# 성능 평가
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 결과 출력
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")



Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 34.569507
Best parameters found:  {'boosting_type': 'dart', 'colsample_bytree': 0.9, 'learning_rate': 0.1, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 31, 'subsample': 0.8}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 34.569507
MAE: 14.0666750

In [10]:
# 각 독립변수의 중요도 추출
feature_importances = lgbm.feature_importances_

# 중요도를 데이터프레임으로 변환
importance_df = pd.DataFrame({'Variable': X.columns, 'Importance': feature_importances})

# 중요도 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 결과 출력
print(importance_df)

         Variable  Importance
4           숙소 개수         403
7  아이 동반 놀이 시설 개수         345
0          박물관 개수         239
5  아이 동반 쇼핑 시설 개수         162
6          주차장 개수         120
1      실내 문화공간 개수          80
2          휴양림 개수          50
3           해변 개수          30


# 3

In [13]:
# 종속변수(y) 설정
y = facilities_data['여행건 개수']

# 이상치 제거
best_contamination = 0
best_score = -np.inf
for contamination in [0.01, 0.05, 0.1, 0.2]:
    iso = IsolationForest(contamination=contamination, random_state=42)
    yhat = iso.fit_predict(X)
    mask = yhat != -1
    X_filtered, y_filtered = X[mask], y[mask]

    # 데이터 전처리 (결측치 처리, 스케일링 등)
    X_filtered.fillna(X_filtered.mean(), inplace=True)

    # 데이터 스케일링
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X_filtered)

    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_filtered, test_size=0.2, random_state=42)

    # 모델 학습
    lgbm = lgb.LGBMRegressor(objective='regression')
    lgbm.fit(X_train, y_train)

    # 모델 예측
    y_pred = lgbm.predict(X_test)

    # 성능 평가
    r2 = r2_score(y_test, y_pred)
    if r2 > best_score:
        best_score = r2
        best_contamination = contamination

print(f"Best contamination: {best_contamination}")
print(f"Best R2 Score: {best_score}")


# 이상치 제거
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X)
mask = yhat != -1
X, y = X[mask], y[mask]

# 데이터 전처리 (결측치 처리, 스케일링 등)
X.fillna(X.mean(), inplace=True)

# 데이터 스케일링
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 튜닝을 위한 그리드 설정
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 50, 70],
    'boosting_type' : ['gbdt', 'dart', 'goss'],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm = lgb.LGBMRegressor(objective='regression')

# 그리드 서치
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# 최적의 하이퍼파라미터로 모델 재학습
lgbm = lgb.LGBMRegressor(**best_params)
lgbm.fit(X_train, y_train)
# 모델 예측
y_pred = lgbm.predict(X_test)

# 성능 평가
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 결과 출력
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 286
[LightGBM] [Info] Number of data points in the train set: 244, number of used features: 8
[LightGBM] [Info] Start training from score 45.676230


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 8
[LightGBM] [Info] Start training from score 37.140426


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 252
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 33.376682


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 224
[LightGBM] [Info] Number of data points in the train set: 198, number of used features: 8
[LightGBM] [Info] Start training from score 29.893939
Best contamination: 0.2
Best R2 Score: 0.5556030967142396
Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 248
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 31.551570
Best parameters found:  {'boosting_type': 'dart', 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 0.8}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 248
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 31.551570
MAE: 19.3642899

In [14]:
# 각 독립변수의 중요도 추출
feature_importances = lgbm.feature_importances_

# 중요도를 데이터프레임으로 변환
importance_df = pd.DataFrame({'Variable': X.columns, 'Importance': feature_importances})

# 중요도 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 결과 출력
print(importance_df)

         Variable  Importance
4           숙소 개수         178
7  아이 동반 놀이 시설 개수         178
6          주차장 개수          84
0          박물관 개수          77
1      실내 문화공간 개수          71
5  아이 동반 쇼핑 시설 개수          66
2          휴양림 개수          16
3           해변 개수          16


# 2

In [16]:
# 종속변수(y) 설정
y = facilities_data['여행건 개수']

# 이상치 제거
best_contamination = 0
best_score = -np.inf
for contamination in [0.01, 0.05, 0.1, 0.2]:
    iso = IsolationForest(contamination=contamination, random_state=42)
    yhat = iso.fit_predict(X)
    mask = yhat != -1
    X_filtered, y_filtered = X[mask], y[mask]

    # 데이터 전처리 (결측치 처리, 스케일링 등)
    X_filtered.fillna(X_filtered.mean(), inplace=True)

    # 데이터 스케일링
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_filtered)

    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_filtered, test_size=0.2, random_state=42)

    # 모델 학습
    lgbm = lgb.LGBMRegressor(objective='regression')
    lgbm.fit(X_train, y_train)

    # 모델 예측
    y_pred = lgbm.predict(X_test)

    # 성능 평가
    r2 = r2_score(y_test, y_pred)
    if r2 > best_score:
        best_score = r2
        best_contamination = contamination

print(f"Best contamination: {best_contamination}")
print(f"Best R2 Score: {best_score}")


# 이상치 제거
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X)
mask = yhat != -1
X, y = X[mask], y[mask]

# 데이터 전처리 (결측치 처리, 스케일링 등)
X.fillna(X.mean(), inplace=True)

# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 튜닝을 위한 그리드 설정
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 50, 70],
    'boosting_type' : ['gbdt', 'dart', 'goss'],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm = lgb.LGBMRegressor(objective='regression')

# 그리드 서치
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# 최적의 하이퍼파라미터로 모델 재학습
lgbm = lgb.LGBMRegressor(**best_params)
lgbm.fit(X_train, y_train)
# 모델 예측
y_pred = lgbm.predict(X_test)

# 성능 평가
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 결과 출력
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 244, number of used features: 8
[LightGBM] [Info] Start training from score 45.676230


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 270
[LightGBM] [Info] Number of data points in the train set: 235, number of used features: 8
[LightGBM] [Info] Start training from score 37.140426


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 33.376682


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_filtered.fillna(X_filtered.mean(), inplace=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 230
[LightGBM] [Info] Number of data points in the train set: 198, number of used features: 8
[LightGBM] [Info] Start training from score 29.893939
Best contamination: 0.2
Best R2 Score: 0.5844607543753917


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 32.852018
Best parameters found:  {'boosting_type': 'dart', 'colsample_bytree': 0.8, 'learning_rate': 0.05, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 31, 'subsample': 0.8}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 8
[LightGBM] [Info] Start training from score 32.852018
MAE: 17.718402995403075
MSE: 1170.5348108695773
RMSE: 34.21307952917389
R2 Sc

In [17]:
# 각 독립변수의 중요도 추출
feature_importances = lgbm.feature_importances_

# 중요도를 데이터프레임으로 변환
importance_df = pd.DataFrame({'Variable': X.columns, 'Importance': feature_importances})

# 중요도 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 결과 출력
print(importance_df)

         Variable  Importance
4           숙소 개수         461
7  아이 동반 놀이 시설 개수         248
0          박물관 개수         194
1      실내 문화공간 개수         157
5  아이 동반 쇼핑 시설 개수         151
6          주차장 개수         137
2          휴양림 개수          50
3           해변 개수          49
