### Import & Setting

In [None]:
import os
import random
import numpy as np
import pandas as pd
import json

import optuna
from tqdm import tqdm
from lightgbm import LGBMRegressor
from optuna.samplers import TPESampler
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

import warnings
warnings.filterwarnings('ignore')

def MAE(true, pred):
    """Return the mean absolute error between ``true`` and ``pred``.

    Both arguments are converted to float NumPy arrays before subtracting, so
    the comparison is purely positional. This lets plain Python sequences be
    passed in, and it prevents pandas from aligning two Series on their index
    labels (which yields NaN or mispaired errors when the indexes differ).

    Args:
        true: Array-like of ground-truth values.
        pred: Array-like of predicted values, same length as ``true``.

    Returns:
        The mean of ``|true - pred|`` as a NumPy float.
    """
    true_values = np.asarray(true, dtype=float)
    pred_values = np.asarray(pred, dtype=float)
    return np.mean(np.abs(true_values - pred_values))

def seed_everything(seed):
    """Seed the stdlib and NumPy global RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

# Single seed shared by the global RNGs, the KFold split, the Optuna sampler
# and the permutation-importance computation further down.
seed_value = 42
seed_everything(seed_value)

In [None]:
# Load the full dataset from "dataset.csv" (path is relative to the working directory).
dataset = pd.read_csv("dataset.csv")

### 범주형 변수 수치화

In [None]:
# Categorical columns that are replaced in place by integer label codes.
qual_col = ["법정동"]

for column in qual_col:
    # A fresh encoder per column: fit on the column's values and encode them in one step.
    le = LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column])

### GridSearchCV로 넓은 구간의 하이퍼파라미터 탐색

In [None]:
def wide_range_tuning(x, y):
    """Coarse grid search over ``n_estimators`` for a default LGBMRegressor.

    Tries 100, 200, ..., 1000 estimators with 3-fold cross-validation, scored
    by negative mean squared error, using all available cores.

    Args:
        x: Feature matrix.
        y: Target values.

    Returns:
        The ``n_estimators`` value with the best cross-validated score.
    """
    candidates = list(range(100, 1001, 100))
    search = GridSearchCV(
        estimator=LGBMRegressor(),
        param_grid=[{'n_estimators': candidates}],
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    return search.fit(x, y).best_params_['n_estimators']

### KFold + Optuna로 최적 하이퍼파라미터 도출

In [None]:
def short_range_tuning(x, y, n_estimators_center=None, n_trials=10, n_splits=5):
  """Fine-tune LightGBM hyperparameters with Optuna, scored by K-fold MAE.

  A single Optuna study is run. Each trial trains one model per fold and is
  scored by the mean validation MAE over all folds, so the returned
  parameters reflect every split. (Previously a separate study was created
  per fold and only the last fold's result was returned.)

  No ``eval_set`` is passed to ``fit``: no early stopping is configured, so
  it would not influence the fitted model.

  NOTE(review): per the LightGBM parameter docs, ``subsample`` only takes
  effect when ``subsample_freq`` > 0 -- confirm whether bagging was intended.

  Args:
    x: Feature DataFrame (indexed positionally via ``.iloc``).
    y: Target Series aligned with ``x``.
    n_estimators_center: Centre of the ``n_estimators`` search window
      (+/- 100). When None, the module-level ``best_n`` is used, which keeps
      existing two-argument calls working.
    n_trials: Number of Optuna trials.
    n_splits: Number of shuffled K-fold splits.

  Returns:
    dict of the best trial's parameters: ``learning_rate``, ``n_estimators``,
    ``colsample_bytree``, ``subsample``, ``reg_alpha`` and ``reg_lambda``.
  """
  center = best_n if n_estimators_center is None else n_estimators_center
  # Keep the lower bound >= 1: with a centre of 100 (the smallest value of the
  # coarse grid) the window would otherwise start at 0 boosting rounds.
  n_low = max(1, center - 100)
  n_high = max(n_low, center + 100)

  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed_value)
  # Materialise the splits once so every trial is evaluated on identical folds.
  folds = list(kfold.split(x, y))

  def objective(trial):
    param = {
      'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),
      'n_estimators': trial.suggest_int('n_estimators', n_low, n_high),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
      'subsample': trial.suggest_float('subsample', 0.5, 1.0),
      'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
      'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
      'n_jobs': -1}

    fold_scores = []
    for train_index, valid_index in folds:
      model = LGBMRegressor(**param)
      model.fit(x.iloc[train_index], y.iloc[train_index])
      valid_pred = model.predict(x.iloc[valid_index])
      fold_scores.append(MAE(y.iloc[valid_index], valid_pred))
    return float(np.mean(fold_scores))

  study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=seed_value))
  study.optimize(objective, n_trials=n_trials)
  return study.best_trial.params

### 변수 중요도 획득

In [None]:
def get_importances(x, y, best_param_dict):
    """Fit a tuned LGBMRegressor and return its permutation importances.

    The model is trained on ``(x, y)`` and the importances are computed on
    that same data, using 10 shuffles per feature and the estimator's
    default scorer.

    Args:
        x: Feature matrix.
        y: Target values.
        best_param_dict: Mapping that provides ``learning_rate``,
            ``n_estimators``, ``colsample_bytree``, ``subsample``,
            ``reg_alpha`` and ``reg_lambda``; any other keys are ignored.

    Returns:
        Array with the mean importance of each feature over the repeats.
    """
    tuned_keys = (
        'learning_rate',
        'n_estimators',
        'colsample_bytree',
        'subsample',
        'reg_alpha',
        'reg_lambda',
    )
    regressor = LGBMRegressor(**{key: best_param_dict[key] for key in tuned_keys})
    regressor.fit(x, y)

    outcome = permutation_importance(
        regressor, x, y, n_repeats=10, random_state=seed_value
    )
    return outcome.importances_mean

### 실행

In [None]:
# Maps each distinct value of the "구" column to its list of feature importances.
importances_dict = {}
target_col = "면적당가격(만원)"

for gu in tqdm(dataset["구"].unique()):
    # Rows belonging to this "구" value only; one model is tuned per group.
    rows = dataset.loc[dataset["구"] == gu]
    y = rows[target_col]
    x = rows.drop(columns=["구", target_col])

    # `best_n` must stay a module-level name: short_range_tuning reads it from there.
    best_n = wide_range_tuning(x, y)
    best_param_dict = short_range_tuning(x, y)

    importances_dict[gu] = list(get_importances(x, y, best_param_dict))

### 중요도 dictionary 저장

In [None]:
# Persist the per-group importances as JSON in the working directory.
with open('importances_dict.json', mode='w') as out_file:
    json.dump(importances_dict, out_file)