In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings('ignore')

# Make Google Drive available under /content/drive.
drive.mount("/content/drive")

# Absolute path on purpose: a relative chdir only succeeds while the working
# directory is still /content, so re-running this cell used to fail.
os.chdir("/content/drive/MyDrive/competition/2022-AI-competition-Round1") # Local Path
os.listdir()

Mounted at /content/drive


['competition_data',
 'html',
 'submission',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb',
 '2022-08-06 Optimize ET.ipynb',
 '2022-08-06 Regressor model compare.ipynb',
 'catboost_info',
 'evaluator',
 '.git',
 '.gitignore',
 'README.md',
 'Updated CLF.ipynb',
 '2022-08-11 Models(XGB_ET) Optimization.ipynb',
 'GitHub Connection.ipynb',
 '2022-08-17 Evaluator Optuna (v0.4).ipynb']

In [2]:
!pip install -r evaluator/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.4 MB/s 
[?25hCollecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 50.0 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting alembic
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 71.3 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.5 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.2.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.6 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl

In [3]:
from evaluator.evaluator import Evaluator, Model, Optimizer

# Read the three competition CSV files in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train_df, test_df, submission_df = map(
    pd.read_csv,
    (
        "competition_data/train.csv",
        "competition_data/test.csv",
        "competition_data/sample_submission.csv",
    ),
)

In [4]:
import optuna

class MyEvaluator(Evaluator):
  """Evaluator that repeats the evaluation and keeps the best-scoring run."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the base constructor.
    super().__init__(*args, **kwargs)

  def train_best_model(self, n_runs=10, metric='roc_auc', threshold=None, **kwargs):
    """Call ``self.run()`` up to ``n_runs`` times and keep the best result.

    Args:
      n_runs: Maximum number of evaluation runs.
      metric: Column of the result frame whose ``'mean'`` entry is used to
        compare runs. A larger value is treated as better, so error-style
        metrics are not suitable here.
      threshold: Optional early-stop value; the loop ends as soon as a run
        scores strictly above it.
      **kwargs: Extra keyword arguments forwarded to ``self.run()`` (for
        example the ``train_acc=False`` that ``MyOptimizer.objective``
        supplies). NOTE(review): confirm that ``Evaluator.run`` accepts them.

    Returns:
      The result frame of the best run, or ``None`` when ``n_runs`` < 1.
    """
    best_model = None
    best_result = None
    best_score = None

    for _ in range(n_runs):
      result_df = self.run(**kwargs)
      score = result_df[metric]['mean']

      if best_score is None or best_score < score:
        # NOTE(review): this stores a reference, not a copy. If run() refits
        # self.model in place, every run shares one object - confirm.
        best_model = self.model
        best_result = result_df
        best_score = score

      if threshold is not None and score > threshold:
        break

    # Leave the current model untouched when no run was executed.
    if best_model is not None:
      self.model = best_model
    return best_result

class MyModel(Model):
  """Model whose parameters can be tuned through ``MyOptimizer``."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the base constructor.
    super().__init__(*args, **kwargs)

  def optimize(self, initial_params, **kwargs):
    """Run a parameter search and re-initialise this model with its result.

    Args:
      initial_params: Search-space description handed to ``MyOptimizer``.
      **kwargs: Forwarded unchanged to ``self.optimizer.run``.
    """
    data, name, kind = self.train_df, self.model_name, self.model_type

    # The optimizer is kept on the instance so it can be inspected afterwards.
    self.optimizer = MyOptimizer(data, initial_params, name, kind)
    tuned_params = self.optimizer.run(**kwargs)

    # Rebuild this very instance, unpacking the returned parameters as keywords.
    self.__init__(data, name, kind, **tuned_params)

class MyOptimizer(Optimizer):
  """Optuna-based parameter search built on the project ``Optimizer``."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the base constructor.
    super().__init__(*args, **kwargs)

  def objective(self, trial):
    """Score one Optuna trial.

    Fills ``self.params`` from ``self.initial_params`` and evaluates a model
    built with those parameters. ``self.n_runs`` is read here and is set by
    ``optimize``.

    Args:
      trial: The Optuna trial used to sample the tunable parameters.

    Returns:
      The ``["roc_auc"]["mean"]`` entry of the evaluation result.

    Raises:
      NameError: If an entry of ``self.initial_params`` has an unknown kind.
    """
    ## Tuning Parameters
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        self.params[param] = value
      elif dtype == "int":
        self.params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        # suggest_float replaces the deprecated suggest_uniform.
        self.params[param] = trial.suggest_float(param, *value)
      elif dtype == "log":
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        self.params[param] = trial.suggest_float(param, *value, log=True)
      elif dtype == "categorical":
        # NOTE(review): `value` is unpacked, so the spec has to be a
        # one-element tuple wrapping the choices - confirm callers do this.
        self.params[param] = trial.suggest_categorical(param, *value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")

    ## Objective Metric
    result_df = MyEvaluator(
        **MyModel(self.train_df, self.model_name, self.model_type, **self.params).get_model()
    ).train_best_model(n_runs=self.n_runs, train_acc=False)

    return result_df["roc_auc"]["mean"]

  def optimize(self, n_trials=100, n_runs=1, sampling="TPE"):
    """Create a maximising study and run ``n_trials`` trials of ``objective``.

    Args:
      n_trials: Number of Optuna trials.
      n_runs: Stored on the instance and passed to ``train_best_model`` by
        ``objective``.
      sampling: ``"TPE"`` or ``"random"``.

    Raises:
      ValueError: If ``sampling`` is not one of the supported names.
    """
    if sampling == "random":
      sampler = optuna.samplers.RandomSampler(seed=self.random_seed)
    elif sampling == "TPE":
      # NOTE(review): unlike the random sampler, TPE is created without a
      # seed - confirm whether that difference is intended.
      sampler = optuna.samplers.TPESampler()
    else:
      # Fail with a clear message instead of an unbound `sampler` further down.
      raise ValueError(
          "sampling must be one of ('random', 'TPE'), got {!r}".format(sampling)
      )

    self.n_runs = n_runs
    self.opt = optuna.create_study(
        direction='maximize',
        sampler=sampler,
    )
    self.opt.optimize(self.objective, n_trials=n_trials)

  def analyze(self):
    """Show Optuna's history, importance and slice plots for ``self.opt``.

    ``self.opt`` is created by ``optimize``, which must have been called first.
    """
    optuna.visualization.plot_optimization_history(self.opt).show()
    optuna.visualization.plot_param_importances(self.opt).show()
    optuna.visualization.plot_slice(self.opt).show()

## Analysis Visualization

In [15]:
# Baseline run of the "lgbm" / "rgr" model with no extra parameters.
lgbm_kwargs = MyModel(train_df, "lgbm", "rgr").get_model()
MyEvaluator(**lgbm_kwargs).run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.751797,0.756135,0.823497,0.78838,0.821977,0.346526,
2,0.745858,0.740683,0.820998,0.778776,0.824307,0.34785,
3,0.751172,0.75322,0.82181,0.786022,0.822538,0.348748,
4,0.751485,0.774295,0.803252,0.788508,0.819068,0.345122,
mean,0.750078,0.756083,0.817389,0.785421,0.821972,0.347061,0.851985


In [39]:
# Model instance that the following cell tunes via model.optimize(...).
model = MyModel(train_df, "lgbm", "rgr")

In [40]:
# Search space read by MyOptimizer.objective as (name, kind, spec) triples:
#   "static"      -> spec is used as the parameter value unchanged
#   "int" / "log" -> spec is unpacked into the matching trial.suggest_* call
initial_params = (
    ("n_estimators", "static", 10),
    ("objective", "static", "binary"),
    ("metric", "static", "auc"),
    ("learning_rate", "log", (1e-5, 1.0)),
    ("num_leaves", "int", (300, 2000)),
)
# Keyword arguments after initial_params are forwarded to self.optimizer.run.
model.optimize(initial_params, n_trials=20)

[32m[I 2022-08-17 02:48:07,941][0m A new study created in memory with name: no-name-ae0d6667-5e17-4493-9fad-34a7571b76ef[0m
[32m[I 2022-08-17 02:48:10,017][0m Trial 0 finished with value: 0.7503224652840883 and parameters: {'learning_rate': 5.730796895859114e-05, 'num_leaves': 839}. Best is trial 0 with value: 0.7503224652840883.[0m
[32m[I 2022-08-17 02:48:12,269][0m Trial 1 finished with value: 0.7562871890326607 and parameters: {'learning_rate': 0.002630030270899752, 'num_leaves': 1983}. Best is trial 1 with value: 0.7562871890326607.[0m
[32m[I 2022-08-17 02:48:14,442][0m Trial 2 finished with value: 0.7522946393278392 and parameters: {'learning_rate': 0.000936628359359168, 'num_leaves': 1696}. Best is trial 1 with value: 0.7562871890326607.[0m
[32m[I 2022-08-17 02:48:16,511][0m Trial 3 finished with value: 0.7549935087869313 and parameters: {'learning_rate': 0.002119806784447664, 'num_leaves': 922}. Best is trial 1 with value: 0.7562871890326607.[0m
[32m[I 2022-08-17

0.8238294506024535
{'learning_rate': 0.24036036587097243, 'num_leaves': 665}


## Optimize random splitter - n_runs

In [15]:
import optuna

class MyEvaluator(Evaluator):
  """Evaluator that preprocesses once, then keeps the best of several runs."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the base constructor.
    super().__init__(*args, **kwargs)

  def train_best_model(self, n_runs=10, metric='roc_auc', threshold=None, **kwargs):
    """Evaluate up to ``n_runs`` times and keep the best result.

    ``self.train_df`` is replaced by its preprocessed version with rows
    containing missing values removed and a fresh index.
    NOTE(review): this happens on every call, so a second call preprocesses
    already-preprocessed data - confirm ``preprocess`` tolerates that.

    Args:
      n_runs: Maximum number of evaluation runs.
      metric: Column of the result frame whose ``'mean'`` entry is used to
        compare runs. A larger value is treated as better, so error-style
        metrics are not suitable here.
      threshold: Optional early-stop value; the loop ends as soon as a run
        scores strictly above it.
      **kwargs: Forwarded unchanged to ``self.evaluate``.

    Returns:
      The result frame of the best run, or ``None`` when ``n_runs`` < 1.
    """
    best_model = None
    best_result = None
    best_score = None

    # reset_index(drop=True) discards the old index directly, which also
    # works when that index carries a name.
    self.train_df = (
        self.preprocess(self.train_df)
        .dropna()
        .reset_index(drop=True)
    )

    for _ in range(n_runs):
      result_df = self.evaluate(**kwargs)
      score = result_df[metric]['mean']

      if best_score is None or best_score < score:
        # NOTE(review): this stores a reference, not a copy. If evaluate()
        # refits self.model in place, every run shares one object - confirm.
        best_model = self.model
        best_result = result_df
        best_score = score

      if threshold is not None and score > threshold:
        break

    # Leave the current model untouched when no run was executed.
    if best_model is not None:
      self.model = best_model
    return best_result

class MyModel(Model):
  """Model that can tune its own parameters through ``MyOptimizer``."""

  def __init__(self, *args, **kwargs):
    # Nothing extra to set up; defer entirely to the base class.
    super().__init__(*args, **kwargs)

  def optimize(self, initial_params, **kwargs):
    """Search for parameters, then rebuild this model with the result.

    Args:
      initial_params: Search-space description handed to ``MyOptimizer``.
      **kwargs: Forwarded unchanged to ``self.optimizer.run``.
    """
    ctor_args = (self.train_df, self.model_name, self.model_type)
    frame, model_name, model_type = ctor_args

    # Kept on the instance so the optimizer stays reachable after tuning.
    self.optimizer = MyOptimizer(frame, initial_params, model_name, model_type)
    found = self.optimizer.run(**kwargs)

    # Re-run the constructor on this instance with the returned parameters.
    self.__init__(*ctor_args, **found)

class MyOptimizer(Optimizer):
  """Optuna-based parameter search built on the project ``Optimizer``."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the base constructor.
    super().__init__(*args, **kwargs)

  def objective(self, trial):
    """Score one Optuna trial.

    Fills ``self.params`` from ``self.initial_params`` and evaluates a model
    built with those parameters. ``self.n_runs`` is read here and is set by
    ``optimize``.

    Args:
      trial: The Optuna trial used to sample the tunable parameters.

    Returns:
      The ``["roc_auc"]["mean"]`` entry of the evaluation result.

    Raises:
      NameError: If an entry of ``self.initial_params`` has an unknown kind.
    """
    ## Tuning Parameters
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        self.params[param] = value
      elif dtype == "int":
        self.params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        # suggest_float replaces the deprecated suggest_uniform.
        self.params[param] = trial.suggest_float(param, *value)
      elif dtype == "log":
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        self.params[param] = trial.suggest_float(param, *value, log=True)
      elif dtype == "categorical":
        # NOTE(review): `value` is unpacked, so the spec has to be a
        # one-element tuple wrapping the choices - confirm callers do this.
        self.params[param] = trial.suggest_categorical(param, *value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")

    ## Objective Metric
    result_df = MyEvaluator(
        **MyModel(self.train_df, self.model_name, self.model_type, **self.params).get_model()
    ).train_best_model(n_runs=self.n_runs, train_acc=False)

    return result_df["roc_auc"]["mean"]

  def optimize(self, n_trials=100, n_runs=1, sampling="TPE"):
    """Create a maximising study and run ``n_trials`` trials of ``objective``.

    Args:
      n_trials: Number of Optuna trials.
      n_runs: Stored on the instance and passed to ``train_best_model`` by
        ``objective``.
      sampling: ``"TPE"`` or ``"random"``.

    Raises:
      ValueError: If ``sampling`` is not one of the supported names.
    """
    if sampling == "random":
      sampler = optuna.samplers.RandomSampler(seed=self.random_seed)
    elif sampling == "TPE":
      # NOTE(review): unlike the random sampler, TPE is created without a
      # seed - confirm whether that difference is intended.
      sampler = optuna.samplers.TPESampler()
    else:
      # Fail with a clear message instead of an unbound `sampler` further down.
      raise ValueError(
          "sampling must be one of ('random', 'TPE'), got {!r}".format(sampling)
      )

    self.n_runs = n_runs
    self.opt = optuna.create_study(
        direction='maximize',
        sampler=sampler,
    )
    self.opt.optimize(self.objective, n_trials=n_trials)

  def analyze(self):
    """Show Optuna's history, importance and slice plots for ``self.opt``.

    ``self.opt`` is created by ``optimize``, which must have been called first.
    """
    optuna.visualization.plot_optimization_history(self.opt).show()
    optuna.visualization.plot_param_importances(self.opt).show()
    optuna.visualization.plot_slice(self.opt).show()

In [11]:
# Resolve the constructor arguments first, then build the evaluator from them
# and keep the best of the default number of runs.
et_kwargs = MyModel(train_df, "et", "clf").get_model()
evaluator = MyEvaluator(**et_kwargs)
evaluator.train_best_model()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.786183,0.792632,0.83853,0.814935,0.872812,0.30261,
2,0.774304,0.766858,0.841652,0.802516,0.876282,0.305123,
3,0.774617,0.774378,0.839236,0.805503,0.866748,0.312416,
4,0.787121,0.806639,0.82981,0.81806,0.872504,0.30356,
mean,0.780556,0.785127,0.837307,0.810254,0.872086,0.305928,1.0


In [12]:
# Fill the submission via evaluator.make_submission (defined outside this
# notebook) and write it to the submission folder without the index column.
# NOTE(review): `submission_df` is rebound here, so re-running this cell passes
# the previous result to make_submission rather than the freshly loaded sample.
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv("submission/2022-08-17_ET_base_runs_10.csv", index=False)

In [16]:
# Model instance that the following cell tunes via model.optimize(...).
model = MyModel(train_df, "et", "clf")

In [18]:
# Search space read by MyOptimizer.objective as (name, kind, spec) triples:
#   "static"        -> spec is used as the parameter value unchanged
#   "int" / "float" -> spec is unpacked into the matching trial.suggest_* call
initial_params = (
    ("n_estimators", "static", 10),
    ("max_depth", "int", (10, 50)),
    ("max_features", "float", (0.7, 1)),
)
# Keyword arguments after initial_params are forwarded to self.optimizer.run.
model.optimize(initial_params, n_trials=20, n_runs=2)

[32m[I 2022-08-17 05:32:35,406][0m A new study created in memory with name: no-name-e846c2c5-690e-4820-9c26-23fda71ee012[0m
[32m[I 2022-08-17 05:32:53,588][0m Trial 0 finished with value: 0.843071584710668 and parameters: {'max_depth': 36, 'max_features': 0.9507382325858209}. Best is trial 0 with value: 0.843071584710668.[0m
[32m[I 2022-08-17 05:33:01,962][0m Trial 1 finished with value: 0.8427028444421134 and parameters: {'max_depth': 41, 'max_features': 0.8165687864621469}. Best is trial 0 with value: 0.843071584710668.[0m
[32m[I 2022-08-17 05:33:11,096][0m Trial 2 finished with value: 0.8452345158393351 and parameters: {'max_depth': 40, 'max_features': 0.920006436623635}. Best is trial 2 with value: 0.8452345158393351.[0m
[32m[I 2022-08-17 05:33:18,865][0m Trial 3 finished with value: 0.8436726551808715 and parameters: {'max_depth': 27, 'max_features': 0.7579805104653823}. Best is trial 2 with value: 0.8452345158393351.[0m
[32m[I 2022-08-17 05:33:26,771][0m Trial 4 

0.8464799092948361
{'max_depth': 19, 'max_features': 0.7773482273161983}
