In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from pandas / optuna / scikit-learn.
warnings.filterwarnings('ignore')

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Absolute path (instead of the cwd-relative "drive/MyDrive/...") so that
# re-running this cell after the working directory has already been changed
# still succeeds.
PROJECT_DIR = os.path.join(
    DRIVE_MOUNT_POINT, "MyDrive/competition/2022-AI-competition-Round1"
)  # Local Path
os.chdir(PROJECT_DIR)
os.listdir()

Mounted at /content/drive


['competition_data',
 'html',
 'submission',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb',
 '2022-08-06 Optimize ET.ipynb',
 '2022-08-06 Regressor model compare.ipynb',
 'catboost_info',
 'evaluator',
 '.git',
 '.gitignore',
 'README.md',
 'GitHub Connection.ipynb',
 '2022-08-11 Models(XGB_ET) Optimization.ipynb',
 'Updated CLF.ipynb']

In [2]:
!pip install -r evaluator/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.3 MB/s 
[?25hCollecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 54.6 MB/s 
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting alembic
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 48.1 MB/s 
[?25hCollecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.8 MB/s 
Collecting Mako
  Downloading Mako-1.2.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.1 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl

In [3]:
from evaluator.evaluator import Evaluator, Model

# Load the three competition tables (paths are relative to the project
# directory selected in the first cell).
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'competition_data/train.csv',
        "competition_data/test.csv",
        "competition_data/sample_submission.csv",
    )
)

In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from collections import defaultdict
import pandas as pd
import numpy as np
# from tqdm.notebook import tqdm

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import (
    RandomForestClassifier, 
    RandomForestRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
)
from sklearn.impute import SimpleImputer
import optuna

class Model():
  """Builds an estimator from a short model name and a task type.

  Args:
    train_df: training frame; stored untouched and handed on via get_model().
    model_name: one of 'lgbm', 'xgb', 'et', 'rf', 'cat'.
    model_type: 'clf' for a classifier, 'rgr' for a regressor.
    **params: forwarded verbatim to the estimator constructor.

  Raises:
    NameError: if model_name or model_type is not one of the values above.
  """
  def __init__(self, train_df, model_name="lgbm", model_type="clf", **params):
    self.model_name = model_name
    self.model_type = model_type
    self.train_df = train_df
    # Stays None when model_name is valid but model_type is not; checked below.
    self.model = None
    if model_name == "lgbm":
      if model_type == "clf":
        self.model = LGBMClassifier(**params)
      elif model_type == "rgr":
        self.model = LGBMRegressor(**params)
    elif model_name == "xgb":
      if model_type == "clf":
        self.model = XGBClassifier(**params)
      elif model_type == "rgr":
        self.model = XGBRegressor(**params)
    elif model_name == "et":
      if model_type == "clf":
        self.model = ExtraTreesClassifier(**params)
      elif model_type == "rgr":
        self.model = ExtraTreesRegressor(**params)
    elif model_name == "rf":
      if model_type == "clf":
        self.model = RandomForestClassifier(**params)
      elif model_type == "rgr":
        self.model = RandomForestRegressor(**params)
    elif model_name == "cat":
      if model_type == "clf":
        self.model = CatBoostClassifier(**params)
      elif model_type == "rgr":
        self.model = CatBoostRegressor(**params)
    else:
      raise NameError("model_name must be in ('lgbm', 'xgb', 'rf', 'et', 'cat')")
    
    if self.model is None:
      raise NameError("model_type must be in ('clf', 'rgr')")

  def get_model(self):
    """Return the keyword arguments expected by Evaluator(**...)."""
    return {
        "train_df": self.train_df,
        "model": self.model,
        "model_name": self.model_name,
        "model_type": self.model_type,
    }

  def optimize(self, initial_params, **kwargs):
    """Tune hyperparameters, then rebuild self.model with the best ones.

    Args:
      initial_params: iterable of (name, dtype, value) triples describing the
        search space, as consumed by Optimizer.objective.
      **kwargs: forwarded to Optimizer.run (e.g. n_trials, sampling).

    The Optimizer instance is kept on self.optimizer so the underlying study
    can be inspected afterwards.
    """
    # Materialise once: the space is iterated here and again on every trial.
    initial_params = tuple(initial_params)
    self.optimizer = Optimizer(
        self.train_df, 
        initial_params, 
        self.model_name,
        self.model_type,
    )
    best_params = self.optimizer.run(**kwargs)

    # The study's best params only contain values that were *suggested*;
    # "static" entries were used during every trial but are not recorded by
    # Optuna, so they must be added back or the final model silently differs
    # from the one that achieved the best score.
    static_params = {
        name: value
        for name, dtype, value in initial_params
        if dtype == "static"
    }
    final_params = {**static_params, **best_params}
    self.__init__(self.train_df, self.model_name, self.model_type, **final_params)

class Optimizer():
  """Optuna-based hyperparameter search that maximises mean CV ROC-AUC.

  Args:
    train_df: training frame passed to every trial's Model/Evaluator.
    initial_params: iterable of (name, dtype, value) triples where dtype is
      'static' (value used as-is), 'int' / 'float' / 'log' (value is unpacked
      as the bounds of the suggestion) or 'categorical' (value is the list of
      choices).
    model_name, model_type: forwarded to Model.
    random_seed: seed used for the Optuna sampler.
  """
  def __init__(self, train_df, initial_params, model_name, model_type, random_seed=42):
    self.params = {}
    self.train_df = train_df
    self.initial_params = initial_params
    self.model_name = model_name
    self.model_type = model_type
    self.random_seed = random_seed

  def _suggest_params(self, trial):
    """Translate the search space into concrete parameters for one trial."""
    params = {}
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        params[param] = value
      elif dtype == "int":
        params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        # suggest_float replaces the deprecated suggest_uniform.
        params[param] = trial.suggest_float(param, *value)
      elif dtype == "log":
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        params[param] = trial.suggest_float(param, *value, log=True)
      elif dtype == "categorical":
        params[param] = trial.suggest_categorical(param, value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")
    return params

  def objective(self, trial):
    """Optuna objective: mean cross-validated ROC-AUC for the trial's params."""
    ## Tuning Parameters
    # Built fresh per trial; self.params keeps the most recent trial's values.
    self.params = self._suggest_params(trial)

    ## Objective Metric
    result_df = Evaluator(
        **Model(self.train_df, self.model_name, self.model_type, **self.params).get_model()
    ).run(train_acc=False)

    return result_df["roc_auc"]["mean"]

  def optimize(self, n_trials=100, sampling="TPE"):
    """Create a maximising study and run n_trials trials.

    Args:
      n_trials: number of trials to run.
      sampling: 'TPE' or 'random'.

    Raises:
      NameError: if sampling is not one of the supported names.
    """
    if sampling == "random":
      sampler = optuna.samplers.RandomSampler(seed=self.random_seed)
    elif sampling == "TPE":
      # Seeded like the random sampler so repeated searches are reproducible.
      sampler = optuna.samplers.TPESampler(seed=self.random_seed)
    else:
      raise NameError("sampling must be one of ('random', 'TPE')")

    self.opt = optuna.create_study(
        direction='maximize',
        sampler=sampler,
    )
    self.opt.optimize(self.objective, n_trials=n_trials)

  def analyze(self):
    """Display the study's diagnostic plots and return the figure objects.

    Figures created inside a method are not auto-rendered by the notebook
    (only a cell's last expression is), so each one is shown explicitly.
    """
    figures = [
        optuna.visualization.plot_param_importances(self.opt),
        optuna.visualization.plot_optimization_history(self.opt),
        optuna.visualization.plot_slice(self.opt),
    ]
    for figure in figures:
      figure.show()
    return figures

  def best_params(self):
    """Print the best score and params, and return the params.

    Only parameters suggested by Optuna are included; 'static' entries of
    the search space are not part of the returned dict.
    """
    print(self.opt.best_trial.value)
    print(self.opt.best_trial.params)
    return self.opt.best_trial.params

  def run(self, **kwargs):
    """Run the search, show the diagnostics and return the best params."""
    self.optimize(**kwargs)
    self.analyze()
    return self.best_params()

class Evaluator():
  """K-fold evaluation harness for one estimator on the 'nerdiness' target.

  Args:
    model: estimator exposing fit() and predict() / predict_proba().
    train_df: raw training frame (must contain the 'nerdiness' column).
    n_folds: number of KFold splits.
    random_state: seed for the shuffled KFold.
    model_name: short model name; 'cat' adds silent=True to every fit call.
    model_type: 'rgr' uses predict(); anything else uses predict_proba()[:, 1].
  """
  def __init__(self, model, train_df, n_folds=4, random_state=42, model_name=None, model_type="clf"):
    self.kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    self.model = model
    self.model_name = model_name
    self.model_type = model_type
    self.train_df = train_df
    self.fit_params = {'silent': True} if self.model_name == 'cat' else {}
    # column name -> {category: integer code}; learned once and then reused so
    # that training and test data are encoded identically.
    self._category_maps = {}

  def drop_col(self, df, col_list=["index", "country"]):
    """Return a copy of df without the given columns."""
    # list(): a tuple passed to DataFrame.drop is treated as a single label.
    return df.drop(list(col_list), axis=1)

  def index_col(self, df, col_list=["country"], refit=False):
    """Replace each categorical column with an integer '<col>_idx' column.

    The input frame is not modified. Categories are numbered in sorted order
    (deterministic across runs). The mapping is learned the first time a
    column is seen, or whenever refit is True, and reused afterwards; missing
    values and categories unseen at fit time are encoded as -1.
    """
    encoded = {}
    for col in col_list:
      if refit or col not in self._category_maps:
        categories = sorted(df[col].dropna().unique(), key=str)
        self._category_maps[col] = {c: i for i, c in enumerate(categories)}
      codes = df[col].map(self._category_maps[col])
      encoded[f"{col}_idx"] = codes.fillna(-1).astype(int)
    # drop() returns a new frame, so the caller's df stays untouched.
    return self.drop_col(df, col_list).assign(**encoded)

  def preprocess(self, df=None, mode="index", refit=False):
    """Drop the 'index' column and either drop or integer-encode 'country'.

    Args:
      df: frame to process; defaults to self.train_df.
      mode: 'drop' removes 'index' and 'country'; 'index' encodes 'country'
        via index_col and removes 'index'.
      refit: forwarded to index_col (relearn the category mapping from df).
    """
    assert mode in ("index", "drop")

    df = self.train_df if df is None else df

    if mode == "drop":
      return self.drop_col(df)
    df = self.index_col(df, refit=refit)
    return self.drop_col(df, col_list=["index"])

  def _predict(self, X):
    """Regressor output, or the positive-class probability for classifiers."""
    if self.model_type == 'rgr':
      return self.model.predict(X)
    return self.model.predict_proba(X)[:, 1]

  def evaluate(self, metrics="all", train_acc=True):
    """Cross-validate self.model on self.train_df.

    Args:
      metrics: 'all', a single metric name, or an iterable of names out of
        'accuracy', 'precision', 'recall', 'f1-score', 'roc_auc', 'mae'.
      train_acc: when True, refit on the full training data and report the
        accuracy on it in the 'mean' row. When False this (expensive) refit
        is skipped, 'train_acc' stays NaN and the model is left fitted on the
        last fold's training split only.

    Returns:
      DataFrame indexed by fold number (1..k) plus a final 'mean' row, with
      one column per metric and a 'train_acc' column.
    """
    train_x = self.train_df.drop(['nerdiness'], axis=1)
    train_y = self.train_df['nerdiness']

    if isinstance(metrics, str):
      if metrics == "all":
        metrics = [
            "accuracy", "precision", "recall", 
            "f1-score", "roc_auc", "mae"
        ]
      else:
        metrics = [metrics]
    else:
      metrics = list(metrics)

    metrics_functions_map = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
        "roc_auc": roc_auc_score,
        "mae": mean_absolute_error,
    }
    # These need hard labels; the remaining metrics use the raw predictions.
    class_metrics = {
        "accuracy",
        "precision",
        "recall",
        "f1-score",
    }

    rows = []
    for train_index, val_index in self.kf.split(train_x):
      # KFold yields positions, not labels, hence iloc.
      X_train, X_test = train_x.iloc[train_index], train_x.iloc[val_index]
      y_train, y_test = train_y.iloc[train_index], train_y.iloc[val_index]

      self.model.fit(X_train, y_train, **self.fit_params)
      predictions = self._predict(X_test)
      hard_predictions = np.round(predictions)

      rows.append({
          metric: metrics_functions_map[metric](
              y_test,
              hard_predictions if metric in class_metrics else predictions,
          )
          for metric in metrics
      })

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2).
    result_df = pd.DataFrame(rows, columns=metrics + ["train_acc"])
    result_df.index = pd.Index(range(1, len(rows) + 1), name="fold")

    mean = result_df.mean(axis=0)
    if train_acc:
      ## add training accuracy
      self.model.fit(train_x, train_y, **self.fit_params)
      predictions = self._predict(train_x)
      mean["train_acc"] = accuracy_score(train_y, np.round(predictions))
    result_df.loc["mean"] = mean

    return result_df

  def run(self, **kwargs):
    """Preprocess self.train_df in place on the instance, then evaluate.

    Rows containing NaN are dropped and the index is renumbered from 0.
    self.train_df is replaced by the preprocessed frame, so run() is meant to
    be called once per Evaluator. kwargs are forwarded to evaluate().
    """
    self.train_df = (
        self.preprocess(self.train_df, refit=True)
        .dropna()
        .reset_index(drop=True)
    )
    return self.evaluate(**kwargs)

  def make_submission(self, test_df, submission_df):
    """Predict on test_df and store the result in submission_df['nerdiness'].

    test_df is preprocessed like the training data (reusing the category
    mapping learned in run()), then missing values are filled with each
    column's most frequent value. submission_df is modified in place and
    also returned. The model must already be fitted.
    """
    test_df = self.preprocess(test_df)

    # handle nan values
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    test_x = imp.fit_transform(test_df)

    submission_df["nerdiness"] = self._predict(test_x)
    return submission_df

### ExtraTrees Classifier (ET CLF) Optimization


### Categorical Parameters

In [7]:
# Baseline: ExtraTrees classifier built without any explicit hyperparameters.
baseline_spec = Model(train_df, "et", "clf").get_model()
evaluator = Evaluator(**baseline_spec)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.789934,0.797354,0.839087,0.817689,0.87634,0.30081,
2,0.782745,0.774346,0.848537,0.809745,0.873176,0.307618,
3,0.771178,0.769152,0.840922,0.803437,0.866671,0.311885,
4,0.786183,0.803768,0.83252,0.817891,0.869712,0.30447,
mean,0.78251,0.786155,0.840267,0.812191,0.871475,0.306196,1.0


In [None]:
# Same evaluation with the split criterion set to "entropy".
# No random_state is passed to the forest, so scores may vary between runs.
params = dict(criterion="entropy")
entropy_spec = Model(train_df, "et", "clf", **params).get_model()
evaluator = Evaluator(**entropy_spec)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.784308,0.789226,0.8402,0.813916,0.871601,0.302241,
2,0.78337,0.773153,0.852553,0.810914,0.877623,0.305186,
3,0.771491,0.768718,0.842608,0.803969,0.866197,0.309934,
4,0.787746,0.803646,0.836314,0.819655,0.86989,0.30367,
mean,0.781729,0.783686,0.842919,0.812113,0.871328,0.305258,1.0


In [None]:
# Same evaluation with the split criterion set to "gini".
# No random_state is passed to the forest, so scores may vary between runs.
params = dict(criterion="gini")
gini_spec = Model(train_df, "et", "clf", **params).get_model()
evaluator = Evaluator(**gini_spec)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.784308,0.793213,0.832962,0.812602,0.872077,0.302998,
2,0.781494,0.772158,0.849684,0.809069,0.876486,0.305395,
3,0.778368,0.775206,0.847105,0.809562,0.868618,0.311088,
4,0.786496,0.804507,0.831978,0.818012,0.869114,0.30493,
mean,0.782666,0.786271,0.840433,0.812311,0.871574,0.306103,1.0


### Numerical Parameters

In [None]:
# ExtraTrees classifier wrapper whose hyperparameters are tuned in the next cell.
model = Model(train_df, model_name="et", model_type="clf")

In [None]:
# Search space as (name, kind, spec) triples: "int"/"float" specs are
# (low, high) bounds, "static" specs are fixed values used in every trial.
N_TRIALS = 100
initial_params = (
    ("n_estimators", "int", (50, 300)),
    ("max_depth", "int", (10, 100)),
    ("min_samples_split", "int", (2, 10)),
    ("min_samples_leaf", "int", (1, 3)),
    ("min_weight_fraction_leaf", "static", 0.0),
    ("max_features", "float", (0.7, 1.0)),
)
model.optimize(initial_params=initial_params, n_trials=N_TRIALS)

[32m[I 2022-08-15 06:40:41,286][0m A new study created in memory with name: no-name-7ea09898-aebf-4cae-874e-6e6de0aedaf3[0m
[32m[I 2022-08-15 06:42:12,443][0m Trial 0 finished with value: 0.8683407754589223 and parameters: {'n_estimators': 250, 'max_depth': 47, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.8228134634718974}. Best is trial 0 with value: 0.8683407754589223.[0m
[32m[I 2022-08-15 06:43:01,625][0m Trial 1 finished with value: 0.8526910369980458 and parameters: {'n_estimators': 172, 'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.759497307057069}. Best is trial 0 with value: 0.8683407754589223.[0m
[32m[I 2022-08-15 06:44:11,481][0m Trial 2 finished with value: 0.8703565062567571 and parameters: {'n_estimators': 194, 'max_depth': 79, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.7650291215948101}. Best is trial 2 with value: 0.8703565062567571.[0m
[32m[I 2022-08-15 06:44:32,037][0m Trial 3 f

0.8720455835807477
{'n_estimators': 292, 'max_depth': 45, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.729627795893577}


In [None]:
# Cross-validate the estimator that model.optimize() rebuilt with the best
# hyperparameters found by the study.
tuned_spec = model.get_model()
evaluator = Evaluator(**tuned_spec)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.786183,0.795117,0.834076,0.81413,0.871533,0.293196,
2,0.78337,0.779851,0.839357,0.808511,0.876941,0.295754,
3,0.772429,0.776433,0.82968,0.802174,0.86772,0.301745,
4,0.785558,0.80484,0.829268,0.816871,0.870399,0.295271,
mean,0.781885,0.78906,0.833095,0.810422,0.871648,0.296491,1.0


In [15]:
# Best hyperparameters printed by the study above, pinned as literals so this
# cell can be run without repeating the search.
params = {
    'n_estimators': 292,
    'max_depth': 45,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 0.729627795893577,
}
pinned_spec = Model(train_df, "et", "clf", **params).get_model()
evaluator = Evaluator(**pinned_spec)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.787434,0.797441,0.832962,0.814815,0.871022,0.293523,
2,0.785871,0.781683,0.842226,0.810826,0.878784,0.294631,
3,0.778368,0.780105,0.837549,0.807807,0.869264,0.301164,
4,0.787746,0.806842,0.830894,0.818692,0.867817,0.296641,
mean,0.784855,0.791518,0.835908,0.813035,0.871722,0.29649,1.0


In [16]:
# Predict on the test set with the evaluator's fitted model and write the
# submission file. Note: make_submission fills submission_df in place.
SUBMISSION_PATH = "submission/2022-08-15_ET_opt_v0_1.csv"
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv(SUBMISSION_PATH, index=False)

In [None]:
# optuna is already imported in the class-definition cell; re-imported here so
# this cell can be run on its own.
import optuna
# Hyperparameter importances for the study kept on model.optimizer
# (requires model.optimize() to have been run in this session).
optuna.visualization.plot_param_importances(model.optimizer.opt)

In [None]:
# Objective value per trial for the study kept on model.optimizer.
optuna.visualization.plot_optimization_history(model.optimizer.opt)

In [None]:
# Per-parameter slice plot for the study kept on model.optimizer.
optuna.visualization.plot_slice(model.optimizer.opt)