## Initial Settings

In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

# Silence library warnings so the cell outputs below stay readable.
warnings.filterwarnings('ignore')
drive.mount("/content/drive")

# Use the absolute path under the mount point: a chdir relative to the current
# directory only works on the first run and fails when this cell is re-run,
# because the working directory has already been moved.
os.chdir("/content/drive/MyDrive/competition/2022-AI-competition-Round1") # Local Path
os.listdir()

Mounted at /content/drive


['competition_data',
 'submission',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb',
 '2022-08-06 Regressor model compare.ipynb',
 'catboost_info',
 'evaluator',
 '.git',
 '.gitignore',
 'README.md',
 'Updated CLF.ipynb',
 '2022-08-17 Ensemble Module.ipynb',
 '2022-08-17 Evaluator Optuna (v0.4).ipynb',
 'GitHub Connection.ipynb',
 '2022-08-06 Optimize ET.ipynb',
 '2022-08-11 Models(XGB_ET) Optimization.ipynb']

In [2]:
!pip install -r evaluator/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
[?25hCollecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 47.2 MB/s 
Collecting alembic
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 39.0 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.2 MB/s 
Collecting Mako
  Downloading Mako-1.2.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 6.7 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl

In [3]:
from evaluator.evaluator import Evaluator, Model

# Read the three competition CSV files (paths are relative to the project
# directory selected in the setup cell).
train_df, test_df, submission_df = map(
    pd.read_csv,
    (
        "competition_data/train.csv",
        "competition_data/test.csv",
        "competition_data/sample_submission.csv",
    ),
)

In [None]:
from evaluator.evaluator import Model, Evaluator, Optimizer
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)

class MyEvaluator(Evaluator):
  """Cross-validated evaluator for the ``nerdiness`` target.

  Overrides ``evaluate`` and ``make_submission`` of the project's
  ``Evaluator``. The methods read the following attributes, none of which is
  defined in this class: ``train_df``, ``kf``, ``model``, ``model_type``,
  ``fit_params`` and ``preprocess``.
  """

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the parent constructor.
    super().__init__(*args, **kwargs)

  def _predict_scores(self, features):
    """Return continuous scores for ``features``.

    Uses ``predict`` when ``model_type`` is ``'rgr'`` and the second column of
    ``predict_proba`` otherwise.
    """
    if self.model_type == 'rgr':
      return self.model.predict(features)
    return self.model.predict_proba(features)[:, 1]

  def evaluate(self, metrics="all", train_acc=True):
    """Score the model on every fold of ``self.kf`` and summarise the result.

    Args:
      metrics: ``"all"``, a single metric name, or an iterable of names taken
        from ``accuracy``, ``precision``, ``recall``, ``f1-score``,
        ``roc_auc`` and ``mae``.
      train_acc: when true, refit the model on the full training frame and
        store its training accuracy in the ``mean`` row. When false that refit
        is skipped, so ``self.model`` is left fitted on the last fold only.

    Returns:
      A DataFrame indexed by ``fold`` (1..k) plus a final ``mean`` row, with
      one column per requested metric and a ``train_acc`` column that is only
      filled in the ``mean`` row.

    Raises:
      KeyError: if an unknown metric name is requested.
    """
    train_x = self.train_df.drop(['nerdiness'], axis=1)
    train_y = self.train_df['nerdiness']

    if isinstance(metrics, str):
      if metrics == "all":
        metrics = [
            "accuracy", "precision", "recall",
            "f1-score", "roc_auc", "mae"
        ]
      else:
        # A single metric name; wrap it so it is not iterated per character.
        metrics = [metrics]
    else:
      metrics = list(metrics)

    metrics_functions_map = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
        "roc_auc": roc_auc_score,
        "mae": mean_absolute_error,
    }
    # Metrics that need hard labels; the others receive the raw scores.
    class_metrics = {
        "accuracy",
        "precision",
        "recall",
        "f1-score",
    }

    # Fail before any model is trained rather than in the middle of a fold.
    unknown = [m for m in metrics if m not in metrics_functions_map]
    if unknown:
      raise KeyError(f"unknown metrics: {unknown}")

    fold_rows = []
    for train_index, val_index in self.kf.split(train_x):
      # scikit-learn splitters yield positional indices, so slice with iloc;
      # loc would only work while the frame keeps a default 0..n-1 index.
      X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
      y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

      self.model.fit(X_train, y_train, **self.fit_params)
      scores = np.asarray(self._predict_scores(X_val))
      labels = scores > 0.5

      fold_rows.append({
          metric: metrics_functions_map[metric](
              y_val,
              labels if metric in class_metrics else scores,
          )
          for metric in metrics
      })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copies the whole frame on every call.
    result_df = pd.DataFrame(fold_rows, columns=metrics)
    result_df["train_acc"] = np.nan
    result_df.index = pd.Index(range(1, len(fold_rows) + 1), name="fold")

    mean = result_df.mean(axis=0)

    if train_acc:
      # Refit on all training rows and measure accuracy on those same rows.
      self.model.fit(train_x, train_y, **self.fit_params)
      train_scores = np.asarray(self._predict_scores(train_x))
      mean["train_acc"] = accuracy_score(train_y, train_scores > 0.5)

    result_df.loc["mean"] = mean

    return result_df

  def make_submission(self, test_df, submission_df):
    """Predict ``nerdiness`` for ``test_df`` and store it in ``submission_df``.

    The test frame is passed through ``self.preprocess`` and missing values
    are filled with each column's most frequent value. ``submission_df`` is
    modified in place and also returned.

    NOTE(review): the imputer is fitted on the test frame itself, and
    ``predict`` is used for every model type whereas ``evaluate`` scores
    non-'rgr' models with ``predict_proba`` - confirm both are intended.
    """
    test_df = self.preprocess(test_df)

    # handle nan values
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp = imp.fit(test_df)
    test_df = pd.DataFrame(
        imp.transform(test_df),
        columns = test_df.columns
    )

    preds = self.model.predict(test_df)
    submission_df["nerdiness"] = preds
    return submission_df


class MyModel(Model):
  """``Model`` subclass whose ``optimize`` searches with ``MyOptimizer``."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the parent constructor.
    super().__init__(*args, **kwargs)

  def optimize(self, initial_params, **kwargs):
    """Tune hyper-parameters and re-initialise this model with the result.

    Args:
      initial_params: search-space description handed to ``MyOptimizer``.
      **kwargs: forwarded unchanged to the optimizer's ``run``.
    """
    optimizer_args = (
        self.train_df,
        initial_params,
        self.model_name,
        self.model_type,
    )
    self.optimizer = MyOptimizer(*optimizer_args)
    tuned_params = self.optimizer.run(**kwargs)
    # Run the constructor again so the instance is rebuilt with the tuned values.
    self.__init__(
        self.train_df, self.model_name, self.model_type, **tuned_params
    )

class MyOptimizer(Optimizer):
  """Optimizer whose objective is the mean cross-validated ROC AUC."""

  def __init__(self, *args, **kwargs):
    # Pure pass-through to the parent constructor.
    super().__init__(*args, **kwargs)

  def objective(self, trial):
    """Sample one parameter set from ``trial`` and score it.

    Each entry of ``self.initial_params`` is a ``(name, dtype, value)`` triple:
    ``static`` uses ``value`` as is, ``int``/``float``/``log`` unpack ``value``
    into the matching ``suggest_*`` call, and ``categorical`` passes ``value``
    as the list of choices. The sampled values are written into
    ``self.params``.

    Returns:
      The ``roc_auc`` entry of the ``mean`` row produced by ``MyEvaluator``.

    Raises:
      NameError: if a ``dtype`` is not one of the supported kinds.
    """
    ## Tuning Parameters
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        self.params[param] = value
      elif dtype == "int":
        self.params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        # suggest_float replaces the deprecated suggest_uniform.
        self.params[param] = trial.suggest_float(param, *value)
      elif dtype == "log":
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        self.params[param] = trial.suggest_float(param, *value, log=True)
      elif dtype == "categorical":
        self.params[param] = trial.suggest_categorical(param, value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")

    ## Objective Metric
    candidate = MyModel(
        self.train_df, self.model_name, self.model_type, **self.params
    )
    result_df = MyEvaluator(**candidate.get_model()).run(train_acc=False)

    # Single label-based lookup instead of chained indexing.
    return result_df.loc["mean", "roc_auc"]

### XGB, ET Baseline

In [5]:
# Baseline model: "xgb" in "rgr" mode, with no extra parameters passed.
# NOTE(review): "rgr" presumably selects a regressor - confirm in evaluator/evaluator.py.
model = Model(train_df, "xgb", "rgr")

In [6]:
# Build an evaluator from the model's settings; the table returned by run()
# is the cell output.
evaluator = Evaluator(**model.get_model())
evaluator.run()



Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.728665,0.733635,0.811247,0.770492,0.800725,0.36532,
2,0.733667,0.725798,0.821572,0.770721,0.808662,0.364017,
3,0.73648,0.738532,0.814503,0.774659,0.804218,0.367408,
4,0.735855,0.756937,0.798374,0.777104,0.80002,0.366307,
mean,0.733667,0.738726,0.811424,0.773244,0.803406,0.365763,0.757971


In [None]:
# Predict on the test set with the `evaluator` currently in the kernel and
# write the result to the submission folder.
# Note: this rebinds submission_df to the frame returned by make_submission.
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv("submission/2022-08-17_XGB_base.csv", index=False)

In [None]:
# "et" baseline in "clf" mode (presumably ExtraTrees - confirm in the evaluator module).
params = {
    "criterion": "gini",
    "random_state": 42,
}
# Pass params on to Model: they were previously defined but never used, so the
# fixed random_state had no effect on the run.
evaluator = Evaluator(
    **Model(train_df, "et", "clf", **params).get_model()
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.788997,0.795155,0.840757,0.817321,0.87226,0.303267,
2,0.781182,0.769788,0.853701,0.809576,0.876518,0.30508,
3,0.77493,0.771678,0.845419,0.806867,0.86526,0.312729,
4,0.787434,0.805454,0.83252,0.818763,0.872283,0.303807,
mean,0.783135,0.785519,0.843099,0.813132,0.87158,0.306221,1.0


### Optuna Tuning

In [7]:
# Hyper-parameter search for the "xgb" / "rgr" model.
# Each entry is a (name, kind, value) triple. NOTE(review): presumably read the
# same way as MyOptimizer.objective in this notebook - "static" is a fixed
# value, "int"/"float"/"log" take a (low, high) range - confirm against
# evaluator/evaluator.py, since this cell uses the imported Model.
model = Model(train_df, "xgb", "rgr")
initial_params = (
    ("n_estimators", "int", (100, 1000)),
    ("objective", "static", "reg:squarederror"),
    # ("average", "static", "micro"),
    ("learning_rate", "log", (1e-3, 1.0)),
    ("gamma", "int", (0, 5)),
    ("max_depth", "int", (5, 20)),
    ("lambda", "int", (0, 5)),
    ("alpha", "int", (0, 5)),
    ("subsample", "float", (0.7, 1))
)
# n_trials is forwarded to the optimizer run; the captured log shows each
# trial taking minutes, so this cell is slow.
model.optimize(initial_params, n_trials=50)

[32m[I 2022-08-20 15:47:06,893][0m A new study created in memory with name: no-name-5309fc56-5f46-492e-bba5-0f87803d47f0[0m
[32m[I 2022-08-20 15:48:42,238][0m Trial 0 finished with value: 0.7843189991733508 and parameters: {'n_estimators': 473, 'learning_rate': 0.7326520561624801, 'gamma': 0, 'max_depth': 6, 'lambda': 0, 'alpha': 4, 'subsample': 0.8476564536114375}. Best is trial 0 with value: 0.7843189991733508.[0m
[32m[I 2022-08-20 15:53:11,046][0m Trial 1 finished with value: 0.7947032027862757 and parameters: {'n_estimators': 804, 'learning_rate': 0.20656298507512771, 'gamma': 5, 'max_depth': 9, 'lambda': 2, 'alpha': 3, 'subsample': 0.9722235635546708}. Best is trial 1 with value: 0.7947032027862757.[0m
[32m[I 2022-08-20 16:07:20,778][0m Trial 2 finished with value: 0.7861252932119043 and parameters: {'n_estimators': 906, 'learning_rate': 0.001041157480109287, 'gamma': 5, 'max_depth': 19, 'lambda': 0, 'alpha': 1, 'subsample': 0.8037643152342789}. Best is trial 1 with val

0.8669253333249242
{'n_estimators': 291, 'learning_rate': 0.0289556362173653, 'gamma': 0, 'max_depth': 13, 'lambda': 3, 'alpha': 4, 'subsample': 0.8350807708042902}


In [None]:
# Evaluate `model` as it stands after the optimize() call in the previous cell.
evaluator = Evaluator(**model.get_model())
evaluator.run()

In [5]:
# Hand-entered XGBoost parameter set; "objective" and "average" are left unset
# here (they were commented out in this cell).
params = {
    "n_estimators": 746,
    "learning_rate": 0.0181863245309935,
    "gamma": 0,
    "max_depth": 11,
    "lambda": 1,
    "alpha": 4,
    "subsample": 0.7231465475519483,
}
evaluator = Evaluator(**Model(train_df, "xgb", "rgr", **params).get_model())
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.780556,0.789725,0.830178,0.809446,0.869676,0.289223,
2,0.786183,0.7812,0.843947,0.811362,0.874554,0.290077,
3,0.77493,0.780307,0.828555,0.803708,0.867293,0.296335,
4,0.779619,0.797185,0.828726,0.812649,0.864856,0.292082,
mean,0.780322,0.787104,0.832852,0.809291,0.869095,0.291929,1.0


In [None]:
# Predict on the test set with the `evaluator` currently in the kernel and
# write the result to the submission folder.
# Note: this rebinds submission_df to the frame returned by make_submission.
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv("submission/2022-08-11_XGB_optim_20.csv", index=False)

In [4]:
# Parameter set matching the best trial printed at the end of the Optuna log
# in this notebook.
params = {
    "n_estimators": 291,
    "learning_rate": 0.0289556362173653,
    "gamma": 0,
    "max_depth": 13,
    "lambda": 3,
    "alpha": 4,
    "subsample": 0.8350807708042902,
}
evaluator = Evaluator(**Model(train_df, "xgb", "rgr", **params).get_model())
evaluator.run()



Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.785245,0.791689,0.837973,0.814174,0.872538,0.28743,
2,0.782119,0.77938,0.837063,0.807192,0.872962,0.291506,
3,0.771491,0.777249,0.825745,0.800763,0.861144,0.299171,
4,0.770241,0.793962,0.812466,0.803107,0.861056,0.293649,
mean,0.777274,0.78557,0.828312,0.806309,0.866925,0.292939,1.0
