In [2]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

warnings.filterwarnings('ignore')
drive.mount("/content/drive")

os.chdir("drive/MyDrive/competition/2022-AI-competition-Round1")
os.listdir()

Mounted at /content/drive


['2022-08-09 Data Preprocessing.ipynb',
 'competition_data',
 'html',
 'submission',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb',
 '2022-08-06 Optimize ET.ipynb',
 '2022-08-06 Regressor model compare.ipynb',
 'catboost_info',
 'evaluator',
 '.git',
 '2022-08-11 Models(XGB_ET) Optimization.ipynb',
 '.gitignore',
 'README.md',
 'GitHub Connection.ipynb']

In [None]:
!pip install -r evaluator/requirements.txt

In [None]:
!git status

In [None]:
!git status

On branch add-normalization
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   catboost_info/catboost_training.json[m
	[31mmodified:   catboost_info/learn/events.out.tfevents[m
	[31mmodified:   catboost_info/learn_error.tsv[m
	[31mmodified:   catboost_info/time_left.tsv[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mevaluator/__pycache__/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
from evaluator.evaluator import Evaluator, Model

train_df = pd.read_csv('competition_data/train.csv')
test_df = pd.read_csv("competition_data/test.csv")
submission_df = pd.read_csv("competition_data/sample_submission.csv")

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from collections import defaultdict
import pandas as pd
import numpy as np
# from tqdm.notebook import tqdm

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import (
    RandomForestClassifier, 
    RandomForestRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
)
from sklearn.impute import SimpleImputer
import optuna

class Model():
  def __init__(self, train_df, model_name="lgbm", model_type="clf", **params):
    self.model_name = model_name
    self.model_type = model_type
    self.train_df = train_df
    self.model = None
    if model_name == "lgbm":
      if model_type == "clf":
        self.model = LGBMClassifier(**params)
      elif model_type == "rgr":
        self.model = LGBMRegressor(**params)
    elif model_name == "xgb":
      if model_type == "clf":
        self.model = XGBClassifier(**params)
      elif model_type == "rgr":
        self.model = XGBRegressor(**params)
    elif model_name == "et":
      if model_type == "clf":
        self.model = ExtraTreesClassifier(**params)
      elif model_type == "rgr":
        self.model = ExtraTreesRegressor(**params)
    elif model_name == "rf":
      if model_type == "clf":
        self.model = RandomForestClassifier(**params)
      elif model_type == "rgr":
        self.model = RandomForestRegressor(**params)
    elif model_name == "cat":
      if model_type == "clf":
        self.model = CatBoostClassifier(**params)
      elif model_type == "rgr":
        self.model = CatBoostRegressor(**params)
    else:
      raise NameError("model_name must be in ('lgbm', 'xgb', 'rf', 'et', 'cat')")
    
    if self.model is None:
      raise NameError("model_type must be in ('clf', 'rgr')")

  def get_model(self):
    return {
        "train_df": self.train_df,
        "model": self.model,
        "model_name": self.model_name,
        "model_type": self.model_type,
    }

  def optimize(self, initial_params, prep_args={}, **kwargs):
    self.optimizer = Optimizer(
        self.train_df, 
        initial_params, 
        self.model_name,
        self.model_type,
        prep_args,
    )
    best_params = self.optimizer.run(**kwargs)
    self.__init__(self.train_df, self.model_name, self.model_type, **best_params)

class Optimizer():
  def __init__(
      self, 
      train_df, 
      initial_params, 
      model_name, 
      model_type, 
      prep_args={}, 
      random_seed=42
  ):
    self.params = {}
    self.train_df = train_df
    self.initial_params = initial_params
    self.model_name = model_name
    self.model_type = model_type
    self.prep_args = prep_args
    self.random_seed = random_seed

  def objective(self, trial):
    ## Tuning Parmeters
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        self.params[param] = value
      elif dtype == "int":
        self.params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        self.params[param] = trial.suggest_uniform(param, *value)
      elif dtype == "log":
        self.params[param] = trial.suggest_loguniform(param, *value)
      elif dtype == "categorical":
        self.params[param] = trial.suggest_categorical(param, *value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")

    ## Objective Metric
    result_df = Evaluator(
        **Model(self.train_df, self.model_name, self.model_type, **self.params).get_model()
    ).run(train_acc=False, **self.prep_args)

    return result_df["roc_auc"]["mean"]

  def optimize(self, n_trials=100, sampling="TPE"):
    if sampling == "random":
      sampler = optuna.samplers.RandomSampler(seed=self.random_seed)
    elif sampling == "TPE":
      sampler = optuna.samplers.TPESampler()

    self.opt = optuna.create_study(
        direction='maximize',
        sampler=sampler,
    )
    self.opt.optimize(self.objective, n_trials=n_trials)

  def analyze(self):
    optuna.visualization.plot_param_importances(self.opt)
    optuna.visualization.plot_optimization_history(self.opt)
    optuna.visualization.plot_slice(self.opt)

  def best_params(self):
    print(self.opt.best_trial.value)
    print(self.opt.best_trial.params)
    return self.opt.best_trial.params

  def run(self, prep_args, **kwargs):
    self.optimize(**kwargs)
    self.analyze()
    return self.best_params()

class Evaluator():
  def __init__(self, model, train_df, n_folds=4, random_state=42, model_name=None, model_type="clf"):
    self.kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    self.model = model
    self.model_name = model_name
    self.model_type = model_type
    self.train_df = train_df
    if self.model_name == 'cat':
      self.fit_params = {'silent': True}
    else:
      self.fit_params = {}

  def drop_col(self, df, col_list=["index", "country"]):
    return df.drop(col_list, axis=1)

  def index_col(self, df, col_list=["country"]):
    def _indexer(col):
      col_list = list(set(df[col]))
      col_map = {
          c: i 
          for i, c in enumerate(col_list)
      }
      return df[col].apply(lambda x: col_map[x])

    for col in col_list:
      df[f"{col}_idx"] = _indexer(col)
    return self.drop_col(df, col_list)

  def preprocess(self, df=None, mode="index", normalize=False):
    assert mode in ("index", "drop")

    df = self.train_df if df is None else df

    if normalize:
      df = df[
          (df["age"] < 100) & \
          (df["familysize"] < 10) & \
          (df["testelapse"] < 500) & \
          (df["introelapse"] < 200) & \
          (df["surveyelapse"] < 1000)
      ]
      df = (df - df.min()) / (df.max() - df.min())

    if mode == "drop":
      return self.drop_col(df)
    elif mode == "index":
      df = self.index_col(df)
      return self.drop_col(df, col_list=["index"])

  def evaluate(self, metrics="all", train_acc=True):
    total_err = 0
    total_score = 0

    train_x=self.train_df.drop(['nerdiness'], axis=1)
    train_y=self.train_df['nerdiness']

    if metrics == "all":
      metrics = [
          "accuracy", "precision", "recall", 
          "f1-score", "roc_auc", "mae"
      ]
    metrics_functions_map = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
        "roc_auc": roc_auc_score,
        "mae": mean_absolute_error,
    }
    class_metrics = {
        "accuracy",
        "precision",
        "recall",
        "f1-score",
    }
    result_df = pd.DataFrame(
        columns = metrics + ["train_acc"]
    )

    for i, (train_index, val_index) in enumerate(self.kf.split(train_x)):
      X_train, X_test = train_x.loc[train_index], train_x.loc[val_index]
      y_train, y_test = train_y.loc[train_index], train_y.loc[val_index]

      self.model.fit(X_train, y_train, **self.fit_params)
      predictions = self.model.predict(X_test)

      row = {}
      for metric in metrics:
        if self.model_type == "rgr" and metric in class_metrics:
          score = metrics_functions_map[metric](
              y_test,
              np.round(predictions)
          )
        else:
          score = metrics_functions_map[metric](y_test, predictions)
        row[metric] = score
      result_df = result_df.append(
          row, ignore_index=True
      )

    result_df["fold"] = list(range(1, i+2))
    result_df = result_df.set_index("fold")

    ## add training accuracy
    mean = result_df.mean(axis=0)
    self.model.fit(train_x, train_y, **self.fit_params)
    predictions = self.model.predict(train_x)
    if self.model_type == "rgr":
      mean["train_acc"] = accuracy_score(np.round(predictions), train_y)
    else:
      mean["train_acc"] = accuracy_score(predictions, train_y)
    result_df.loc["mean"] = mean

    return result_df

  def run(self, prep_args={}, **kwargs):
    self.prep_args = prep_args
    self.train_df = self.preprocess(self.train_df, **self.prep_args)
    self.train_df = self.train_df.dropna()
    self.train_df = self.train_df.reset_index()
    self.train_df = self.train_df.drop(["index"], axis=1)
    return self.evaluate(**kwargs)

  def make_submission(self, test_df, submission_df):
    test_df = self.preprocess(test_df, **self.prep_args)

    # handle nan values
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp = imp.fit(test_df)
    test_df = imp.transform(test_df)

    
    preds = self.model.predict(test_df)
    submission_df["nerdiness"] = preds
    return submission_df


In [None]:
train_df[["age", "familysize"]].describe()

Unnamed: 0,age,familysize
count,15000.0,14681.0
mean,26.740867,2.744091
std,317.058436,24.109344
min,13.0,1.0
25%,17.0,2.0
50%,20.0,2.0
75%,27.0,3.0
max,38822.0,2919.0


In [None]:
train_df[["testelapse", "introelapse", "surveyelapse"]].describe()

Unnamed: 0,testelapse,introelapse,surveyelapse
count,15000.0,15000.0,15000.0
mean,387.965667,767.137733,2787.959
std,8513.03161,13835.948037,178595.5
min,1.0,1.0,3.0
25%,82.0,4.0,126.0
50%,106.0,10.0,164.0
75%,140.0,36.0,217.0
max,474572.0,855030.0,15166990.0


In [None]:
fraud_df = train_df[
    (train_df["age"] >= 100) | \
    (train_df["familysize"] >= 10) | \
    (train_df["testelapse"] >= 500) | \
    (train_df["introelapse"] >= 200) | \
    (train_df["surveyelapse"] >= 1000)
]
fraud_df[["age", "familysize", "testelapse", "introelapse", "surveyelapse", "nerdiness"]]

Unnamed: 0,age,familysize,testelapse,introelapse,surveyelapse,nerdiness
0,20,4.0,553,3,6,1
4,18,1.0,640,3,216,0
24,15,4.0,601,4,208,1
28,18,3.0,88,826,104,1
35,13,3.0,64,5558,280,0
...,...,...,...,...,...,...
14962,22,2.0,59,1108,123,0
14974,31,1.0,148,2,1043,0
14977,25,1.0,128,339,242,0
14991,80,3.0,382,408,198,1


In [None]:
fraud_df[fraud_df["nerdiness"] == 1][["age", "familysize", "testelapse", "introelapse", "surveyelapse", "nerdiness"]]

Unnamed: 0,age,familysize,testelapse,introelapse,surveyelapse,nerdiness
0,20,4.0,553,3,6,1
24,15,4.0,601,4,208,1
28,18,3.0,88,826,104,1
38,17,3.0,86,317,165,1
41,33,,81,637,125,1
...,...,...,...,...,...,...
14908,21,2.0,82,790,167,1
14918,30,13.0,75,7,150,1
14940,15,2.0,201,19723,290,1
14991,80,3.0,382,408,198,1


### Baseline

In [None]:
evaluator = Evaluator(
    **Model(train_df, "lgbm", "rgr").get_model(),
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.749922,0.755385,0.820156,0.786439,0.821222,0.346428,
2,0.743982,0.740375,0.816408,0.776535,0.824008,0.346583,
3,0.746483,0.751298,0.813378,0.781107,0.821759,0.349007,
4,0.751797,0.774987,0.80271,0.788605,0.820509,0.344981,
mean,0.748046,0.755511,0.813163,0.783171,0.821874,0.34675,0.849719


In [None]:
evaluator = Evaluator(
    **Model(train_df, "xgb", "rgr").get_model()
)
evaluator.run(prep_args={"normalize": False})



Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.732104,0.736286,0.814588,0.77346,0.801144,0.365563,
2,0.732729,0.724242,0.822719,0.770346,0.808841,0.363548,
3,0.733979,0.736012,0.813378,0.772764,0.802543,0.368768,
4,0.733042,0.755018,0.795122,0.774551,0.799655,0.365598,
mean,0.732963,0.73789,0.811452,0.77278,0.803046,0.365869,0.758675


In [None]:
evaluator = Evaluator(
    **Model(train_df, "rf", "rgr").get_model()
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.780556,0.790648,0.828508,0.809135,0.860718,0.325755,
2,0.783057,0.780032,0.83821,0.808075,0.863465,0.329456,
3,0.772741,0.781283,0.821248,0.800767,0.854853,0.333792,
4,0.778056,0.799789,0.820596,0.810059,0.85702,0.328009,
mean,0.778603,0.787938,0.82714,0.807009,0.859014,0.329253,1.0


In [None]:
evaluator = Evaluator(
    **Model(train_df, "et", "rgr").get_model()
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.781494,0.793787,0.825167,0.809173,0.868042,0.29196,
2,0.780869,0.780108,0.832473,0.80544,0.873808,0.294511,
3,0.773679,0.782236,0.82181,0.801535,0.866363,0.300228,
4,0.783057,0.808249,0.817886,0.813039,0.866361,0.294573,
mean,0.779775,0.791095,0.824334,0.807297,0.868644,0.295318,1.0


In [None]:
evaluator = Evaluator(
    **Model(train_df, "cat", "rgr").get_model()
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.762426,0.768394,0.825724,0.796028,0.840535,0.334397,
2,0.777118,0.768509,0.845668,0.805244,0.847041,0.333522,
3,0.765864,0.766563,0.83249,0.798168,0.836825,0.337978,
4,0.763676,0.784038,0.814634,0.799043,0.83202,0.33553,
mean,0.767271,0.771876,0.829629,0.799621,0.839105,0.335357,0.92146


## Preprocessing

### eliminate outliers

In [None]:
train_df = train_df[
    (train_df["age"] < 100) & \
    (train_df["familysize"] < 10) & \
    (train_df["testelapse"] < 500) & \
    (train_df["introelapse"] < 200) & \
    (train_df["surveyelapse"] < 1000)
]
train_df

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness,country_idx
1,1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,...,49,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1,15
2,2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,...,43,1.0,2.0,2.0,2.0,3.0,4.0,2.0,1,66
3,3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,...,17,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1,15
5,5,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,...,26,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,15
6,6,4.0,3.0,4.0,3.0,5.0,4.0,5.0,4.0,5.0,...,40,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,2.0,5.0,4.0,3.0,3.0,4.0,4.0,4.0,3.0,...,17,1.0,1.0,3.0,2.0,1.0,3.0,2.0,0,15
14996,14996,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,...,45,1.0,3.0,1.0,1.0,2.0,3.0,2.0,1,15
14997,14997,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,...,20,1.0,1.0,2.0,1.0,1.0,3.0,1.0,1,15
14998,14998,5.0,5.0,4.0,5.0,5.0,5.0,5.0,1.0,5.0,...,29,1.0,12.0,4.0,2.0,2.0,2.0,1.0,0,15


In [None]:
evaluator = Evaluator(
    **Model(train_df, "lgbm", "rgr").get_model(),
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.743048,0.748926,0.81117,0.778806,0.819228,0.344349,
2,0.740452,0.745399,0.81,0.776358,0.824785,0.346065,
3,0.761587,0.773136,0.80576,0.789111,0.828879,0.344554,
4,0.746662,0.753079,0.812625,0.781719,0.82217,0.346221,
mean,0.747937,0.755135,0.809889,0.781499,0.823765,0.345298,0.866599


In [None]:
evaluator = Evaluator(
    **Model(train_df, "xgb", "rgr").get_model(),
)
evaluator.run(prep_args={"normalize": False})



Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.72525,0.730792,0.803191,0.765283,0.798766,0.363057,
2,0.735632,0.733254,0.824667,0.776279,0.807137,0.364753,
3,0.738598,0.74533,0.801741,0.772507,0.806669,0.364811,
4,0.726632,0.733293,0.801993,0.766106,0.801986,0.365923,
mean,0.731528,0.735667,0.807898,0.770044,0.803639,0.364636,0.763048


In [None]:
evaluator = Evaluator(
    **Model(train_df, "rf", "rgr").get_model(),
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.769744,0.779607,0.818484,0.798573,0.853773,0.330349,
2,0.77716,0.780412,0.834,0.806316,0.856424,0.332469,
3,0.777531,0.797468,0.801741,0.799599,0.85502,0.333033,
4,0.767062,0.772191,0.826578,0.79846,0.853995,0.333628,
mean,0.772874,0.78242,0.820201,0.800737,0.854803,0.33237,1.0


In [None]:
evaluator = Evaluator(
    **Model(train_df, "et", "rgr").get_model(),
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.77234,0.784891,0.81516,0.799739,0.86407,0.296937,
2,0.778272,0.785805,0.826667,0.805718,0.864398,0.300527,
3,0.779014,0.8,0.801072,0.800535,0.866625,0.299837,
4,0.767062,0.77355,0.82392,0.797941,0.858082,0.304711,
mean,0.774172,0.786061,0.816705,0.800983,0.863294,0.300503,1.0


In [None]:
evaluator = Evaluator(
    **Model(train_df, "cat", "rgr").get_model(),
)
evaluator.run(prep_args={"normalize": False})

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.758621,0.763109,0.822473,0.79168,0.835435,0.334604,
2,0.757508,0.759509,0.825333,0.791054,0.83873,0.334642,
3,0.773823,0.782832,0.818486,0.800262,0.843943,0.333365,
4,0.759644,0.763692,0.824585,0.792971,0.834329,0.336563,
mean,0.762399,0.767286,0.822719,0.793992,0.838109,0.334793,0.932511


### After Normalization

In [None]:
normed_df = (train_df - train_df.min()) / (train_df.max() - train_df.min())

In [None]:
evaluator = Evaluator(
    **Model(normed_df, "lgbm", "rgr").get_model(),
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.74367,0.752588,0.809577,0.780043,0.816906,0.349409,
2,0.742107,0.738565,0.815261,0.77502,0.82442,0.346759,
3,0.746171,0.749871,0.815627,0.781368,0.819269,0.35119,
4,0.754611,0.780423,0.799458,0.789826,0.816963,0.347524,
mean,0.74664,0.755362,0.809981,0.781564,0.81939,0.34872,0.849406


In [None]:
evaluator = Evaluator(
    **Model(normed_df, "xgb", "rgr").get_model(),
)
evaluator.run()



Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.732729,0.737746,0.812918,0.77351,0.800424,0.365914,
2,0.732104,0.724417,0.820425,0.769438,0.808404,0.364171,
3,0.736793,0.737456,0.817875,0.775586,0.801825,0.368956,
4,0.740231,0.76188,0.799458,0.780217,0.801293,0.365471,
mean,0.735464,0.740375,0.812669,0.774688,0.802986,0.366128,0.760238


In [None]:
evaluator = Evaluator(
    **Model(normed_df, "rf", "rgr").get_model(),
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.782119,0.795589,0.823497,0.809302,0.855931,0.333029,
2,0.779619,0.774313,0.840505,0.806052,0.860658,0.335392,
3,0.773367,0.777077,0.830804,0.803043,0.85003,0.338937,
4,0.785245,0.805058,0.828184,0.816457,0.85364,0.336108,
mean,0.780088,0.788009,0.830747,0.808714,0.855065,0.335867,1.0


In [None]:
evaluator = Evaluator(
    **Model(normed_df, "et", "rgr").get_model(),
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.780556,0.792827,0.82461,0.808406,0.869494,0.294414,
2,0.786808,0.786292,0.835915,0.810345,0.874961,0.297065,
3,0.770866,0.779679,0.819562,0.799123,0.863998,0.30382,
4,0.781182,0.805008,0.81897,0.811929,0.863401,0.299784,
mean,0.779853,0.790952,0.824764,0.807451,0.867964,0.298771,1.0


In [None]:
evaluator = Evaluator(
    **Model(normed_df, "cat", "rgr").get_model(),
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.757737,0.767979,0.814588,0.790597,0.83134,0.340419,
2,0.761488,0.755208,0.831899,0.791701,0.838366,0.338559,
3,0.761175,0.762274,0.829117,0.794292,0.835788,0.33991,
4,0.7593,0.783342,0.80542,0.794228,0.828243,0.338813,
mean,0.759925,0.767201,0.820256,0.792704,0.833434,0.339425,0.919897


### hyeonbin's parameter

In [None]:
normed_df = (train_df - train_df.min()) / (train_df.max() - train_df.min())
normed_df

In [None]:
params = {
    'objective': 'binary', 
    'n_estimators': 1000,
    'learning_rate': 0.01, 
    'random_state': 42, 
    'metric': 'auc', 
    'verbose': -1, 
    'feature_pre_filter': False, 
    'lambda_l1': 1.001449223665896e-08, 
    'lambda_l2': 4.805472362670594e-05, 
    'num_leaves': 243, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.8823839979334422, 
    'bagging_freq': 5, 
    'min_child_samples': 5
}
evaluator = Evaluator(
    **Model(normed_df, "lgbm", "rgr", **params).get_model(),
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.786496,0.79182,0.840757,0.815555,0.867838,0.273317,
2,0.787434,0.779003,0.851406,0.813596,0.871997,0.273908,
3,0.781494,0.778638,0.848229,0.811945,0.864218,0.279318,
4,0.794936,0.811094,0.840108,0.825346,0.860369,0.274345,
mean,0.78759,0.790138,0.845125,0.816611,0.866105,0.275222,1.0
