In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

# Silence every Python warning for the rest of the session (this also hides
# deprecation notices emitted by the ML libraries used below).
warnings.filterwarnings('ignore')

# Mount Google Drive; the competition project lives under MyDrive.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Build the project path from the absolute mount point rather than from the
# current working directory, so re-running this cell after the first chdir
# does not fail with FileNotFoundError.
PROJECT_DIR = os.path.join(
    DRIVE_MOUNT_POINT, "MyDrive/competition/2022-AI-competition-Round1"
)
os.chdir(PROJECT_DIR)
os.listdir()

Mounted at /content/drive


['competition_data',
 'html',
 'submission',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb',
 '2022-08-06 Optimize ET.ipynb',
 '2022-08-06 Regressor model compare.ipynb',
 'catboost_info',
 'evaluator',
 '.git',
 '.gitignore',
 'README.md',
 'Updated CLF.ipynb',
 '2022-08-17 Ensemble Module.ipynb',
 '2022-08-11 Models(XGB_ET) Optimization.ipynb',
 'GitHub Connection.ipynb']

In [2]:
!pip install -r evaluator/requirements.txt


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 92 kB/s 
[?25hCollecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 75.2 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 11.5 MB/s 
[?25hCollecting alembic
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 74.6 MB/s 
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.2.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.4 MB/s 
Collecting stevedore>=2.0.1
  Downloading stevedore-3.5.0-py3-no

In [3]:
from evaluator.evaluator import Evaluator, Model

# Load the three competition tables, in order: training data, test data and
# the sample submission used as the output template.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "competition_data/train.csv",
        "competition_data/test.csv",
        "competition_data/sample_submission.csv",
    )
)

In [None]:
from sklearn.ensemble import VotingRegressor

In [19]:
# Tuned LightGBM hyper-parameters.
# NOTE(review): a binary objective / AUC metric is combined with the "rgr"
# model type — presumably so that predict() returns scores; confirm intent.
lgbm_params = dict(
    objective="binary",
    metric="auc",
    n_estimators=500,
    learning_rate=0.026332779906149555,
    num_leaves=955,
    reg_alpha=6.90331310095056e-08,
    reg_lambda=2.30837413695962e-06,
    feature_fraction=0.4,
    bagging_fraction=0.8823839979334422,
    bagging_freq=5,
    min_child_samples=5,
)

# Build the estimator through the project's Model wrapper and keep only the
# estimator object from the returned dict.
lgbm_model = Model(train_df, "lgbm", "rgr", **lgbm_params).get_model()["model"]

In [None]:
# Tuned hyper-parameters for the "et" model.
et_params = dict(
    n_estimators=292,
    max_depth=45,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=0.729627795893577,
)

# Build the estimator through the project's Model wrapper and keep only the
# estimator object from the returned dict.
et_model = Model(train_df, "et", "rgr", **et_params).get_model()["model"]

In [None]:
# Tuned XGBoost hyper-parameters (squared-error objective).
# NOTE(review): "average" does not look like an XGBoost training parameter;
# confirm whether it has any effect or is merely passed through.
xgb_params = {
    "objective": "reg:squarederror",
    "average": "micro",
    "n_estimators": 746,
    "learning_rate": 0.0181863245309935,
    "gamma": 0,
    "max_depth": 11,
    "lambda": 1,
    "alpha": 4,
    "subsample": 0.7231465475519483,
}

# Build the estimator through the project's Model wrapper and keep only the
# estimator object from the returned dict.
xgb_model = Model(train_df, "xgb", "rgr", **xgb_params).get_model()["model"]

In [20]:
# Tuned hyper-parameters for the "cat" model.
cat_params = dict(
    learning_rate=0.01001831305173243,
    bagging_temperature=5.20036760425302,
    n_estimators=120,
    max_depth=16,
    random_strength=0,
    colsample_bylevel=0.984260923537832,
    l2_leaf_reg=1.0732827610812122e-05,
    min_child_samples=100,
    max_bin=302,
    od_type='Iter',
)

# Build the estimator through the project's Model wrapper and keep only the
# estimator object from the returned dict.
cat_model = Model(train_df, "cat", "rgr", **cat_params).get_model()["model"]

In [None]:
# Combine the four tuned models into one voting ensemble; no weights are
# passed, so scikit-learn's default (uniform averaging) applies.
ensemble_model = VotingRegressor(
    estimators=list(
        zip(
            ("lgbm", "et", "xgb", "cat"),
            (lgbm_model, et_model, xgb_model, cat_model),
        )
    )
)

In [None]:
# Display the assembled ensemble's repr to check each member's hyper-parameters.
ensemble_model

VotingRegressor(estimators=[('lgbm',
                             LGBMRegressor(bagging_fraction=0.8823839979334422,
                                           bagging_freq=5, feature_fraction=0.4,
                                           learning_rate=0.026332779906149555,
                                           metric='auc', min_child_samples=5,
                                           n_estimators=500, num_leaves=955,
                                           objective='binary',
                                           reg_alpha=6.90331310095056e-08,
                                           reg_lambda=2.30837413695962e-06)),
                            ('et',
                             ExtraTreesRegressor(max_depth=45,
                                                 max_features=0.729627795893577,
                                                 n_estimators=292)),
                            ('xgb',
                             XGBRegressor(alpha=4, average='micro', 

In [None]:
# Wrap the voting ensemble in the project's Evaluator, using the same
# keyword set that Model.get_model() results are unpacked into elsewhere.
evaluator = Evaluator(
    **{
        "train_df": train_df,
        "model": ensemble_model,
        "model_name": "ensemble",
        "model_type": "rgr",
    }
)

In [None]:
# Run the evaluation. The cell output shows per-iteration training logs
# followed by a per-fold metrics table that ends with a "mean" row.
evaluator.run()

0:	learn: 0.4921645	total: 2.5s	remaining: 4m 57s
1:	learn: 0.4878246	total: 4.93s	remaining: 4m 50s
2:	learn: 0.4835046	total: 7.34s	remaining: 4m 46s
3:	learn: 0.4793689	total: 9.78s	remaining: 4m 43s
4:	learn: 0.4751071	total: 12.2s	remaining: 4m 41s
5:	learn: 0.4709259	total: 14.7s	remaining: 4m 38s
6:	learn: 0.4668351	total: 17.1s	remaining: 4m 35s
7:	learn: 0.4627988	total: 19.4s	remaining: 4m 31s
8:	learn: 0.4587634	total: 21.8s	remaining: 4m 28s
9:	learn: 0.4548218	total: 24.3s	remaining: 4m 27s
10:	learn: 0.4507604	total: 26.7s	remaining: 4m 24s
11:	learn: 0.4468384	total: 29.3s	remaining: 4m 23s
12:	learn: 0.4428657	total: 31.9s	remaining: 4m 22s
13:	learn: 0.4389955	total: 34.3s	remaining: 4m 19s
14:	learn: 0.4351079	total: 36.7s	remaining: 4m 17s
15:	learn: 0.4313491	total: 39.2s	remaining: 4m 14s
16:	learn: 0.4277602	total: 41.7s	remaining: 4m 12s
17:	learn: 0.4240642	total: 44.1s	remaining: 4m 10s
18:	learn: 0.4204291	total: 46.6s	remaining: 4m 7s
19:	learn: 0.4167167	tot

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.786496,0.785531,0.85245,0.817623,0.875918,0.299258,
2,0.790247,0.774872,0.866896,0.818305,0.880087,0.301673,
3,0.780556,0.770739,0.86172,0.813694,0.872539,0.306593,
4,0.790247,0.798272,0.851491,0.824023,0.872067,0.300881,
mean,0.786887,0.782354,0.858139,0.818411,0.875153,0.302101,1.0


In [None]:
from sklearn.impute import SimpleImputer


# Preprocess a copy into a new name so the raw `test_df` stays untouched:
# re-running this cell, or passing `test_df` to a later cell, must not
# operate on an already-preprocessed frame.
test_preprocessed = evaluator.preprocess(test_df.copy())

# handle nan values: fill every column with its most frequent value.
# NOTE(review): the imputer is fitted on the test set itself — confirm this
# is consistent with how missing values are handled for the training data.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
test_features = pd.DataFrame(
  imp.fit_transform(test_preprocessed),
  columns=test_preprocessed.columns
)

# Predict with the fitted ensemble and write the scores into the template.
preds = evaluator.model.predict(test_features)
submission_df["nerdiness"] = preds

submission_df.to_csv("submission/2022-08-17_ENSEMBLE_v0_1.csv", index=False)

## Modularization

In [4]:
from evaluator.evaluator import Model, Evaluator, Optimizer
from sklearn.ensemble import VotingRegressor

class MyEvaluator(Evaluator):
  """Notebook-local subclass of ``Evaluator`` that adds no behaviour.

  Construction is inherited unchanged, so every positional and keyword
  argument goes straight to ``Evaluator.__init__``.
  """

class MyModel(Model):
  """``Model`` extended with in-place hyper-parameter optimisation.

  Construction is inherited unchanged from ``Model``.
  """

  def optimize(self, initial_params, **kwargs):
    """Search for the best parameters and rebuild this model with them.

    Args:
      initial_params: search-space specification handed to ``MyOptimizer``
        (iterated there as ``(param, dtype, value)`` triples).
      **kwargs: forwarded unchanged to the optimizer's ``run`` method.

    Returns:
      None. The instance is re-initialised in place.
    """
    search = MyOptimizer(
        self.train_df, initial_params, self.model_name, self.model_type
    )
    # Keep the optimizer on the instance so it can be inspected afterwards.
    self.optimizer = search
    tuned_params = search.run(**kwargs)

    # Re-run the constructor with the same data/name/type but the tuned
    # parameters, replacing whatever the first construction set up.
    self.__init__(
        self.train_df, self.model_name, self.model_type, **tuned_params
    )

class MyOptimizer(Optimizer):
  """``Optimizer`` whose objective is the mean ROC AUC reported by the evaluator."""

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

  def objective(self, trial):
    """Sample one parameter set from ``trial`` and score it.

    Each entry of ``self.initial_params`` is a ``(param, dtype, value)``
    triple, where ``dtype`` selects how ``value`` is interpreted:

      * ``"static"``      – ``value`` is used as-is.
      * ``"int"``         – ``value`` is unpacked into ``trial.suggest_int``.
      * ``"float"``       – ``value`` is unpacked as the bounds of a uniform float.
      * ``"log"``         – ``value`` is unpacked as the bounds of a log-uniform float.
      * ``"categorical"`` – ``value`` is the sequence of choices.

    Args:
      trial: the Optuna trial used to draw parameter suggestions.

    Returns:
      The "mean" entry of the "roc_auc" column of the evaluation result.

    Raises:
      NameError: if ``dtype`` is not one of the supported kinds.
    """
    ## Tuning Parameters
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        self.params[param] = value
      elif dtype == "int":
        self.params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        # suggest_float replaces the deprecated suggest_uniform.
        self.params[param] = trial.suggest_float(param, *value)
      elif dtype == "log":
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        self.params[param] = trial.suggest_float(param, *value, log=True)
      elif dtype == "categorical":
        self.params[param] = trial.suggest_categorical(param, value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")

    ## Objective Metric
    # Build a fresh model with the sampled parameters and evaluate it;
    # train_acc=False is passed through to the evaluator's run().
    result_df = MyEvaluator(
        **MyModel(self.train_df, self.model_name, self.model_type, **self.params).get_model()
    ).run(train_acc=False)

    return result_df["roc_auc"]["mean"]

In [17]:
import yaml

# Read the stored best-parameter configuration for every model.
# NOTE: FullLoader is used on a project-owned file; prefer yaml.safe_load if
# this path could ever point at untrusted YAML.
with open('evaluator/best_models_config.yaml') as config_file:
  model_config = yaml.load(config_file, Loader=yaml.FullLoader)

In [18]:
# Inspect the loaded configuration (a dict keyed by model short name).
model_config

{'lgbm': [{'name': 'LightGBM',
   'type': 'rgr',
   'best_score': 0.880712,
   'best_params': {'objective': 'binary',
    'metric': 'auc',
    'n_estimators': 500,
    'learning_rate': 0.026332779906149555,
    'num_leaves': 955,
    'reg_alpha': 6.90331310095056e-08,
    'reg_lambda': 2.30837413695962e-06,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.8823839979334422,
    'bagging_freq': 5,
    'min_child_samples': 5}}],
 'xgb': [{'name': 'XGBoost',
   'type': 'rgr',
   'best_score': 0.869358,
   'best_params': {'objective': 'reg:squarederror',
    'average': 'micro',
    'n_estimators': 746,
    'learning_rate': 0.0181863245309935,
    'gamma': 0,
    'max_depth': 11,
    'lambda': 1,
    'alpha': 4,
    'subsample': 0.7231465475519483}}],
 'cat': [{'name': 'CatBoost',
   'type': 'rgr',
   'best_score': 0.86791,
   'best_params': {'learning_rate': 0.01001831305173243,
    'bagging_temperature': 5.20036760425302,
    'n_estimators': 120,
    'max_depth': 16,
    'random_stre

In [45]:
from sklearn.ensemble import VotingRegressor

class VotingModel(VotingRegressor):
  """Voting ensemble that averages regressors and classifiers together.

  Members tagged ``"rgr"`` contribute ``predict(X)``; every other member
  contributes the second column of ``predict_proba(X)``.

  Args:
    model_types: one type tag per entry of ``estimators``, in the same order.
    estimators: list of ``(name, estimator)`` tuples, as for VotingRegressor.
  """

  def __init__(self, model_types, estimators):
    self.model_types = model_types
    super().__init__(estimators)

  def _validate_estimators(self):
    """Check the estimator list and return ``(names, estimators)``.

    Verifies that the list is non-empty, that the names are valid and that
    at least one estimator is not ``"drop"``.
    NOTE(review): presumably overridden so that the base class's
    estimator-type check does not reject classifier members — confirm
    against the installed scikit-learn version.

    Raises:
      ValueError: if ``estimators`` is empty or every estimator is dropped.
    """
    if self.estimators is None or len(self.estimators) == 0:
      raise ValueError(
          "Invalid 'estimators' attribute, 'estimators' should be a list"
          " of (string, estimator) tuples."
      )
    names, estimators = zip(*self.estimators)
    # defined by MetaEstimatorMixin
    self._validate_names(names)

    has_estimator = any(est != "drop" for est in estimators)
    if not has_estimator:
      raise ValueError(
          "All estimators are dropped. At least one is required "
          "to be an estimator."
      )

    return names, estimators

  def _active_model_types(self):
    """Return the type tags of the estimators that are not ``"drop"``.

    ``estimators_`` only holds the non-dropped members, so the tags must be
    filtered the same way to stay aligned with it.

    Raises:
      ValueError: if ``model_types`` and ``estimators`` differ in length.
    """
    if len(self.model_types) != len(self.estimators):
      raise ValueError(
          "'model_types' must contain exactly one entry per estimator."
      )
    return [
      model_type
      for model_type, (_, est) in zip(self.model_types, self.estimators)
      if est != "drop"
    ]

  def predict(self, X):
    """Return the (optionally weighted) average of the members' outputs.

    Args:
      X: samples passed unchanged to every fitted member.

    Returns:
      One averaged value per sample.
    """
    per_model = [
      est.predict(X) if model_type == "rgr" else est.predict_proba(X)[:, 1]
      for model_type, est in zip(self._active_model_types(), self.estimators_)
    ]
    # Transpose to (samples, members) so the average runs across members.
    return np.average(
      np.asarray(per_model).T,
      axis=1,
      weights=self._weights_not_none
    )


In [46]:
import yaml

class EnsembleModel():
  """Builds a ``VotingModel`` from the best parameters stored in a YAML file.

  Args:
    train_df: training frame handed to every member ``Model`` and returned
      by ``get_model``.
    models: iterable of model keys to include; each must be a top-level key
      of the configuration file.
    config_path: YAML file mapping each model key to a list whose first
      entry provides ``type`` and ``best_params``.

  Raises:
    KeyError: if a requested model key is missing from the configuration.
  """

  def __init__(
    self,
    train_df,
    models=("lgbm", "xgb", "cat", "rf", "et"),
    config_path='evaluator/best_models_config.yaml',
  ):
    # NOTE: FullLoader is used on a project-owned file; prefer yaml.safe_load
    # if config_path could ever point at untrusted YAML.
    with open(config_path, encoding="utf-8") as f:
      model_configs = yaml.load(f, Loader=yaml.FullLoader)

    self.train_df = train_df

    estimators = []
    model_types = []
    for model_name in models:
      try:
        # Only the first configuration listed for each model is used.
        configs = model_configs[model_name][0]
      except KeyError as err:
        raise KeyError(
          f"No configuration for model '{model_name}' in {config_path}"
        ) from err
      model_type = configs["type"]
      params = configs["best_params"]

      estimators.append((
          model_name,
          Model(train_df, model_name, model_type, **params).get_model()['model']
      ))
      model_types.append(model_type)

    self.ensemble_model = VotingModel(
        estimators=estimators,
        model_types=model_types,
    )

  def get_model(self):
    """Return the keyword arguments that describe this ensemble.

    The dict has the keys ``train_df``, ``model``, ``model_name`` and
    ``model_type`` and is meant to be unpacked into ``Evaluator``.
    """
    return {
        'train_df': self.train_df,
        'model': self.ensemble_model,
        'model_name': 'ensemble',
        'model_type': 'rgr',
    }

In [None]:
# Keep the Evaluator instance itself. run() returns the results table, so
# chaining `.run()` onto the constructor would bind `evaluator` to that table
# and the following `evaluator.make_submission(...)` call would fail.
evaluator = Evaluator(
    **EnsembleModel(train_df, models=['lgbm', 'xgb', 'et', 'rf']).get_model()
)
ensemble_result_df = evaluator.run()
ensemble_result_df

In [None]:
# Let the evaluator produce the submission frame, then write it to disk.
# NOTE(review): `evaluator` must be the Evaluator instance here, not the
# return value of `.run()` — confirm before running.
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv(
    "submission/2022-08-20_ENSEMBLE_v0_2.csv",
    index=False,
)