In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

warnings.filterwarnings('ignore')
drive.mount("/content/drive")

os.chdir("drive/MyDrive/AI competition/Round 1")
os.listdir()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['model compare.ipynb',
 'competition_data',
 '__pycache__',
 'html',
 'submission',
 'logs.log',
 'playground.ipynb',
 '2022-08-04 feature EDA.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 'evaluator.py',
 'Optuna Optimization.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb"]

In [2]:
from evaluator import Evaluator, Model

train_df = pd.read_csv('competition_data/train.csv')
test_df = pd.read_csv("competition_data/test.csv")
submission_df = pd.read_csv("competition_data/sample_submission.csv")
train_df.head()

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,engnat,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness
0,0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,...,1.0,20,2.0,12.0,4.0,2.0,1.0,4.0,2.0,1
1,1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,...,1.0,49,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1
2,2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,...,2.0,43,1.0,2.0,2.0,2.0,3.0,4.0,2.0,1
3,3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,...,1.0,17,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1
4,4,4.0,4.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,...,2.0,18,2.0,12.0,1.0,2.0,1.0,1.0,2.0,0


## Baseline

In [43]:
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import (
    RandomForestClassifier, 
    RandomForestRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
)
from sklearn.impute import SimpleImputer

class Model():
  def __init__(self, model_name="lgbm", model_type="clf", **params):
    if model_name == "lgbm":
      if model_type == "clf":
        self.model = LGBMClassifier(**params)
      elif model_type == "rgr":
        self.model = LGBMRegressor(**params)
    elif model_name == "xgb":
      if model_type == "clf":
        self.model = XGBClassifier(**params)
      elif model_type == "rgr":
        self.model = XGBRegressor(**params)
    elif model_name == "et":
      if model_type == "clf":
        self.model = ExtraTreesClassifier(**params)
      elif model_type == "rgr":
        self.model = ExtraTreesRegressor(**params)
    elif model_name == "rf":
      if model_type == "clf":
        self.model = RandomForestClassifier(**params)
      elif model_type == "rgr":
        self.model = RandomForestRegressor(**params)

  def get_model(self):
    return self.model

class Evaluator():
  def __init__(self, model, train_df, n_folds=4, random_state=42):
    self.kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    self.model = model
    self.train_df = train_df

  def drop_col(self, df, col_list=["index", "country"]):
    return df.drop(col_list, axis=1)

  def index_col(self, df, col_list=["country"]):
    def _indexer(col):
      col_list = list(set(df[col]))
      col_map = {
          c: i 
          for i, c in enumerate(col_list)
      }
      return df[col].apply(lambda x: col_map[x])

    for col in col_list:
      df[f"{col}_idx"] = _indexer(col)
    return self.drop_col(df, col_list)

  def preprocess(self, df=None, mode="index"):
    assert mode in ("index", "drop")

    df = self.train_df if df is None else df

    if mode == "drop":
      return self.drop_col(df)
    elif mode == "index":
      df = self.index_col(df)
      return self.drop_col(df, col_list=["index"])

  def evaluate(self, metrics="all", train_acc=True):
    total_err = 0
    total_score = 0

    train_x=self.train_df.drop(['nerdiness'], axis=1)
    train_y=self.train_df['nerdiness']

    if metrics == "all":
      metrics = [
          "accuracy", "precision", "recall", 
          "f1-score", "roc_auc", "mae"
      ]
    metrics_functions_map = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
        "roc_auc": roc_auc_score,
        "mae": mean_absolute_error,
    }
    result_df = pd.DataFrame(
        columns = metrics + ["train_acc"]
    )

    for i, (train_index, val_index) in tqdm(enumerate(self.kf.split(train_x))):
      X_train, X_test = train_x.loc[train_index], train_x.loc[val_index]
      y_train, y_test = train_y.loc[train_index], train_y.loc[val_index]

      self.model.fit(X_train, y_train)
      predictions = self.model.predict(X_test)

      row = {}
      for metric in metrics:
        score = metrics_functions_map[metric](y_test, predictions)
        row[metric] = score
      result_df = result_df.append(
          row, ignore_index=True
      )

    result_df["fold"] = list(range(1, i+2))
    result_df = result_df.set_index("fold")

    ## add training accuracy
    mean = result_df.mean(axis=0)
    self.model.fit(train_x, train_y)
    predictions = self.model.predict(train_x)
    mean["train_acc"] = accuracy_score(predictions, train_y)
    result_df.loc["mean"] = mean

    return result_df

  def run(self, **kwargs):
    self.train_df = self.preprocess(self.train_df)
    self.train_df = self.train_df.dropna()
    self.train_df = self.train_df.reset_index()
    self.train_df = self.train_df.drop(["index"], axis=1)
    return self.evaluate(**kwargs)

  def make_submission(self, test_df, submission_df):
    test_df = self.preprocess(test_df)

    # handle nan values
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp = imp.fit(test_df)
    test_df = imp.transform(test_df)

    preds = self.model.predict(test_df)
    submission_df["nerdiness"] = preds
    return submission_df


In [37]:
evaluator = Evaluator(
    model=Model("lgbm", "clf").get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.760238,0.768668,0.819599,0.793317,0.751924,0.239762,
2,0.755236,0.748705,0.82903,0.786823,0.747963,0.244764,
3,0.749297,0.750641,0.822372,0.784871,0.740059,0.250703,
4,0.750547,0.774803,0.8,0.7872,0.741581,0.249453,
mean,0.753829,0.760704,0.81775,0.788053,0.745382,0.246171,0.851516


In [38]:
submission = evaluator.make_submission(test_df, submission_df)
submission.to_csv("submission/2022-08-05_LGBM_base.csv", index=False)

In [44]:
evaluator = Evaluator(
    model=Model("xgb", "clf").get_model(),
    train_df=train_df
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.735542,0.742843,0.80902,0.77452,0.725251,0.264458,
2,0.730541,0.722138,0.821572,0.768653,0.721569,0.269459,
3,0.731791,0.735309,0.808881,0.770343,0.722046,0.268209,
4,0.739606,0.762448,0.796748,0.779221,0.729245,0.260394,
mean,0.73437,0.740684,0.809055,0.773184,0.724528,0.26563,0.759534


In [45]:
submission = evaluator.make_submission(test_df, submission_df)
submission.to_csv("submission/2022-08-05_XGB_base.csv", index=False)

In [29]:
evaluator = Evaluator(
    model=Model("rf", "clf").get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.782432,0.788866,0.836303,0.811892,0.774887,0.217568,
2,0.787746,0.78,0.850258,0.813615,0.781585,0.212254,
3,0.772741,0.772539,0.838111,0.80399,0.764478,0.227259,
4,0.779306,0.801163,0.821138,0.811028,0.771721,0.220694,
mean,0.780556,0.785642,0.836453,0.810131,0.773168,0.219444,1.0


In [30]:
submission = evaluator.make_submission(test_df, submission_df)
submission.to_csv("submission/2022-08-05_RF_base.csv", index=False)

In [31]:
evaluator = Evaluator(
    model=Model("et", "clf").get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.783995,0.792173,0.834076,0.812585,0.776981,0.216005,
2,0.783682,0.772987,0.853701,0.811341,0.776782,0.216318,
3,0.779931,0.776919,0.847667,0.810753,0.771369,0.220069,
4,0.785871,0.803347,0.83252,0.817674,0.777412,0.214129,
mean,0.78337,0.786357,0.841991,0.813088,0.775636,0.21663,1.0


In [32]:
submission = evaluator.make_submission(test_df, submission_df)
submission.to_csv("submission/2022-08-05_ET_base.csv", index=False)

## Optimization

### sojeong's Optimization

In [None]:
params = {'num_leaves': 1346, 'n_estimators': 129, 'max_bin': 31, 'learning_rate': 0.11872771895424405}

evaluator = Evaluator(
    model=Model("lgbm", "clf", **params).get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.784933,0.793122,0.834633,0.813348,0.777972,0.215067,
2,0.782432,0.777131,0.842226,0.80837,0.776539,0.217568,
3,0.775555,0.778478,0.833614,0.805103,0.768216,0.224445,
4,0.779931,0.79979,0.824932,0.812166,0.771772,0.220069,
mean,0.780713,0.78713,0.833851,0.809747,0.773625,0.219287,1.0


In [None]:
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv("2022-08-05_LGBM_optim_20.csv", index=False)

### lgbm optuna 2nd round (n=200)

In [47]:
params = {
  "objective": "binary",
  "n_estimators": 200,
  'learning_rate': 0.050815587001436395,
  'num_leaves': 259,
  'reg_alpha': 0.00011167629235521655,
  'reg_lambda': 1.784826134719029e-08
}
evaluator = Evaluator(
    model=Model("lgbm", "clf", **params).get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.783682,0.785714,0.845212,0.814378,0.775065,0.216318,
2,0.784308,0.77551,0.850258,0.811166,0.777808,0.215692,
3,0.778993,0.778297,0.842608,0.809177,0.770952,0.221007,
4,0.776493,0.800213,0.81626,0.808157,0.769282,0.223507,
mean,0.780869,0.784934,0.838585,0.810719,0.773277,0.219131,1.0


In [48]:
submission_df = evaluator.make_submission(test_df, submission_df)
submission_df.to_csv("2022-08-05_LGBM_optim_200.csv", index=False)

### hyeonbin's optimization

In [49]:
params = {
    'objective': 'binary', 
    'learning_rate': 0.01, 
    'random_state': 42, 
    'metric': 'auc', 
    'verbose': -1, 
    'feature_pre_filter': False, 
    'lambda_l1': 1.001449223665896e-08, 
    'lambda_l2': 4.805472362670594e-05, 
    'num_leaves': 243, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.8823839979334422, 
    'bagging_freq': 5, 
    'min_child_samples': 5,
}

In [51]:
evaluator = Evaluator(
    model=Model("lgbm", "clf", **params).get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.766177,0.749524,0.876392,0.808008,0.750741,0.233823,
2,0.755861,0.723721,0.892714,0.799384,0.742373,0.244139,
3,0.754923,0.730644,0.885891,0.800813,0.738368,0.245077,
4,0.766177,0.760076,0.868835,0.810824,0.747564,0.233823,
mean,0.760785,0.740991,0.880958,0.804757,0.744761,0.239215,0.88262


### lgbm optuna 3rd round

In [4]:
params = {
  "objective": "binary",
  "n_estimators": 500,
  'learning_rate': 0.026332779906149555,
  'num_leaves': 955,
  'reg_alpha': 6.90331310095056e-08,
  'reg_lambda': 2.30837413695962e-06
}
evaluator = Evaluator(
    model=Model("lgbm", "clf", **params).get_model(),
    train_df=train_df,
)
evaluator.run()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.79181,0.797368,0.843541,0.819805,0.784565,0.20819,
2,0.776805,0.771791,0.83821,0.80363,0.770753,0.223195,
3,0.777118,0.777315,0.839798,0.807349,0.769195,0.222882,
4,0.773054,0.795251,0.816802,0.805882,0.765122,0.226946,
mean,0.779697,0.785431,0.834588,0.809167,0.772409,0.220303,1.0
