In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

warnings.filterwarnings('ignore')
drive.mount("/content/drive")

os.chdir("drive/MyDrive/AI competition/Round 1")
os.listdir()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['competition_data',
 '__pycache__',
 'html',
 'submission',
 'logs.log',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb',
 'evaluator.py',
 '2022-08-06 Optimize ET.ipynb',
 '2022-08-06 Regressor model compare.ipynb',
 '2022-08-09 Data Preprocessing.ipynb']

In [None]:
# !pip install optuna
!pip install catboost

In [6]:
from evaluator import Evaluator, Model

train_df = pd.read_csv('competition_data/train.csv')
test_df = pd.read_csv("competition_data/test.csv")
submission_df = pd.read_csv("competition_data/sample_submission.csv")
train_df.head()

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,engnat,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness
0,0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,...,1.0,20,2.0,12.0,4.0,2.0,1.0,4.0,2.0,1
1,1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,...,1.0,49,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1
2,2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,...,2.0,43,1.0,2.0,2.0,2.0,3.0,4.0,2.0,1
3,3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,...,1.0,17,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1
4,4,4.0,4.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,...,2.0,18,2.0,12.0,1.0,2.0,1.0,1.0,2.0,0


In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from collections import defaultdict
import pandas as pd
import numpy as np
# from tqdm.notebook import tqdm

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import (
    RandomForestClassifier, 
    RandomForestRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
)
from sklearn.impute import SimpleImputer
import optuna

class Model():
  def __init__(self, train_df, model_name="lgbm", model_type="clf", **params):
    self.model_name = model_name
    self.model_type = model_type
    self.train_df = train_df
    self.model = None
    if model_name == "lgbm":
      if model_type == "clf":
        self.model = LGBMClassifier(**params)
      elif model_type == "rgr":
        self.model = LGBMRegressor(**params)
    elif model_name == "xgb":
      if model_type == "clf":
        self.model = XGBClassifier(**params)
      elif model_type == "rgr":
        self.model = XGBRegressor(**params)
    elif model_name == "et":
      if model_type == "clf":
        self.model = ExtraTreesClassifier(**params)
      elif model_type == "rgr":
        self.model = ExtraTreesRegressor(**params)
    elif model_name == "rf":
      if model_type == "clf":
        self.model = RandomForestClassifier(**params)
      elif model_type == "rgr":
        self.model = RandomForestRegressor(**params)
    elif model_name == "cat":
      if model_type == "clf":
        self.model = CatBoostClassifier(**params)
      elif model_type == "rgr":
        self.model = CatBoostRegressor(**params)
    else:
      raise NameError("model_name must be in ('lgbm', 'xgb', 'rf', 'et', 'cat')")
    
    if self.model is None:
      raise NameError("model_type must be in ('clf', 'rgr')")

  def get_model(self):
    return {
        "train_df": self.train_df,
        "model": self.model,
        "model_name": self.model_name,
        "model_type": self.model_type,
    }

  def optimize(self, initial_params, **kwargs):
    self.optimizer = Optimizer(
        self.train_df, 
        initial_params, 
        self.model_name,
        self.model_type,
    )
    best_params = self.optimizer.run(**kwargs)
    self.__init__(self.train_df, self.model_name, self.model_type, **best_params)

class Optimizer():
  def __init__(self, train_df, initial_params, model_name, model_type, random_seed=42):
    self.params = {}
    self.train_df = train_df
    self.initial_params = initial_params
    self.model_name = model_name
    self.model_type = model_type
    self.random_seed = random_seed

  def objective(self, trial):
    ## Tuning Parmeters
    for param, dtype, value in self.initial_params:
      if dtype == "static":
        self.params[param] = value
      elif dtype == "int":
        self.params[param] = trial.suggest_int(param, *value)
      elif dtype == "float":
        self.params[param] = trial.suggest_uniform(param, *value)
      elif dtype == "log":
        self.params[param] = trial.suggest_loguniform(param, *value)
      elif dtype == "categorical":
        self.params[param] = trial.suggest_categorical(param, *value)
      else:
        raise NameError("dtype must be one of ('static', 'int', 'float', 'log', 'categorical')")

    ## Objective Metric
    result_df = Evaluator(
        **Model(self.train_df, self.model_name, self.model_type, **self.params).get_model()
    ).run(train_acc=False)

    return result_df["roc_auc"]["mean"]

  def optimize(self, n_trials=100, sampling="TPE"):
    if sampling == "random":
      sampler = optuna.samplers.RandomSampler(seed=self.random_seed)
    elif sampling == "TPE":
      sampler = optuna.samplers.TPESampler()

    self.opt = optuna.create_study(
        direction='maximize',
        sampler=sampler,
    )
    self.opt.optimize(self.objective, n_trials=n_trials)

  def analyze(self):
    optuna.visualization.plot_param_importances(self.opt)
    optuna.visualization.plot_optimization_history(self.opt)
    optuna.visualization.plot_slice(self.opt)

  def best_params(self):
    print(self.opt.best_trial.value)
    print(self.opt.best_trial.params)
    return self.opt.best_trial.params

  def run(self, **kwargs):
    self.optimize(**kwargs)
    self.analyze()
    return self.best_params()

class Evaluator():
  def __init__(self, model, train_df, n_folds=4, random_state=42, model_name=None, model_type="clf"):
    self.kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    self.model = model
    self.model_name = model_name
    self.model_type = model_type
    self.train_df = train_df
    if self.model_name == 'cat':
      self.fit_params = {'silent': True}
    else:
      self.fit_params = {}

  def drop_col(self, df, col_list=["index", "country"]):
    return df.drop(col_list, axis=1)

  def index_col(self, df, col_list=["country"]):
    def _indexer(col):
      col_list = list(set(df[col]))
      col_map = {
          c: i 
          for i, c in enumerate(col_list)
      }
      return df[col].apply(lambda x: col_map[x])

    for col in col_list:
      df[f"{col}_idx"] = _indexer(col)
    return self.drop_col(df, col_list)

  def preprocess(self, df=None, mode="index"):
    assert mode in ("index", "drop")

    df = self.train_df if df is None else df

    if mode == "drop":
      return self.drop_col(df)
    elif mode == "index":
      df = self.index_col(df)
      return self.drop_col(df, col_list=["index"])

  def evaluate(self, metrics="all", train_acc=True):
    total_err = 0
    total_score = 0

    train_x=self.train_df.drop(['nerdiness'], axis=1)
    train_y=self.train_df['nerdiness']

    if metrics == "all":
      metrics = [
          "accuracy", "precision", "recall", 
          "f1-score", "roc_auc", "mae"
      ]
    metrics_functions_map = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
        "roc_auc": roc_auc_score,
        "mae": mean_absolute_error,
    }
    class_metrics = {
        "accuracy",
        "precision",
        "recall",
        "f1-score",
    }
    result_df = pd.DataFrame(
        columns = metrics + ["train_acc"]
    )

    for i, (train_index, val_index) in enumerate(self.kf.split(train_x)):
      X_train, X_test = train_x.loc[train_index], train_x.loc[val_index]
      y_train, y_test = train_y.loc[train_index], train_y.loc[val_index]

      self.model.fit(X_train, y_train, **self.fit_params)
      predictions = self.model.predict(X_test)

      row = {}
      for metric in metrics:
        if self.model_type == "rgr" and metric in class_metrics:
          score = metrics_functions_map[metric](
              y_test,
              np.round(predictions)
          )
        else:
          score = metrics_functions_map[metric](y_test, predictions)
        row[metric] = score
      result_df = result_df.append(
          row, ignore_index=True
      )

    result_df["fold"] = list(range(1, i+2))
    result_df = result_df.set_index("fold")

    ## add training accuracy
    mean = result_df.mean(axis=0)
    self.model.fit(train_x, train_y, **self.fit_params)
    predictions = self.model.predict(train_x)
    if self.model_type == "rgr":
      mean["train_acc"] = accuracy_score(np.round(predictions), train_y)
    else:
      mean["train_acc"] = accuracy_score(predictions, train_y)
    result_df.loc["mean"] = mean

    return result_df

  def run(self, **kwargs):
    self.train_df = self.preprocess(self.train_df)
    self.train_df = self.train_df.dropna()
    self.train_df = self.train_df.reset_index()
    self.train_df = self.train_df.drop(["index"], axis=1)
    return self.evaluate(**kwargs)

  def make_submission(self, test_df, submission_df):
    test_df = self.preprocess(test_df)

    # handle nan values
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp = imp.fit(test_df)
    test_df = imp.transform(test_df)

    
    preds = self.model.predict(test_df)
    submission_df["nerdiness"] = preds
    return submission_df


### Baseline

In [10]:
evaluator = Evaluator(
    **Model(train_df, "lgbm", "rgr").get_model()
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.748359,0.75767,0.811247,0.783544,0.817951,0.347657,
2,0.747108,0.745273,0.814114,0.778174,0.824822,0.346127,
3,0.746171,0.748076,0.819562,0.782189,0.823146,0.348788,
4,0.75086,0.77521,0.8,0.78741,0.819599,0.345038,
mean,0.748124,0.756557,0.811231,0.782829,0.82138,0.346902,0.847296


In [11]:
evaluator = Evaluator(
    **Model(train_df, "xgb", "rgr").get_model()
)
evaluator.run()



Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.731479,0.736975,0.811247,0.77233,0.800214,0.366041,
2,0.732729,0.724469,0.822146,0.770223,0.806941,0.364682,
3,0.733667,0.734921,0.815065,0.772921,0.802666,0.368783,
4,0.738668,0.758585,0.802168,0.779768,0.801148,0.36577,
mean,0.734136,0.738738,0.812656,0.773811,0.802742,0.366319,0.758284


In [13]:
evaluator = Evaluator(
    **Model(train_df, "rf", "rgr").get_model()
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.780244,0.78992,0.829065,0.809019,0.861374,0.325817,
2,0.784933,0.780436,0.842226,0.810155,0.86698,0.328781,
3,0.772741,0.77949,0.824621,0.80142,0.853158,0.332766,
4,0.77743,0.799894,0.81897,0.80932,0.85545,0.328543,
mean,0.778837,0.787435,0.82872,0.807478,0.859241,0.328977,1.0


In [14]:
evaluator = Evaluator(
    **Model(train_df, "et", "rgr").get_model()
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.778681,0.793103,0.819599,0.806134,0.867521,0.293004,
2,0.786183,0.786681,0.83362,0.809471,0.873315,0.295177,
3,0.770866,0.780579,0.817875,0.798792,0.864925,0.301132,
4,0.78462,0.810419,0.817886,0.814135,0.867699,0.29387,
mean,0.780088,0.792696,0.822245,0.807133,0.868365,0.295796,1.0


In [15]:
evaluator = Evaluator(
    **Model(train_df, "cat", "rgr").get_model()
)
evaluator.run()

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.760863,0.768629,0.821269,0.794078,0.838506,0.336506,
2,0.77493,0.766823,0.843373,0.803279,0.84182,0.334413,
3,0.760863,0.76324,0.826307,0.793522,0.835118,0.338622,
4,0.765239,0.784304,0.817886,0.800743,0.831804,0.335227,
mean,0.765474,0.770749,0.827209,0.797905,0.836812,0.336192,0.921772


## Preprocessing

### Normalization

In [None]:
class MyEvaluator(Evaluator):
  def __init__(self, **kwargs):
    Evaluator.__init__(self, **kwargs)

  def featureNormalize(self, df):
    

  def preprocess(self, df=None, mode="index"):
    assert mode in ("index", "drop")

    df = self.train_df if df is None else df

    if mode == "drop":
      return self.drop_col(df)
    elif mode == "index":
      df = self.index_col(df)

    


In [24]:
for i in range(7):
  print(train_df.describe().loc["max"][i*10:(i+1)*10])

index    14999.0
Q1           5.0
Q2           5.0
Q3           5.0
Q4           5.0
Q5           5.0
Q6           5.0
Q7           5.0
Q8           5.0
Q9           5.0
Name: max, dtype: float64
Q10    5.0
Q11    5.0
Q12    5.0
Q13    5.0
Q14    5.0
Q15    5.0
Q16    5.0
Q17    5.0
Q18    5.0
Q19    5.0
Name: max, dtype: float64
Q20                    5.0
Q21                    5.0
Q22                    5.0
Q23                    5.0
Q24                    5.0
Q25                    5.0
Q26                    5.0
introelapse       855030.0
testelapse        474572.0
surveyelapse    15166994.0
Name: max, dtype: float64
TIPI1     5.0
TIPI2     5.0
TIPI3     5.0
TIPI4     5.0
TIPI5     5.0
TIPI6     5.0
TIPI7     5.0
TIPI8     5.0
TIPI9     5.0
TIPI10    5.0
Name: max, dtype: float64
VCL1     1.0
VCL2     1.0
VCL3     1.0
VCL4     1.0
VCL5     1.0
VCL6     1.0
VCL7     1.0
VCL8     1.0
VCL9     1.0
VCL10    1.0
Name: max, dtype: float64
VCL11        1.0
VCL12        1.0
VCL13        1.0

In [28]:
train_df.sort_values(by=["age"], ascending=False).head(20)

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness,country_idx
9273,9273,5.0,3.0,1.0,5.0,4.0,3.0,3.0,4.0,5.0,...,38822,1.0,2.0,3.0,2.0,1.0,3.0,2.0,1,42
6725,6725,1.0,1.0,3.0,3.0,5.0,3.0,5.0,1.0,5.0,...,722,2.0,2.0,1.0,1.0,2.0,3.0,2.0,0,42
4976,4976,4.0,5.0,4.0,3.0,4.0,4.0,4.0,4.0,5.0,...,545,1.0,2.0,1.0,1.0,2.0,4.0,2.0,0,90
14135,14135,4.0,5.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0,...,336,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1,73
4002,4002,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,...,123,1.0,2.0,4.0,2.0,1.0,3.0,2.0,0,124
1739,1739,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,...,123,1.0,2.0,4.0,2.0,1.0,3.0,2.0,0,124
1019,1019,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,100,3.0,10.0,5.0,1.0,1.0,2919.0,1.0,0,42
12092,12092,5.0,5.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,...,99,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1,42
6526,6526,1.0,5.0,,5.0,5.0,5.0,4.0,3.0,5.0,...,88,1.0,4.0,1.0,2.0,2.0,3.0,2.0,1,42
12574,12574,1.0,1.0,5.0,1.0,5.0,5.0,5.0,1.0,5.0,...,81,1.0,6.0,1.0,2.0,2.0,1.0,2.0,0,90


In [29]:
train_df.sort_values(by=["familysize"], ascending=False).head(20)

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness,country_idx
1019,1019,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,100,3.0,10.0,5.0,1.0,1.0,2919.0,1.0,0,42
1287,1287,5.0,4.0,5.0,3.0,3.0,4.0,4.0,5.0,2.0,...,15,1.0,7.0,2.0,2.0,1.0,39.0,2.0,1,125
9817,9817,5.0,4.0,5.0,3.0,3.0,4.0,4.0,5.0,2.0,...,15,1.0,7.0,2.0,2.0,1.0,39.0,2.0,1,125
474,474,5.0,5.0,5.0,5.0,4.0,4.0,4.0,5.0,5.0,...,23,1.0,2.0,2.0,1.0,1.0,23.0,2.0,1,42
11274,11274,5.0,5.0,5.0,5.0,4.0,4.0,4.0,5.0,5.0,...,23,1.0,2.0,2.0,1.0,1.0,23.0,2.0,1,42
5646,5646,4.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,...,13,1.0,12.0,5.0,1.0,2.0,19.0,2.0,1,16
6603,6603,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,4.0,...,32,1.0,7.0,3.0,2.0,1.0,17.0,2.0,0,42
6061,6061,3.0,5.0,1.0,3.0,4.0,3.0,,2.0,3.0,...,65,1.0,7.0,1.0,2.0,2.0,16.0,2.0,0,42
8552,8552,3.0,5.0,4.0,3.0,4.0,5.0,4.0,4.0,4.0,...,50,1.0,5.0,1.0,1.0,2.0,14.0,2.0,0,111
4696,4696,5.0,3.0,4.0,5.0,5.0,5.0,4.0,1.0,5.0,...,26,1.0,12.0,5.0,2.0,1.0,14.0,2.0,1,124
