# Ensemble

## Requirements

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split

from xgboost import XGBClassifier

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('mode.chained_assignment',  None)

In [2]:
inputs = 'inputs'
outputs = 'outputs'

train = pd.read_csv(inputs + '/train.csv')
target = pd.read_csv(inputs + '/test.csv')
submission = pd.read_csv(outputs + '/sample_submission.csv')

## Data Preprocessing

In [3]:
X_train = train.copy().iloc[:, :-1]
y_train = train.copy()['nerdiness']


def onehot_encoder(dataframe, target, encoder=None):
    if encoder == None:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(dataframe[[target]])
    ohe_df = encoder.transform(dataframe[[target]])
    tag_function = lambda x: f'{target}_{x}'
    ohe_df = pd.DataFrame(ohe_df, columns=list(map(tag_function, encoder.categories_[0])))
    results = pd.concat([dataframe, ohe_df], axis=1)
    
    return results, encoder


def preprocess(x, copy=True, encoders=None, scaler=None):
    if copy:
        x = x.copy()

    # cast
    int_list = [
        'VCL1', 'VCL2', 'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8',
        'VCL9', 'VCL10', 'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16',
        'urban', 'age'
    ]
    for col in int_list:
        x[col] = x[col].astype(float)

    # isnull (missing values)
    null_list = [
        'Q1', 'Q2', 'Q3', 'Q4', 'Q5','Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13',
        'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
        'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10',
        'education', 'gender', 'familysize']
    for col in null_list:
        x[f'{col}_isnull'] = np.where(pd.isnull(x[col]), 1.0, 0.0)

    # mean (missing values)
    mean_list = [
        'Q1', 'Q2', 'Q3', 'Q4', 'Q5','Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13',
        'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
        'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10',
        'familysize']
    for col in mean_list:
        x[col][pd.isnull(x[col])] = x[col].mean()

    # value (missing values)
    value_dict = {
        'country': 'None',
        'education': 1.5,
        'gender': 0,
        'engnat': 0,
        'hand': 1,
        'religion': 0,
        'orientation': 0,
        'voted': 0,
        'married': 0,
        'ASD': 0}
    for col in value_dict:
        x[col][pd.isnull(x[col])] = value_dict[col]

    # value (0 values)
    value0_dict = {
        'urban': 1.5
    }
    for col in value0_dict:
        x[f'{col}_isnull'] = np.where(x[col] == 0, 1.0, 0.0)
        x[col][x[col] == 0] = value0_dict[col]

    # OneHotEncoder
    ohe_list = [
        'country', 'gender', 'engnat', 'hand', 'religion',
        'orientation', 'voted', 'married', 'ASD']
    if encoders == None:
        encoders = dict()
    for col in ohe_list:
        if col in encoders:
            x, encoders[col] = onehot_encoder(x, col, encoder=encoders[col])
        else:
            x, encoders[col] = onehot_encoder(x, col)

    # log (outlier)
    log_list = [
        'introelapse', 'testelapse', 'surveyelapse'
    ]
    for col in log_list:
        x[f'log_{col}'] = np.log(x[col] + 1e-08)

    # min-max cut (outlier)
    outlier_dict = {
        'age': {'min': 1, 'max': 100},
        'log_introelapse': {'min': 0, 'max': 12},
        'log_testelapse': {'min': 3, 'max': 8},
        'log_surveyelapse': {'min': 0, 'max': 10},
        'familysize': {'min': 1, 'max': 40}}
    for col in outlier_dict:
        x[col][x[col] > outlier_dict[col]['max']] = outlier_dict[col]['max']
        x[col][x[col] < outlier_dict[col]['min']] = outlier_dict[col]['min']

    # drop
    drop_list = [
        'index', 
        'country_AGO', 'country_ALA', 'country_ARM', 'country_AZE',
        'country_BHS', 'country_BLR', 'country_BRB', 'country_BRN',
        'country_BWA', 'country_DOM', 'country_ETH', 'country_FRO',
        'country_GRL', 'country_GTM', 'country_GUF', 'country_GUY',
        'country_IRQ', 'country_KAZ', 'country_KHM', 'country_LBY',
        'country_LUX', 'country_MAC', 'country_MDV', 'country_MNP',
        'country_MOZ', 'country_MUS', 'country_MWI', 'country_NAM',
        'country_NPL', 'country_OMN', 'country_PAN', 'country_SDN',
        'country_SSD', 'country_TUN', 'country_UGA', 'country_VGB',
        'country_VIR', 'country_FSM', 'country_GEO', 'country_PNG',
        'country_RWA', 'country_SYR', 'country_LAO', 'country_MNG',
        'country_CUW', 'country_MLT', 'country_BHR', 'country_MDG'
    ] + log_list + ohe_list
    for col in drop_list:
        x = x.drop(columns=col)

    if scaler == None:
        scaler = MinMaxScaler()
        scaler.fit(x)
    x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=list(x.index.values))

    return x, encoders, scaler

X_train, encoders, scaler = preprocess(X_train, copy=False)
X_target, _, _ = preprocess(target, encoders=encoders, scaler=scaler)

## Model

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1)

### RandomForestClassifier

In [6]:
rfc = RandomForestClassifier()

"""
n_estimators: int, default=100
criterion: {'gini', 'entropy', 'log_loss'}, default='gini'
max_depth: int, default=None
min_samples_split: int or float, default=2
min_samples_leaf: int or float, default=1
min_weight_fraction_leaf: float, default=0.0
max_features: {'sqrt', 'log2', None}, int or float, default='sqrt'
max_leaf_nodes: int, default=None
min_impurity_decrease: float, default=0.0
bootstrap: bool, default=False
oob_score: bool, default=False
n_jobs: int, default=None
class_weight: {'balanced', 'balanced_subsample'}, dict or list of dicts, default=None
"""
grid_parameters = {
    'n_estimators': [1024, 2048, 4096],
    'criterion': ['entropy'],
    'max_depth': [200],
    'min_samples_split': [3],
    'min_samples_leaf': [1],
    'min_weight_fraction_leaf': [0.0],
    'max_features': ['log2'],
    'bootstrap': [False],
    'n_jobs': [4],
    'class_weight': [None],
}

grid_rfc = GridSearchCV(rfc, param_grid=grid_parameters, scoring='roc_auc', cv=5, refit=False)
grid_rfc.fit(X_train, y_train)
pd.DataFrame(grid_rfc.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_class_weight,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_min_weight_fraction_leaf,param_n_estimators,param_n_jobs,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,31.602699,1.167936,5.0721,1.942337,False,,entropy,200,log2,1,3,0.0,4096,4,"{'bootstrap': False, 'class_weight': None, 'cr...",0.877757,0.86963,0.866255,0.870623,0.885772,0.874008,0.006974,1
0,8.4877,1.073181,0.3264,0.0002,False,,entropy,200,log2,1,3,0.0,1024,4,"{'bootstrap': False, 'class_weight': None, 'cr...",0.877681,0.867912,0.865935,0.870162,0.885975,0.873533,0.007385,2
1,15.4056,0.388528,0.738299,0.0434,False,,entropy,200,log2,1,3,0.0,2048,4,"{'bootstrap': False, 'class_weight': None, 'cr...",0.877054,0.868837,0.866043,0.869887,0.885663,0.873497,0.007085,3


### ExtraTreesClassifier

In [10]:
etc = ExtraTreesClassifier()

"""
n_estimators: int, default=100
criterion: {'gini', 'entropy', 'log_loss'}, default='gini'
max_depth: int, default=None
min_samples_split: int or float, default=2
min_samples_leaf: int or float, default=1
min_weight_fraction_leaf: float, default=0.0
max_features: {'sqrt', 'log2', None}, int or float, default='sqrt'
max_leaf_nodes: int, default=None
min_impurity_decrease: float, default=0.0
bootstrap: bool, default=False
oob_score: bool, default=False
n_jobs: int, default=None
class_weight: {'balanced', 'balanced_subsample'}, dict or list of dicts, default=None
"""
grid_parameters = {
    'n_estimators': [512, 1024, 2048, 4096],
    'criterion': ['entropy'],
    'max_depth': [200],
    'min_samples_split': [3],
    'min_samples_leaf': [1],
    'min_weight_fraction_leaf': [0.0],
    'max_features': ['sqrt'],
    'bootstrap': [False],
    'n_jobs': [4],
    'class_weight': ['balanced_subsample'],
}

grid_etc = GridSearchCV(rfc, param_grid=grid_parameters, scoring='roc_auc', cv=5, refit=False)
grid_etc.fit(X_train, y_train)
pd.DataFrame(grid_etc.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_class_weight,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_min_weight_fraction_leaf,param_n_estimators,param_n_jobs,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,8.499795,0.033387,0.455917,0.043411,False,balanced_subsample,entropy,200,sqrt,1,3,0.0,1024,4,"{'bootstrap': False, 'class_weight': 'balanced...",0.872115,0.861581,0.857674,0.862496,0.883306,0.867434,0.009249,1
2,21.648301,1.153732,1.231941,0.085491,False,balanced_subsample,entropy,200,sqrt,1,3,0.0,2048,4,"{'bootstrap': False, 'class_weight': 'balanced...",0.872435,0.860788,0.858016,0.862003,0.883221,0.867293,0.009343,2
3,53.244721,13.988981,2.63911,1.016552,False,balanced_subsample,entropy,200,sqrt,1,3,0.0,4096,4,"{'bootstrap': False, 'class_weight': 'balanced...",0.872189,0.860917,0.857696,0.862044,0.882377,0.867044,0.009072,3
0,4.888373,0.525822,0.262488,0.054702,False,balanced_subsample,entropy,200,sqrt,1,3,0.0,512,4,"{'bootstrap': False, 'class_weight': 'balanced...",0.871262,0.861932,0.857103,0.861021,0.883056,0.866875,0.00933,4


### XGBoostClassifier

In [27]:
xgb = XGBClassifier(
    tree_method='gpu_hist',
    gpu_id=0
)

"""
n_estimators: [1,∞], default=1
booster: {'gbtree', 'gblinear', 'dart'}, default='gbtree'
learning_rate: [0,1], default=0.3
min_split_loss: [0,∞], default=0
max_depth: [0,∞], default=6
min_child_weight: [0,∞], default=1
max_delta_step: [0,∞], default=0
subsample: (0,1], default=1
sampling_method: {'uniform', 'gradient_based'}, default=uniform
colsample_bytree: (0, 1], default=1
colsample_bylevel: (0, 1], default=1
colsample_bynode: (0, 1], default=1
sketch_eps: (0, 1), default=0.03
scale_pos_weight: default=1
updater: {'grow_colmaker', 'grow_histmaker', 'grow_local_histmaker', 'grow_quantile_histmaker',
          'grow_gpu_hist', 'sync', 'refresh', 'prune'}
refresh_leaf: {0, 1}, default=1
grow_policy: {'depthwise', 'lossguide'}, default='depthwise'
 - Only used if tree_method is set to hist, approx or gpu_hist.
max_leaves: default=0
max_bin: default=256
num_parallel_tree: default=1
 - Only used if tree_method is set to hist, approx or gpu_hist.
objective: {'reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror',
            'binary:logistic', 'binary:logitraw', 'binary:hinge'} default=reg:squarederror
 - Only used if tree_method is set to hist, approx or gpu_hist.

 * booster=dart
sample_type: {'uniform', 'weighted'}, default='uniform'
normalize_type: {'tree', 'forest'}, default='tree'
rate_drop: [0.0, 1.0], default=0.0
one_drop: {0, 1}, default=0
skip_drop: [0.0, 1.0], default=0.0

 * booster=gblinear
reg_lambda: default=1
reg_alpha: default=0
updater: {'shotgun', 'coord_descent'}, default='shotgun'
feature_selector: {'cyclic', 'shuffle', 'random', 'greedy', 'thrifty'}, default='cyclic'
top_k: default=0
 - Only used if feature_selector is 'greedy' or 'thrifty'

 * objective=reg:pseudohubererror
huber_slope: default=1.0
"""
grid_parameters = {
    'n_estimators': [16, 32],
    'booster': ['gbtree'],
    'learning_rate': [0.1],
    'min_split_loss': [0],
    'max_depth': [20, 21, 22],
    'min_child_weight': [1],
    'max_delta_step': [0],
    'subsample': [0.7],
    'sampling_method': ['uniform'],
    'colsample_bytree': [0.5],
    'colsample_bylevel': [1],
    'colsample_bynode': [1],
    'sketch_eps': [0.03],
    'scale_pos_weight': [1],
    'refresh_leaf': [1],
    'grow_policy': ['depthwise'],
    'max_leaves': [0, 2048, 4096],
    'max_bin': [256],
    'num_parallel_tree': [16],
    'objective': ['reg:squarederror'],
    # 'sample_type': ['uniform'],
    # 'normalize_type': ['tree'],
    # 'rate_drop': [0.05],
    # 'one_drop': [0],
    # 'skip_drop': [0.05],
    # 'updater': ['grow_gpu_hist'],
    # 'feature_selector': ['cyclic'],
    # 'reg_lambda': [1],
    # 'reg_alpha': [0],
    # 'top_k': [0],
    # 'huber_slope': [1.0]
}

grid_xgb = GridSearchCV(xgb, param_grid=grid_parameters, scoring='roc_auc', cv=5, refit=False)
grid_xgb.fit(X_train, y_train)
pd.DataFrame(grid_xgb.cv_results_).sort_values(by='rank_test_score')