# Ensemble

## Requirements

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split

from xgboost import XGBClassifier

# Notebook-wide pandas settings: show up to 1000 rows/columns when a frame is
# displayed, and silence the chained-assignment (SettingWithCopy) warning.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('mode.chained_assignment', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Directory names, relative to the notebook's working directory.
inputs = 'inputs'
outputs = 'outputs'

# Labelled training rows, unlabelled rows to score, and the submission template
# (the template is read from the outputs directory).
train = pd.read_csv(f'{inputs}/train.csv')
target = pd.read_csv(f'{inputs}/test.csv')
submission = pd.read_csv(f'{outputs}/sample_submission.csv')

## Data Preprocessing

In [4]:
# Separate the label from the features by name rather than by position, so the
# split does not depend on 'nerdiness' being the last column of train.csv.
# DataFrame.drop returns a new frame, so `train` itself is left untouched.
X_train = train.drop(columns='nerdiness')
y_train = train['nerdiness'].copy()


def onehot_encoder(dataframe, target, encoder=None):
    """One-hot encode one column and append the indicator columns to the frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column named ``target``.
    target : str
        Name of the column to encode. The original column is kept.
    encoder : fitted encoder or None
        Object exposing ``transform`` and ``categories_``. When None, a new
        ``OneHotEncoder`` (dense output, unknown categories ignored) is fitted
        on ``dataframe[[target]]``.

    Returns
    -------
    (pandas.DataFrame, encoder)
        ``dataframe`` with one extra ``f'{target}_{category}'`` column per
        encoder category, and the encoder that produced them.
    """
    if encoder is None:
        try:
            # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older releases only accept the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(dataframe[[target]])

    encoded = encoder.transform(dataframe[[target]])
    column_names = [f'{target}_{category}' for category in encoder.categories_[0]]
    # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign (and add NaN rows) for any frame whose
    # index is not exactly 0..n-1.
    ohe_df = pd.DataFrame(encoded, columns=column_names, index=dataframe.index)
    results = pd.concat([dataframe, ohe_df], axis=1)

    return results, encoder


def preprocess(x, copy=True, encoders=None, scaler=None):
    """Turn a raw survey frame into a fully numeric, min-max scaled feature frame.

    Steps, in order: cast the VCL/urban/age columns to float, add
    ``<col>_isnull`` indicator columns, impute missing values (column mean or a
    fixed constant), replace ``urban == 0`` with 1.5, one-hot encode the
    categorical columns, add log-transformed elapsed-time columns, clip
    outliers, drop unused columns and scale every remaining column.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw frame containing every column referenced below.
    copy : bool, default True
        Work on a copy of ``x``. When False, the columns written before the
        one-hot step are written into the caller's frame.
    encoders : dict or None
        Maps column name -> fitted one-hot encoder. Columns without an entry
        get an encoder fitted on ``x``, which is stored in the dict.
    scaler : fitted scaler or None
        When None, a ``MinMaxScaler`` is fitted on the processed frame.

    Returns
    -------
    (pandas.DataFrame, dict, scaler)
        The scaled frame, the encoder dict, and the scaler, so a second frame
        can be transformed with the same fitted objects.
    """
    if copy:
        x = x.copy()

    question_cols = [f'Q{i}' for i in range(1, 27)]
    tipi_cols = [f'TIPI{i}' for i in range(1, 11)]

    # cast
    int_list = [f'VCL{i}' for i in range(1, 17)] + ['urban', 'age']
    for col in int_list:
        x[col] = x[col].astype(float)

    # isnull (missing values): 1.0 where the raw value is missing, else 0.0
    null_list = question_cols + tipi_cols + ['education', 'gender', 'familysize']
    for col in null_list:
        x[f'{col}_isnull'] = x[col].isna().astype(float)

    # mean (missing values)
    # The mean comes from the frame being processed, so a frame passed together
    # with fitted encoders/scaler is still imputed with its own column means.
    mean_list = question_cols + tipi_cols + ['familysize']
    for col in mean_list:
        # Whole-column assignment instead of `x[col][mask] = ...`: chained
        # assignment does not write through under pandas Copy-on-Write.
        x[col] = x[col].fillna(x[col].mean())

    # value (missing values)
    value_dict = {
        'country': 'None',
        'education': 1.5,
        'gender': 0,
        'engnat': 0,
        'hand': 1,
        'religion': 0,
        'orientation': 0,
        'voted': 0,
        'married': 0,
        'ASD': 0}
    for col, fill_value in value_dict.items():
        x[col] = x[col].fillna(fill_value)

    # value (0 values): flag zeros, then replace them
    value0_dict = {
        'urban': 1.5
    }
    for col, replacement in value0_dict.items():
        is_zero = x[col] == 0
        x[f'{col}_isnull'] = is_zero.astype(float)
        x[col] = x[col].mask(is_zero, replacement)

    # OneHotEncoder
    ohe_list = [
        'country', 'gender', 'engnat', 'hand', 'religion',
        'orientation', 'voted', 'married', 'ASD']
    if encoders is None:
        encoders = dict()
    for col in ohe_list:
        # A missing entry yields None, which makes onehot_encoder fit a new encoder.
        x, encoders[col] = onehot_encoder(x, col, encoder=encoders.get(col))

    # log (outlier)
    log_list = [
        'introelapse', 'testelapse', 'surveyelapse'
    ]
    for col in log_list:
        # The small offset keeps log() finite for zero durations.
        x[f'log_{col}'] = np.log(x[col] + 1e-08)

    # min-max cut (outlier)
    outlier_dict = {
        'age': {'min': 1, 'max': 100},
        'log_introelapse': {'min': 0, 'max': 12},
        'log_testelapse': {'min': 3, 'max': 8},
        'log_surveyelapse': {'min': 0, 'max': 10},
        'familysize': {'min': 1, 'max': 40}}
    for col, bounds in outlier_dict.items():
        x[col] = x[col].clip(lower=bounds['min'], upper=bounds['max'])

    # drop: row id, selected country indicators, and the raw columns that were
    # replaced by log / one-hot versions
    drop_list = [
        'index', 
        'country_AGO', 'country_ALA', 'country_ARM', 'country_AZE',
        'country_BHS', 'country_BLR', 'country_BRB', 'country_BRN',
        'country_BWA', 'country_DOM', 'country_ETH', 'country_FRO',
        'country_GRL', 'country_GTM', 'country_GUF', 'country_GUY',
        'country_IRQ', 'country_KAZ', 'country_KHM', 'country_LBY',
        'country_LUX', 'country_MAC', 'country_MDV', 'country_MNP',
        'country_MOZ', 'country_MUS', 'country_MWI', 'country_NAM',
        'country_NPL', 'country_OMN', 'country_PAN', 'country_SDN',
        'country_SSD', 'country_TUN', 'country_UGA', 'country_VGB',
        'country_VIR', 'country_FSM', 'country_GEO', 'country_PNG',
        'country_RWA', 'country_SYR', 'country_LAO', 'country_MNG',
        'country_CUW', 'country_MLT', 'country_BHR', 'country_MDG'
    ] + log_list + ohe_list
    # One drop call; like the per-column loop it raises KeyError if a listed
    # column is absent.
    x = x.drop(columns=drop_list)

    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(x)
    x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)

    return x, encoders, scaler

# Fit the encoders and the scaler on the training features (modifying X_train in
# place), then reuse the same fitted objects to transform the frame to score.
X_train, encoders, scaler = preprocess(X_train, copy=False)
X_target, _, _ = preprocess(target, encoders=encoders, scaler=scaler)

## Model

In [5]:
# Hold out 10% of the labelled rows. The fixed seed makes the split identical on
# every run. NOTE: this rebinds X_train / y_train, so re-running this cell alone
# shrinks the training set again; re-run from the preprocessing cell instead.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

### RandomForestClassifier

In [6]:
# Random forest hyperparameters.
n_estimators = 2048
criterion = 'entropy'
max_depth = 200
min_samples_split = 3
min_samples_leaf = 1
min_weight_fraction_leaf = 0.0
max_features = 'log2'
bootstrap = False
class_weight = None

rfc = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    min_weight_fraction_leaf=min_weight_fraction_leaf,
    max_features=max_features,
    bootstrap=bootstrap,
    class_weight=class_weight,
    n_jobs=-1,
    # Fixed seed: the per-split feature sampling is random, so without it every
    # fit produces a different forest.
    random_state=42
)

0.8957459642672488

### ExtraTreesClassifier

In [7]:
# Extra-trees hyperparameters.
n_estimators = 2048
criterion = 'entropy'
max_depth = 200
min_samples_split = 3
min_samples_leaf = 1
min_weight_fraction_leaf = 0.0
max_features = 'sqrt'
bootstrap = False
class_weight = 'balanced_subsample'

etc = ExtraTreesClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    min_weight_fraction_leaf=min_weight_fraction_leaf,
    max_features=max_features,
    bootstrap=bootstrap,
    class_weight=class_weight,
    n_jobs=-1,
    # Fixed seed: split thresholds and feature subsets are drawn at random, so
    # without it every fit produces a different ensemble.
    random_state=42
)

0.895223545125524

### XGBClassifier

In [8]:
# XGBoost hyperparameters.
# NOTE(review): sketch_eps, gpu_id and tree_method='gpu_hist' are deprecated or
# removed in recent XGBoost releases — confirm against the installed version.
n_estimators = 16
booster = 'gbtree'
learning_rate = 0.1
min_split_loss = 0
max_depth = 20
min_child_weight = 1
max_delta_step = 0
subsample = 0.7
sampling_method = 'uniform'
colsample_bytree = 1
colsample_bylevel = 1
colsample_bynode = 1
sketch_eps = 0.03
scale_pos_weight = 1
refresh_leaf = 1
grow_policy = 'depthwise'
max_leaves = 0
max_bin = 256
num_parallel_tree = 16
# The target is binary (predict_proba yields two columns downstream), so train
# with the logistic objective. 'reg:squarederror' fits a regression whose
# outputs are not constrained to [0, 1].
objective = 'binary:logistic'
tree_method = 'gpu_hist'
gpu_id = 0

xgb = XGBClassifier(
    n_estimators=n_estimators,
    booster=booster,
    learning_rate=learning_rate,
    min_split_loss=min_split_loss,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    max_delta_step=max_delta_step,
    subsample=subsample,
    sampling_method=sampling_method,
    colsample_bytree=colsample_bytree,
    colsample_bylevel=colsample_bylevel,
    colsample_bynode=colsample_bynode,
    sketch_eps=sketch_eps,
    scale_pos_weight=scale_pos_weight,
    refresh_leaf=refresh_leaf,
    grow_policy=grow_policy,
    max_leaves=max_leaves,
    max_bin=max_bin,
    num_parallel_tree=num_parallel_tree,
    objective=objective,
    tree_method=tree_method,
    gpu_id=gpu_id,
    # Fixed seed: subsample=0.7 draws a random row sample for every tree.
    random_state=42
)

### Ensemble

In [9]:
# Base learners whose out-of-fold predictions feed the meta-learner.
# Only the two tree ensembles are stacked; `xgb` is not included.
estimators = [
    ('rtc', rfc),
    ('etc', etc),
]

# Logistic regression combines the base learners' predictions.
clf = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)

clf.fit(X_train, y_train)

StackingClassifier(estimators=[('rtc',
                                RandomForestClassifier(bootstrap=False,
                                                       criterion='entropy',
                                                       max_depth=200,
                                                       max_features='log2',
                                                       min_samples_split=3,
                                                       n_estimators=2048,
                                                       n_jobs=-1)),
                               ('etc',
                                ExtraTreesClassifier(class_weight='balanced_subsample',
                                                     criterion='entropy',
                                                     max_depth=200,
                                                     max_features='sqrt',
                                                     min_samples_split=3,
                                

In [10]:
# Per-class probabilities for every row of the preprocessed scoring frame
# (one column per class).
predictions = clf.predict_proba(X_target)

In [13]:
# Display the probability matrix as a quick sanity check.
predictions

array([[0.97195323, 0.02804677],
       [0.09136011, 0.90863989],
       [0.09477749, 0.90522251],
       ...,
       [0.04720648, 0.95279352],
       [0.96097567, 0.03902433],
       [0.28483593, 0.71516407]])

In [14]:
# Submit the second probability column.
# NOTE(review): presumably this is the positive class — confirm against clf.classes_.
submission['nerdiness'] = predictions[:, 1]

In [15]:
# Write the submission file without the pandas index column.
submission.to_csv(f'{outputs}/ensemble_submission.csv', index=False)