# NN

## Requirements

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from keras import Input
from keras import Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import GaussianNoise
from keras.losses import BinaryCrossentropy
from keras.metrics import AUC
from keras.regularizers import L1L2
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

# Notebook-wide pandas configuration, set through the attribute-style options API.
# Show up to 1000 rows / columns when a frame is displayed.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Directory names, resolved relative to the notebook's working directory.
inputs = 'inputs'
outputs = 'outputs'

# Load the training table, the test table and the sample submission file.
train = pd.read_csv(f'{inputs}/train.csv')
test = pd.read_csv(f'{inputs}/test.csv')
submission = pd.read_csv(f'{outputs}/sample_submission.csv')

## Data Preprocessing

### Processing

In [3]:
# Separate the target from the features by column *name* rather than by
# position, so the features can never contain 'nerdiness' even if the column
# order of train.csv changes. Both results are new objects, so `train` itself
# stays untouched by the in-place preprocessing below.
train_y = train['nerdiness'].copy()
train_x = train.drop(columns='nerdiness')


def onehot_encoder(dataframe, target, encoder=None):
    """One-hot encode one column and append the indicator columns to the frame.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame that contains the column to encode. It is not modified.
    target : str
        Name of the column to encode.
    encoder : optional
        An already fitted encoder exposing ``transform`` and ``categories_``
        (e.g. a fitted ``OneHotEncoder``). When ``None`` a new dense
        ``OneHotEncoder(handle_unknown='ignore')`` is fitted on
        ``dataframe[[target]]``.

    Returns
    -------
    results : pd.DataFrame
        ``dataframe`` followed by one ``f'{target}_{category}'`` column per
        entry of ``encoder.categories_[0]``. The original ``target`` column is
        kept; dropping it is left to the caller.
    encoder
        The encoder that was used, so it can be re-applied to another frame.
    """
    if encoder is None:
        try:
            # scikit-learn >= 1.2 spells the dense-output flag `sparse_output`.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only know the original `sparse` name.
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(dataframe[[target]])

    encoded = encoder.transform(dataframe[[target]])
    column_names = [f'{target}_{category}' for category in encoder.categories_[0]]
    # Re-use the caller's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would produce extra rows and NaNs for any frame whose
    # index is not 0..n-1 (filtered frames, CV folds, ...).
    ohe_df = pd.DataFrame(encoded, columns=column_names, index=dataframe.index)
    results = pd.concat([dataframe, ohe_df], axis=1)

    return results, encoder


def preprocess(x, copy=True, encoders=None, scaler=None):
    """Turn a raw survey frame into a numeric, min-max scaled feature frame.

    Steps, in order: cast selected columns to float, add ``<col>_isnull``
    indicator columns, fill missing values (column mean or a fixed constant),
    replace the value 0 in ``urban``, one-hot encode the categorical columns,
    add log-transformed timing columns, clip outliers, drop unused columns and
    finally min-max scale every remaining column.

    Parameters
    ----------
    x : pd.DataFrame
        Raw frame. It must contain every column referenced below, including
        ``index``.
    copy : bool, default True
        When True the work is done on a copy. When False the caller's frame is
        modified in place by the steps that run before the first one-hot
        encoding.
    encoders : dict, optional
        Mapping ``column name -> fitted encoder`` as returned by an earlier
        call. Columns without an entry get a new encoder fitted on ``x``; the
        dict is updated in place.
    scaler : optional
        Fitted scaler exposing ``transform``. When ``None`` a ``MinMaxScaler``
        is fitted on the processed ``x``.

    Returns
    -------
    x : pd.DataFrame
        Processed frame with the same row index as the frame that reached the
        scaling step.
    encoders : dict
        The encoders used, keyed by column name.
    scaler
        The scaler used.

    Notes
    -----
    Mean imputation uses the mean of the frame being processed, so a test
    frame is imputed with its own column means rather than the training ones.
    All assignments replace whole columns (``x[col] = ...``) instead of using
    chained indexing (``x[col][mask] = ...``), which silently stops working
    under pandas Copy-on-Write.
    """
    if copy:
        x = x.copy()

    # Q1..Q26 and TIPI1..TIPI10, shared by the indicator and mean-fill steps.
    survey_cols = [f'Q{i}' for i in range(1, 27)] + [f'TIPI{i}' for i in range(1, 11)]

    # cast
    float_cols = [f'VCL{i}' for i in range(1, 17)] + ['urban', 'age']
    for col in float_cols:
        x[col] = x[col].astype(float)

    # isnull (missing values): 1.0 where the raw value is missing, else 0.0.
    # Must run before any filling, otherwise the information is lost.
    for col in survey_cols + ['education', 'gender', 'familysize']:
        x[f'{col}_isnull'] = x[col].isnull().astype(float)

    # mean (missing values)
    for col in survey_cols + ['familysize']:
        x[col] = x[col].fillna(x[col].mean())

    # value (missing values)
    # NOTE(review): the rationale for the individual constants (e.g. 1.5 for
    # 'education') is not visible here -- confirm against the data description.
    value_dict = {
        'country': 'None',
        'education': 1.5,
        'gender': 0,
        'engnat': 0,
        'hand': 1,
        'religion': 0,
        'orientation': 0,
        'voted': 0,
        'married': 0,
        'ASD': 0}
    for col, fill_value in value_dict.items():
        x[col] = x[col].fillna(fill_value)

    # value (0 values): a 0 is treated like a missing value -- flag it in
    # `<col>_isnull`, then overwrite it with the replacement.
    value0_dict = {
        'urban': 1.5
    }
    for col, replacement in value0_dict.items():
        is_zero = x[col] == 0
        x[f'{col}_isnull'] = is_zero.astype(float)
        x[col] = x[col].mask(is_zero, replacement)

    # OneHotEncoder: re-use a supplied encoder, otherwise fit a new one on x.
    ohe_list = [
        'country', 'gender', 'engnat', 'hand', 'religion',
        'orientation', 'voted', 'married', 'ASD']
    if encoders is None:
        encoders = dict()
    for col in ohe_list:
        x, encoders[col] = onehot_encoder(x, col, encoder=encoders.get(col))

    # log (outlier)
    # The 1e-08 offset keeps the log finite for 0; negative inputs still give
    # NaN together with a numpy RuntimeWarning.
    log_list = [
        'introelapse', 'testelapse', 'surveyelapse'
    ]
    for col in log_list:
        x[f'log_{col}'] = np.log(x[col] + 1e-08)

    # min-max cut (outlier): clamp each column into [min, max]; NaN stays NaN.
    outlier_dict = {
        'age': {'min': 1, 'max': 100},
        'log_introelapse': {'min': 0, 'max': 12},
        'log_testelapse': {'min': 3, 'max': 8},
        'log_surveyelapse': {'min': 0, 'max': 10},
        'familysize': {'min': 1, 'max': 40}}
    for col, bounds in outlier_dict.items():
        x[col] = x[col].clip(lower=bounds['min'], upper=bounds['max'])

    # drop: the row id, a hard-coded set of country indicator columns
    # (NOTE(review): presumably rare categories -- confirm), the raw timing
    # columns (their log versions are kept) and the raw categorical columns
    # (their one-hot versions are kept). A missing column raises KeyError.
    drop_list = [
        'index', 
        'country_AGO', 'country_ALA', 'country_ARM', 'country_AZE',
        'country_BHS', 'country_BLR', 'country_BRB', 'country_BRN',
        'country_BWA', 'country_DOM', 'country_ETH', 'country_FRO',
        'country_GRL', 'country_GTM', 'country_GUF', 'country_GUY',
        'country_IRQ', 'country_KAZ', 'country_KHM', 'country_LBY',
        'country_LUX', 'country_MAC', 'country_MDV', 'country_MNP',
        'country_MOZ', 'country_MUS', 'country_MWI', 'country_NAM',
        'country_NPL', 'country_OMN', 'country_PAN', 'country_SDN',
        'country_SSD', 'country_TUN', 'country_UGA', 'country_VGB',
        'country_VIR', 'country_FSM', 'country_GEO', 'country_PNG',
        'country_RWA', 'country_SYR', 'country_LAO', 'country_MNG',
        'country_CUW', 'country_MLT', 'country_BHR', 'country_MDG'
    ] + log_list + ohe_list
    # One call instead of one frame copy per dropped column.
    x = x.drop(columns=drop_list)

    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(x)
    # transform() returns a bare array; restore the column names and row index.
    x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)

    return x, encoders, scaler

# Fit the encoders and the scaler on the training features. copy=False lets
# preprocess work directly on train_x, which is rebound to the result anyway.
train_x, encoders, scaler = preprocess(x=train_x, copy=False)
# Apply the already fitted encoders and scaler to the test frame; the returned
# encoders and scaler are the same objects and are discarded.
test_x, _, _ = preprocess(x=test, encoders=encoders, scaler=scaler)

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Model

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Fixed seed: without it every forest is grown from a different random state,
# so scores and the selected parameters change from run to run.
RANDOM_STATE = 42
# Number of parallel workers per forest. This is a runtime setting, not a
# hyper-parameter, so it is set on the estimator instead of being searched.
N_JOBS = 8

model = RandomForestClassifier(n_jobs=N_JOBS, random_state=RANDOM_STATE)

# Exhaustive grid: 1728 candidates, each fitted 5 times (cv=5) -- expect a
# long run time.
grid_parameters = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    # A node can only be split if both children keep at least min_samples_leaf
    # samples, i.e. if it holds >= 2 * min_samples_leaf samples. With
    # min_samples_leaf >= 2 that already implies >= 4 samples, so searching
    # min_samples_split over [2, 4] fits every tree twice with identical
    # results; one value is enough.
    'min_samples_split': [2],
    'min_samples_leaf': [2, 4],
    'min_weight_fraction_leaf': [0.0, 1e-5],
    'max_features': [None, 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample', None],
}

# No `scoring` is given, so candidates are ranked by the estimator's default
# score.
grid_rfc = GridSearchCV(model, param_grid=grid_parameters, cv=5)
grid_rfc.fit(train_x, train_y)