# NN

## Requirements

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from keras import Input
from keras.layers import Dense
from keras import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
inputs = 'inputs'
outputs = 'outputs'

train = pd.read_csv(inputs + '/train.csv')
test = pd.read_csv(inputs + '/test.csv')
submission = pd.read_csv(outputs + '/sample_submission.csv')

## Data Preprocessing

### Processing

In [4]:
train_x = train.copy().iloc[:, :-1]
train_y = train.copy()['nerdiness']


def onehot_encoder(dataframe, target, encoder=None):
    if encoder == None:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(dataframe[[target]])
    ohe_df = encoder.transform(dataframe[[target]])
    tag_function = lambda x: f'{target}_{x}'
    ohe_df = pd.DataFrame(ohe_df, columns=list(map(tag_function, encoder.categories_[0])))
    results = pd.concat([dataframe, ohe_df], axis=1)
    
    return results, encoder


def preprocess(x, copy=True, encoders=None, scaler=None):
    if copy:
        x = x.copy()

    # cast
    int_list = [
        'VCL1', 'VCL2', 'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8',
        'VCL9', 'VCL10', 'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16',
        'urban', 'age'
    ]
    for col in int_list:
        x[col] = x[col].astype(float)

    # isnull (missing values)
    null_list = [
        'Q1', 'Q2', 'Q3', 'Q4', 'Q5','Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13',
        'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
        'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10',
        'education', 'gender', 'familysize']
    for col in null_list:
        x[f'{col}_isnull'] = np.where(pd.isnull(x[col]), 1.0, 0.0)

    # mean (missing values)
    mean_list = [
        'Q1', 'Q2', 'Q3', 'Q4', 'Q5','Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13',
        'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
        'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10',
        'familysize']
    for col in mean_list:
        x[col][pd.isnull(x[col])] = x[col].mean()

    # value (missing values)
    value_dict = {
        'country': 'None',
        'education': 1.5,
        'gender': 0,
        'engnat': 0,
        'hand': 1,
        'religion': 0,
        'orientation': 0,
        'voted': 0,
        'married': 0,
        'ASD': 0}
    for col in value_dict:
        x[col][pd.isnull(x[col])] = value_dict[col]

    # value (0 values)
    value0_dict = {
        'urban': 1.5
    }
    for col in value0_dict:
        x[f'{col}_isnull'] = np.where(x[col] == 0, 1.0, 0.0)
        x[col][x[col] == 0] = value0_dict[col]

    # OneHotEncoder
    ohe_list = [
        'country', 'gender', 'engnat', 'hand', 'religion',
        'orientation', 'voted', 'married', 'ASD']
    if encoders == None:
        encoders = dict()
    for col in ohe_list:
        if col in encoders:
            x, encoders[col] = onehot_encoder(x, col, encoder=encoders[col])
        else:
            x, encoders[col] = onehot_encoder(x, col)

    # log (outlier)
    log_list = [
        'introelapse', 'testelapse', 'surveyelapse'
    ]
    for col in log_list:
        x[f'log_{col}'] = np.log(x[col])

    # min-max cut (outlier)
    outlier_dict = {
        'age': {'min': 1, 'max': 100},
        'log_introelapse': {'min': 0, 'max': 12},
        'log_testelapse': {'min': 3, 'max': 8},
        'log_surveyelapse': {'min': 0, 'max': 10},
        'familysize': {'min': 1, 'max': 40}}
    for col in outlier_dict:
        x[col][x[col] > outlier_dict[col]['max']] = outlier_dict[col]['max']
        x[col][x[col] < outlier_dict[col]['min']] = outlier_dict[col]['min']

    # drop
    drop_list = [
        'index'
    ] + log_list + ohe_list
    for col in drop_list:
        x = x.drop(columns=col)

    if scaler == None:
        scaler = MinMaxScaler()
        scaler.fit(x)
    x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=list(x.index.values))

    return x, encoders, scaler

train_x, encoders, scaler = preprocess(train_x, copy=False)
test_x, _, _ = preprocess(test, encoders=encoders, scaler=scaler)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

### Informations

In [9]:
print(train_x.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 14999
Columns: 276 entries, Q1 to log_surveyelapse
dtypes: float64(276)
memory usage: 31.7 MB
None


In [10]:
print(train_x.describe())

                 Q1            Q2            Q3            Q4            Q5  \
count  15000.000000  15000.000000  15000.000000  15000.000000  15000.000000   
mean       0.743215      0.766426      0.805702      0.698875      0.714126   
std        0.273604      0.291747      0.255407      0.304262      0.298527   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.500000      0.750000      0.750000      0.500000      0.500000   
50%        0.750000      0.750000      1.000000      0.750000      0.750000   
75%        1.000000      1.000000      1.000000      1.000000      1.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

                 Q6            Q7            Q8            Q9           Q10  \
count  15000.000000  15000.000000  15000.000000  15000.000000  15000.000000   
mean       0.669091      0.795815      0.719753      0.728854      0.777649   
std        0.292278      0.264267      0.324999    

In [11]:
print(test_x.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35452 entries, 0 to 35451
Columns: 276 entries, Q1 to log_surveyelapse
dtypes: float64(276)
memory usage: 74.9 MB
None


In [12]:
print(test_x.describe())

                 Q1            Q2            Q3            Q4            Q5  \
count  35452.000000  35452.000000  35452.000000  35452.000000  35452.000000   
mean       0.744651      0.768347      0.808667      0.701814      0.718363   
std        0.272120      0.292518      0.253623      0.302281      0.295008   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.500000      0.750000      0.750000      0.500000      0.500000   
50%        0.750000      0.750000      1.000000      0.750000      0.750000   
75%        1.000000      1.000000      1.000000      1.000000      1.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

                 Q6            Q7            Q8            Q9           Q10  \
count  35452.000000  35452.000000  35452.000000  35452.000000  35452.000000   
mean       0.674781      0.798180      0.722011      0.734893      0.779795   
std        0.292092      0.264636      0.325888    

## Model

In [5]:
train_x.shape

(15000, 276)

In [1]:
inputs = Input(shape=(None, train.shape[1]))
x = inputs
for unit in [1024, 512, 256, 128, 64, 32, 16]:
    x = Dense(1024, activation='relu')(x)
output = x = Dense(1, activation='softmax')
model = Model(inputs, output)
model.summary()

NameError: name 'Input' is not defined