# 【第1回_Beginner限定コンペ】銀行の顧客ターゲティング

顧客の属性情報などから定期預金キャンペーンの反応率を予測しよう。

https://signate.jp/competitions/292

Neural Network編

## ライブラリインポート

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import optuna
#from bayes_opt import BayesianOptimization

import tensorflow as tf

from tensorflow.keras import regularizers
from tensorflow.keras import losses
from tensorflow.keras import backend as K
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Activation, Dense, Dropout, Input, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
# Random Seed 固定

np.random.seed(22)
tf.random.set_seed(22)


## 関数

In [3]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
def df_stats(df):
    stats = []
    for col in df.columns:
        stats.append((col,
                      df[col].nunique(),
                      df[col].value_counts().index[0],
                      df[col].value_counts().values[0],
                      df[col].isnull().sum() * 100 / df.shape[0],
                      df[col].value_counts(normalize=True, dropna=False).values[0] * 100,
                      df[col].dtype))
    return pd.DataFrame(stats, columns=['カラム名', 'カラムごとのユニーク値数', '最も出現頻度の高い値', '最も出現頻度の高い値の出現回数', '欠損損値の割合', '最も多いカテゴリの割合', 'Type'])

## データ読み込み・前処理

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submit_df = pd.read_csv('submit_sample.csv', header=None)

# Pseudo Labeling
test_p1_df = pd.read_csv('test_p1.csv')

In [6]:
# Pseudo Labeling
train_df = pd.concat([train_df, test_p1_df])

In [7]:
df_list = [train_df, test_df]

for df in df_list:
    df['job'] = df['job'].map({'unknown': 1, 'technician': 2, 'blue-collar': 3, 'services': 4, 'entrepreneur': 5, 'admin.': 6, 'management': 7, 'housemaid': 8, 'self-employed': 9, 'unemployed': 10, 'retired': 11, 'student': 12})
    
    df['marital'] = df['marital'].map({'married': 2, 'divorced':1, 'single': 0})
    #df.drop(['marital'], axis=1, inplace=True)

    df['education'] = df['education'].map({'tertiary': 3, 'secondary': 2, 'primary': 1, 'unknown': 0})
    #df.drop(['education'], axis=1, inplace=True)

    #df['default'] = df['default'].map({'yes': 1, 'no': 0})
    df.drop(['default'], axis=1, inplace=True)

    df['housing'] = df['housing'].map({'yes': 1, 'no': 0})
    #df.drop(['housing'], axis=1, inplace=True)
    
    df['loan'] = df['loan'].map({'yes': 1, 'no': 0})
    #df.drop(['loan'], axis=1, inplace=True)

    df['contact'] = df['contact'].map({'telephone': 2, 'cellular': 1, 'unknown': 0})
    #df.drop(['contact'], axis=1, inplace=True)

    df['poutcome'] = df['poutcome'].map({'success': 3, 'unknown': 2, 'failure': 1, 'other': 0})
    #df['p_label_mean'] = np.log(df['poutcome'].map(p_label_mean))
    #df.drop(['poutcome'], axis=1, inplace=True)
    
    df['month'] = df['month'].map({'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12})
    #df.drop(['day', 'month'], axis=1, inplace=True)
    
    # Feb 30 とかあって、正確には変換できない
    # → データの Feb 30 を Mar 1 に変換した(他にも、 2/31, 6/31, 11/31)
    #df['dayofyear'] = df['month'] * 31 + df['day']
    df['datetime'] = pd.to_datetime('2012/' + df['month'].astype(str).str.pad(2,fillchar='0') + '/' + df['day'].astype(str).str.pad(2,fillchar='0'), format='%Y/%m/%d')
    df['dayofyear'] = df['datetime'].dt.dayofyear
    #df['dayofweek'] = df['datetime'].dt.dayofweek
    df.drop(['datetime'], axis=1, inplace=True)

    df['duration'] = np.log(df['duration'] + 1)

    #df['bpp'] = np.log((df['balance'] - df['balance'].min()) / (df['pdays'] + 2) + 1)
    #df['cdp'] = (df['campaign'] - df['previous']) * df['duration']
    
    df.drop(['pdays'], axis=1, inplace=True)
    df.drop(['balance'], axis=1, inplace=True)
    
    df.drop(['id'], axis=1, inplace=True)

In [8]:
y = train_df.pop('y')

In [9]:
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 5.17 MB
Memory usage after optimization is: 1.03 MB
Decreased by 80.0%
Memory usage of dataframe is 1.93 MB
Memory usage after optimization is: 0.28 MB
Decreased by 85.7%


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45150 entries, 0 to 18049
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        45150 non-null  int8   
 1   job        45150 non-null  int8   
 2   marital    45150 non-null  int8   
 3   education  45150 non-null  int8   
 4   housing    45150 non-null  int8   
 5   loan       45150 non-null  int8   
 6   contact    45150 non-null  int8   
 7   day        45150 non-null  int8   
 8   month      45150 non-null  int8   
 9   duration   45150 non-null  float16
 10  campaign   45150 non-null  int8   
 11  previous   45150 non-null  int8   
 12  poutcome   45150 non-null  int8   
 13  dayofyear  45150 non-null  int16  
dtypes: float16(1), int16(1), int8(12)
memory usage: 1.0 MB


In [11]:
train_df.describe()

Unnamed: 0,age,job,marital,education,housing,loan,contact,day,month,duration,campaign,previous,poutcome,dayofyear
count,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0,45150.0
mean,36.08567,5.139668,1.384563,2.043477,0.581107,0.127796,0.786401,16.699446,6.001617,inf,1.773533,0.087774,1.856633,168.562769
std,7.835323,2.662201,0.872828,0.723352,0.493383,0.333866,0.50108,8.577737,2.128473,0.7763672,0.947974,0.369888,0.465919,64.939227
min,20.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0
25%,31.0,3.0,0.0,2.0,0.0,0.0,0.0,8.0,5.0,4.804688,1.0,0.0,2.0,136.0
50%,33.0,5.0,2.0,2.0,1.0,0.0,1.0,17.0,5.0,5.070312,1.0,0.0,2.0,148.0
75%,37.0,7.0,2.0,2.0,1.0,0.0,1.0,26.0,7.0,5.847656,2.0,0.0,2.0,199.0
max,90.0,12.0,2.0,3.0,1.0,1.0,2.0,31.0,12.0,8.03125,5.0,3.0,3.0,336.0


In [12]:
df_stats(train_df)

Unnamed: 0,カラム名,カラムごとのユニーク値数,最も出現頻度の高い値,最も出現頻度の高い値の出現回数,欠損損値の割合,最も多いカテゴリの割合,Type
0,age,43,31.0,7532,0.0,16.682171,int8
1,job,12,3.0,10043,0.0,22.243632,int8
2,marital,3,2.0,29218,0.0,64.713178,int8
3,education,4,2.0,26754,0.0,59.255814,int8
4,housing,2,1.0,26237,0.0,58.110742,int8
5,loan,2,0.0,39380,0.0,87.220377,int8
6,contact,3,1.0,31754,0.0,70.330011,int8
7,day,30,27.0,6903,0.0,15.289037,int8
8,month,12,5.0,18818,0.0,41.678848,int8
9,duration,156,5.070312,9509,0.0,21.060908,float16


## 学習関数

In [13]:
class Objective():
    def __init__(self):
        self.best_boosters = None
        self._boosters = None

    def __call__(self, trial):
        dim_age = trial.suggest_int('dim_age', 1, 90)
        dim_job = trial.suggest_int('dim_job', 1, 12)
        dim_day = trial.suggest_int('dim_day', 1, 31)
        dim_duration = trial.suggest_int('dim_duration', 1, 10)
        dim_dayofyear = trial.suggest_int('dim_dayofyear', 10, 200)

        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
        val_aucs = []
        self._boosters = []
        
        for train_index, val_index in kf.split(train_df, y):
            # 学習データ分割
            train_data = train_df.iloc[train_index]
            train_label = y.iloc[train_index]
            val_data = train_df.iloc[val_index]
            val_label = y.iloc[val_index]

            train_x = {
                "input_age": train_data[["age"]],
                "input_job": train_data[["job"]],
                "input_marital": train_data[["marital"]],
                'input_education': train_data[["education"]],
                "input_housing": train_data[["housing"]],
                "input_loan": train_data[["loan"]],
                "input_contact": train_data[["contact"]],
                "input_day": train_data[["day"]],
                "input_month": train_data[["month"]],
                "input_campaign": train_data[["campaign"]],
                "input_previous": train_data[["previous"]],
                "input_poutcome": train_data[["poutcome"]],
                "input_duration": train_data[["duration"]],
                "input_dayofyear": train_data[["dayofyear"]],
            #    "input_cdp": train_data[["cdp"]],
            }
            
            val_x = {
                "input_age": val_data[["age"]],
                "input_job": val_data[["job"]],
                "input_marital": val_data[["marital"]],
                'input_education': val_data[["education"]],
                "input_housing": val_data[["housing"]],
                "input_loan": val_data[["loan"]],
                "input_contact": val_data[["contact"]],
                "input_day": val_data[["day"]],
                "input_month": val_data[["month"]],
                "input_campaign": val_data[["campaign"]],
                "input_previous": val_data[["previous"]],
                "input_poutcome": val_data[["poutcome"]],
                "input_duration": val_data[["duration"]],
                "input_dayofyear": val_data[["dayofyear"]],
            #    "input_cdp": val_data[["cdp"]],
            }

            # Clear clutter from previous Keras session graphs.
            clear_session()

            # Entity Embedding
            input_age = Input(shape=(1,), name="input_age")
            embed_age = Embedding(91, dim_age, name="embed_age")(input_age)
            flatt_age = Flatten(name="flatt_age")(embed_age)

            input_job = Input(shape=(1,), name="input_job")
            embed_job = Embedding(13, dim_job, name="embed_job")(input_job)
            flatt_job = Flatten(name="flatt_job")(embed_job)

            input_marital = Input(shape=(1,), name="input_marital")
            embed_marital = Embedding(3, 2, name="embed_marital")(input_marital)
            flatt_marital = Flatten(name="flatt_marital")(embed_marital)

            input_education = Input(shape=(1,), name="input_education")
            embed_education = Embedding(4, 3, name="embed_education")(input_education)
            flatt_education = Flatten(name="flatt_education")(embed_education)

            input_housing = Input(shape=(1,), name="input_housing")
            embed_housing = Embedding(2, 4, name="embed_housing")(input_housing)
            flatt_housing = Flatten(name="flatt_housing")(embed_housing)

            input_loan = Input(shape=(1,), name="input_loan")
            embed_loan = Embedding(2, 4, name="embed_loan")(input_loan)
            flatt_loan = Flatten(name="flatt_loan")(embed_loan)

            input_contact = Input(shape=(1,), name="input_contact")
            embed_contact = Embedding(3, 2, name="embed_contact")(input_contact)
            flatt_contact = Flatten(name="flatt_contact")(embed_contact)

            input_day = Input(shape=(1,), name="input_day")
            embed_day = Embedding(32, dim_day, name="embed_day")(input_day)
            flatt_day = Flatten(name="flatt_day")(embed_day)

            input_month = Input(shape=(1,), name="input_month")
            embed_month = Embedding(13, 6, name="embed_month")(input_month)
            flatt_month = Flatten(name="flatt_month")(embed_month)

            input_campaign = Input(shape=(1,), name="input_campaign")
            embed_campaign = Embedding(6, 3, name="embed_campaign")(input_campaign)
            flatt_campaign = Flatten(name="flatt_campaign")(embed_campaign)

            input_previous = Input(shape=(1,), name="input_previous")
            embed_previous = Embedding(4, 3, name="embed_previous")(input_previous)
            flatt_previous = Flatten(name="flatt_previous")(embed_previous)

            input_poutcome = Input(shape=(1,), name="input_poutcome")
            embed_poutcome = Embedding(4, 3, name="embed_poutcome")(input_poutcome)
            flatt_poutcome = Flatten(name="flatt_poutcome")(embed_poutcome)

            input_duration = Input(shape=(1,), name="input_duration")
            dense_duration = Dense(dim_duration, name="fc_duration")(input_duration)

            input_dayofyear = Input(shape=(1,), name="input_dayofyear")
            dense_dayofyear = Dense(dim_dayofyear, name="fc_dayofyear")(input_dayofyear)

            # ネットワーク接続
            concat_layers = [
                flatt_age,
                flatt_job,
                flatt_marital,
                flatt_education,
                flatt_housing,
                flatt_loan,
                flatt_contact,
                flatt_day,
                flatt_month,
                flatt_campaign,
                flatt_previous,
                flatt_poutcome,
                dense_duration,
                dense_dayofyear,
            #    dense_cdp,
            ]

            x = Concatenate()(concat_layers)

            x = Activation("relu")(x)
            x = Dense(128, activation="relu")(x)
            x = Dense(64, activation="relu")(x)
            x = Dense(32, activation="relu")(x)
            x = Dense(16, activation="relu")(x)
            x = Dense(8, activation="relu")(x)

            output_layer = Dense(1, activation="sigmoid")(x)

            input_layers = [
                input_age,
                input_job,
                input_marital,
                input_education,
                input_housing,
                input_loan,
                input_contact,
                input_day,
                input_month,
                input_campaign,
                input_previous,
                input_poutcome,
                input_duration,
                input_dayofyear,
            #    input_cdp,
            ]

            model = Model(input_layers, output_layer)

            model.compile(
                loss=losses.mean_squared_error,
                optimizer=Adam(),
                metrics=[AUC()]
            )

            # 学習
            history = model.fit(
                train_x,
                train_label,
                validation_data=(val_x, val_label),
                #validation_split=0.2,
                batch_size=24,
                epochs=100,
                verbose=0,
                callbacks=[
                    EarlyStopping(
                        monitor="val_auc",
                        patience=3,
                        mode='max',
                        verbose=0,
                        restore_best_weights=True,
                    )
                ]
            )

            score = max(history.history['val_auc'][-4:])
            print(score)
            
            val_aucs.append(score)
            self._boosters.append(model)
            
        score = sum(val_aucs) / len(val_aucs)
        return score
    
    def callback(self, study, trial):
        if study.best_trial == trial:
            self.best_boosters = self._boosters

## ハイパーパラメータチューニング


In [14]:
objective = Objective()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20, callbacks=[objective.callback])

0.9012647271156311
0.8925905227661133
0.5
0.8931202292442322
0.8856481909751892


[I 2020-08-29 01:43:05,677] Trial 0 finished with value: 0.8145247340202332 and parameters: {'dim_age': 49, 'dim_job': 10, 'dim_day': 13, 'dim_duration': 7, 'dim_dayofyear': 185}. Best is trial 0 with value: 0.8145247340202332.


0.9176896214485168
0.8948123455047607
0.8905540704727173
0.8811097741127014
0.881950855255127


[I 2020-08-29 01:56:37,752] Trial 1 finished with value: 0.8932233333587647 and parameters: {'dim_age': 70, 'dim_job': 2, 'dim_day': 22, 'dim_duration': 7, 'dim_dayofyear': 148}. Best is trial 1 with value: 0.8932233333587647.


0.8927920460700989
0.8898373246192932
0.8804799914360046
0.8909855484962463
0.8755835890769958


[I 2020-08-29 02:07:40,227] Trial 2 finished with value: 0.8859356999397278 and parameters: {'dim_age': 1, 'dim_job': 10, 'dim_day': 11, 'dim_duration': 1, 'dim_dayofyear': 52}. Best is trial 1 with value: 0.8932233333587647.


0.911544680595398
0.9079148173332214
0.8894345760345459
0.8963086009025574
0.8761250972747803


[I 2020-08-29 02:21:45,054] Trial 3 finished with value: 0.8962655544281006 and parameters: {'dim_age': 26, 'dim_job': 7, 'dim_day': 20, 'dim_duration': 4, 'dim_dayofyear': 24}. Best is trial 3 with value: 0.8962655544281006.


0.9007205367088318
0.9000857472419739
0.9039461016654968
0.8972258567810059
0.888151228427887


[I 2020-08-29 02:35:25,948] Trial 4 finished with value: 0.8980258941650391 and parameters: {'dim_age': 76, 'dim_job': 12, 'dim_day': 26, 'dim_duration': 1, 'dim_dayofyear': 45}. Best is trial 4 with value: 0.8980258941650391.


0.9075884819030762
0.9047018885612488
0.8998344540596008
0.8949704170227051
0.8727635145187378


[I 2020-08-29 02:49:37,244] Trial 5 finished with value: 0.8959717512130737 and parameters: {'dim_age': 44, 'dim_job': 11, 'dim_day': 16, 'dim_duration': 1, 'dim_dayofyear': 95}. Best is trial 4 with value: 0.8980258941650391.


0.9151551723480225
0.8966037631034851
0.9050310850143433
0.8934460878372192
0.8875579833984375


[I 2020-08-29 03:03:11,513] Trial 6 finished with value: 0.8995588183403015 and parameters: {'dim_age': 86, 'dim_job': 6, 'dim_day': 30, 'dim_duration': 5, 'dim_dayofyear': 26}. Best is trial 6 with value: 0.8995588183403015.


0.9095667004585266
0.8822121620178223
0.5051062107086182
0.8942375183105469
0.8928601145744324


[I 2020-08-29 03:15:23,265] Trial 7 finished with value: 0.8167965412139893 and parameters: {'dim_age': 82, 'dim_job': 9, 'dim_day': 22, 'dim_duration': 1, 'dim_dayofyear': 38}. Best is trial 6 with value: 0.8995588183403015.


0.9148347973823547
0.8995276689529419
0.9077818989753723
0.8735314607620239
0.8896715641021729


[I 2020-08-29 03:31:46,325] Trial 8 finished with value: 0.8970694780349732 and parameters: {'dim_age': 76, 'dim_job': 7, 'dim_day': 11, 'dim_duration': 9, 'dim_dayofyear': 140}. Best is trial 6 with value: 0.8995588183403015.


0.9108965396881104
0.887925922870636
0.8994548916816711
0.8887077569961548
0.8734116554260254


[I 2020-08-29 03:43:42,021] Trial 9 finished with value: 0.8920793533325195 and parameters: {'dim_age': 80, 'dim_job': 4, 'dim_day': 4, 'dim_duration': 1, 'dim_dayofyear': 85}. Best is trial 6 with value: 0.8995588183403015.


0.9132255911827087
0.9008768796920776
0.905912458896637
0.8939751386642456
0.8841971158981323


[I 2020-08-29 03:58:44,123] Trial 10 finished with value: 0.8996374368667602 and parameters: {'dim_age': 48, 'dim_job': 4, 'dim_day': 30, 'dim_duration': 4, 'dim_dayofyear': 17}. Best is trial 10 with value: 0.8996374368667602.


0.89037024974823
0.8981757164001465
0.9074195623397827
0.8943236470222473
0.8931994438171387


[I 2020-08-29 04:14:22,008] Trial 11 finished with value: 0.896697723865509 and parameters: {'dim_age': 55, 'dim_job': 4, 'dim_day': 31, 'dim_duration': 4, 'dim_dayofyear': 10}. Best is trial 10 with value: 0.8996374368667602.


0.9053505063056946
0.8927708268165588
0.904384195804596
0.8904359936714172
0.8911319971084595


[I 2020-08-29 04:27:21,886] Trial 12 finished with value: 0.8968147039413452 and parameters: {'dim_age': 21, 'dim_job': 4, 'dim_day': 31, 'dim_duration': 4, 'dim_dayofyear': 68}. Best is trial 10 with value: 0.8996374368667602.


0.9065423011779785
0.8848127722740173
0.8991492390632629
0.8913244009017944
0.8879469037055969


[I 2020-08-29 04:41:06,963] Trial 13 finished with value: 0.89395512342453 and parameters: {'dim_age': 32, 'dim_job': 1, 'dim_day': 28, 'dim_duration': 6, 'dim_dayofyear': 12}. Best is trial 10 with value: 0.8996374368667602.


0.9130377173423767
0.90215003490448
0.5
0.8927571773529053
0.8924822211265564


[I 2020-08-29 04:54:00,831] Trial 14 finished with value: 0.8200854301452637 and parameters: {'dim_age': 62, 'dim_job': 5, 'dim_day': 26, 'dim_duration': 3, 'dim_dayofyear': 134}. Best is trial 10 with value: 0.8996374368667602.


0.9161125421524048
0.8979077935218811
0.9048080444335938
0.891654908657074
0.5


[I 2020-08-29 05:05:38,872] Trial 15 finished with value: 0.8220966577529907 and parameters: {'dim_age': 7, 'dim_job': 6, 'dim_day': 31, 'dim_duration': 3, 'dim_dayofyear': 13}. Best is trial 10 with value: 0.8996374368667602.


0.8837816715240479
0.8769712448120117
0.8948113322257996
0.8757666349411011
0.8738437294960022


[I 2020-08-29 05:22:41,685] Trial 16 finished with value: 0.8810349225997924 and parameters: {'dim_age': 89, 'dim_job': 2, 'dim_day': 1, 'dim_duration': 5, 'dim_dayofyear': 69}. Best is trial 10 with value: 0.8996374368667602.


0.907365083694458
0.8989626169204712
0.8971294164657593
0.8987180590629578
0.8903450965881348


[I 2020-08-29 05:36:22,932] Trial 17 finished with value: 0.8985040545463562 and parameters: {'dim_age': 90, 'dim_job': 8, 'dim_day': 27, 'dim_duration': 9, 'dim_dayofyear': 114}. Best is trial 10 with value: 0.8996374368667602.


0.8805152177810669
0.8959172368049622
0.9007870554924011
0.8788589239120483
0.8908132314682007


[I 2020-08-29 05:46:23,771] Trial 18 finished with value: 0.8893783330917359 and parameters: {'dim_age': 38, 'dim_job': 3, 'dim_day': 18, 'dim_duration': 6, 'dim_dayofyear': 33}. Best is trial 10 with value: 0.8996374368667602.


0.9066810011863708
0.8869205713272095
0.905071496963501
0.9011879563331604
0.8941394686698914


[I 2020-08-29 06:02:29,503] Trial 19 finished with value: 0.8988000988960266 and parameters: {'dim_age': 15, 'dim_job': 6, 'dim_day': 24, 'dim_duration': 7, 'dim_dayofyear': 63}. Best is trial 10 with value: 0.8996374368667602.


In [15]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 20
Best trial:
  Value: 0.8996374368667602
  Params: 
    dim_age: 48
    dim_job: 4
    dim_day: 30
    dim_duration: 4
    dim_dayofyear: 17


## 評価

In [16]:
optuna.importance.get_param_importances(study)

OrderedDict([('dim_dayofyear', 0.4020900536006124),
             ('dim_job', 0.22452587076375227),
             ('dim_duration', 0.19621970567406766),
             ('dim_age', 0.1235048512718301),
             ('dim_day', 0.05365951868973766)])

## 推論

In [17]:
best_models = objective.best_boosters

test_preds = []

for model in best_models:
    x = {
        "input_age": test_df[["age"]],
        "input_job": test_df[["job"]],
        "input_marital": test_df[["marital"]],
        'input_education': test_df[["education"]],
        "input_housing": test_df[["housing"]],
        "input_loan": test_df[["loan"]],
        "input_contact": test_df[["contact"]],
        "input_day": test_df[["day"]],
        "input_month": test_df[["month"]],
        "input_campaign": test_df[["campaign"]],
        "input_previous": test_df[["previous"]],
        "input_poutcome": test_df[["poutcome"]],
        "input_duration": test_df[["duration"]],
        "input_dayofyear": test_df[["dayofyear"]],
    #    "input_cdp": train_df[["cdp"]],
    }    
    
    test_preds.append(model.predict(x))

test_pred = np.average(np.array(test_preds), axis=0)

In [18]:
test_pred

array([[0.94222814],
       [0.05698726],
       [0.01280272],
       ...,
       [0.02887951],
       [0.00715769],
       [0.065445  ]], dtype=float32)

In [19]:
submit_df[1] = test_pred
submit_df.to_csv('submit-nn.csv', header=False, index=False)