#### Kaggle : https://www.kaggle.com/isaienkov/keras-neural-network-architecture-optimization

# Optimization

## Importing libraries and packages

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Warning 빨간 창 무시하기
import warnings
warnings.simplefilter('ignore')

import os

## 데이터 관리

In [2]:
os.listdir("../Kaggle_Titanic_Optimization/Input/")

['gender_submission.csv', 'test.csv', 'train.csv']

In [3]:
# Load the labelled training passengers and the unlabelled test passengers.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Kaggle_Titanic_Optimization/Input/train.csv",
        "../Kaggle_Titanic_Optimization/Input/test.csv",
    )
)

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 데이터 전처리

In [5]:
train['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [6]:
# expand=False keeps the split pieces together as a list in each row (no extra columns).
train['Name'].str.split(',',expand=False)

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [7]:
# expand=True spreads the split pieces over separate columns, one column per piece.
train['Name'].str.split(',',expand=True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


In [8]:
# The surname (the text before the first comma of Name) is used further down
# to group passengers that may belong to the same family.
for passengers in (train, test):
    passengers['LastName'] = passengers['Name'].str.split(',').str[0]

# Stack both sets so the family features can be computed over all passengers.
ds = pd.concat((train, test))

In [9]:
ds['LastName']

0             Braund
1            Cumings
2          Heikkinen
3           Futrelle
4              Allen
           ...      
413          Spector
414    Oliva y Ocana
415          Saether
416             Ware
417            Peter
Name: LastName, Length: 1309, dtype: object

In [10]:
ds

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Braund
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Futrelle
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Allen
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,Spector
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,Oliva y Ocana
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Saether
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,Ware


In [11]:
# Illustration of the surname filter used for the family features:
#   ds['LastName'] == name      -> boolean mask, True where the surname matches
#   ds[ds['LastName'] == name]  -> only the rows with that surname
# Only the selection for the last row of ds is displayed, so the mask is built
# once for that row's surname instead of once per row of ds.
s = ds[ds['LastName'] == ds['LastName'].iloc[-1]]
s

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName
128,129,1.0,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C,Peter
533,534,1.0,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C,Peter
417,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C,Peter


**가족 생존여부 표현 (NumericalVariable_DiscreteData)**

In [12]:
# For every passenger, count the OTHER passengers with the same surname whose
# Survived label is 1 (FamilySurvived) or 0 (FamilyDied). Rows without a label
# (Survived is NaN) contribute to neither count.
survived_flag = ds['Survived'].eq(1).astype('int64')
died_flag = ds['Survived'].eq(0).astype('int64')
surname_keys = ds['LastName'].to_numpy()

# The per-surname totals include the passenger themself, so each passenger's
# own flag is subtracted. Plain arrays are used because ds carries repeated
# index labels after the train/test concat; everything here is positional.
sur = survived_flag.groupby(surname_keys).transform('sum').to_numpy() - survived_flag.to_numpy()
died = died_flag.groupby(surname_keys).transform('sum').to_numpy() - died_flag.to_numpy()

ds['FamilySurvived'] = sur
ds['FamilyDied'] = died

# Family size: siblings/spouses + parents/children + the passenger.
ds['FamilySize'] = ds['SibSp'] + ds['Parch'] + 1

# IsAlone is 1 when the passenger travels alone (FamilySize == 1), else 0.
ds['IsAlone'] = ds['FamilySize'].eq(1).astype('int64')

In [13]:
# Replacement values for missing entries:
#   Fare     -> median fare of the training passengers
#   Embarked -> the constant 'Q'
fill_values = {'Fare': train['Fare'].median(), 'Embarked': 'Q'}
for column, value in fill_values.items():
    ds[column] = ds[column].fillna(value)

In [14]:
# Rows from the training file carry a Survived label; test rows do not (NaN
# after the concat), so the presence of the label separates the two sets again.
has_label = ds['Survived'].notnull()

# .copy() makes train an independent frame, so the column assignments made on
# it afterwards do not operate on a slice of ds (SettingWithCopyWarning).
train = ds[has_label].copy()

# Survived is entirely NaN for the test rows, so the column is removed there;
# drop() already returns a new frame.
test = ds[~has_label].drop(['Survived'], axis=1)

In [15]:
# Binary indicator features, added to both sets:
#   rich_woman - female passenger with Pclass <= 2
#   men_3      - male passenger with Pclass == 3
for frame in (train, test):
    is_rich_woman = (frame['Pclass'] <= 2) & (frame['Sex'] == 'female')
    is_third_class_man = (frame['Pclass'] == 3) & (frame['Sex'] == 'male')
    frame['rich_woman'] = is_rich_woman.astype('int64')
    frame['men_3'] = is_third_class_man.astype('int64')

In [16]:
train['rich_woman']

0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    0
890    0
Name: rich_woman, Length: 891, dtype: int64

In [17]:
# Store the 0/1 rich_woman flag as 8-bit integers in both sets.
for frame in (train, test):
    frame['rich_woman'] = frame['rich_woman'].astype(np.int8)

In [18]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,FamilySurvived,FamilyDied,FamilySize,IsAlone,rich_woman,men_3
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund,0,1,2,0,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,0,0,2,0,1,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,0,0,1,1,0,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle,0,1,2,0,1,0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen,1,0,1,1,0,1


In [19]:
# Print every raw Cabin value of the training set, one per line.
for cabin_value in train['Cabin'].tolist():
    print(cabin_value)

nan
C85
nan
C123
nan
nan
E46
nan
nan
nan
G6
C103
nan
nan
nan
nan
nan
nan
nan
nan
nan
D56
nan
A6
nan
nan
nan
C23 C25 C27
nan
nan
nan
B78
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
D33
nan
B30
C52
nan
nan
nan
nan
nan
B28
C83
nan
nan
nan
F33
nan
nan
nan
nan
nan
nan
nan
nan
F G73
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
C23 C25 C27
nan
nan
nan
E31
nan
nan
nan
A5
D10 D12
nan
nan
nan
nan
D26
nan
nan
nan
nan
nan
nan
nan
C110
nan
nan
nan
nan
nan
nan
nan
B58 B60
nan
nan
nan
nan
E101
D26
nan
nan
nan
F E69
nan
nan
nan
nan
nan
nan
nan
D47
C123
nan
B86
nan
nan
nan
nan
nan
nan
nan
nan
F2
nan
nan
C2
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
E33
nan
nan
nan
B19
nan
nan
nan
A7
nan
nan
C49
nan
nan
nan
nan
nan
F4
nan
A32
nan
nan
nan
nan
nan
nan
nan
F2
B4
B80
nan
nan
nan
nan
nan
nan
nan
nan
nan
G6
nan
nan
nan
A31
nan
nan
nan
nan
nan
D36
nan
nan
D15
nan
nan
nan
nan
nan
C93
nan
nan
nan
nan
nan
C83
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
n

In [20]:
# A Cabin entry may list several cabins; only its first character is kept.
# Missing cabins are encoded as 'X'.
# The .str accessor keeps the result on each frame's own index, so the
# assignment does not depend on the frame having a default 0..n-1 index.
train['Cabin'] = train['Cabin'].fillna('X').str[0]
test['Cabin'] = test['Cabin'].fillna('X').str[0]

In [21]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,FamilySurvived,FamilyDied,FamilySize,IsAlone,rich_woman,men_3
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,X,S,Braund,0,1,2,0,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,Cumings,0,0,2,0,1,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,X,S,Heikkinen,0,0,1,1,0,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,Futrelle,0,1,2,0,1,0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,X,S,Allen,1,0,1,1,0,1


In [22]:
# Columns that are removed from both sets now that the features derived from
# them (LastName, SibSp, Parch) have been built.
Drop_List = ['PassengerId','Ticket', 'LastName', 'SibSp', 'Parch']

train = train.drop(columns=Drop_List)
test = test.drop(columns=Drop_List)

In [23]:
categorical = ['Pclass', 'Sex', 'Embarked', 'Cabin']

# One-hot encode each categorical column:
#   - prefix names the new columns after their source, e.g. Sex_female
#   - the dummy columns are appended with pd.concat
#   - the original categorical column is dropped afterwards
# The dtype is pinned to uint8 so the dummies are numeric 0/1 on every pandas
# version; newer releases default to bool, which turns the mixed feature
# matrix into an object array that Keras cannot consume.
for cat in categorical:
    train = pd.concat([train, pd.get_dummies(train[cat], prefix=cat, dtype=np.uint8)], axis=1)
    train = train.drop([cat], axis=1)
    test = pd.concat([test, pd.get_dummies(test[cat], prefix=cat, dtype=np.uint8)], axis=1)
    test = test.drop([cat], axis=1)

In [24]:
train.head()

Unnamed: 0,Survived,Name,Age,Fare,FamilySurvived,FamilyDied,FamilySize,IsAlone,rich_woman,men_3,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_X
0,0.0,"Braund, Mr. Owen Harris",22.0,7.25,0,1,2,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,71.2833,0,0,2,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,1.0,"Heikkinen, Miss. Laina",26.0,7.925,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,53.1,0,1,2,0,1,0,...,1,0,0,1,0,0,0,0,0,0
4,0.0,"Allen, Mr. William Henry",35.0,8.05,1,0,1,1,0,1,...,1,0,0,0,0,0,0,0,0,1


In [25]:
# Name is no longer needed, and every remaining missing value (Age contains
# NaNs) is replaced by -1.
train = train.drop(columns=['Name']).fillna(-1)
test = test.drop(columns=['Name']).fillna(-1)

## 파라미터들 값 지정

In [26]:
# Number of epochs used when fitting the baseline model.
EPOCHS = 15

# Hand-picked baseline configuration: one hidden layer of 128 ReLU units,
# a dropout rate of 0.5 and a learning rate of 0.001.
initial_keras_params = dict(
    layers_number=1,
    n_units_l_0=128,
    activation_l_0='relu',
    dropout_l_0=0.5,
    lr=0.001,
)

## 모델 만들기

In [27]:
def keras_classifier(parameters):
    """Build and compile a dense softmax classifier from a parameter dict.

    Args:
        parameters: mapping with the keys
            'layers_number'    - number of hidden Dense layers,
            'n_units_l_{i}'    - units of hidden layer i,
            'activation_l_{i}' - activation of hidden layer i,
            'dropout_l_{i}'    - dropout rate applied after hidden layer i,
            'lr'               - learning rate for the Adam optimizer.

    Returns:
        A compiled Sequential model with a 2-unit softmax output, trained
        with categorical cross-entropy and reporting accuracy.
    """
    model = Sequential()
    layers_number = int(parameters['layers_number'])

    for i in range(layers_number):
        model.add(Dense(int(parameters['n_units_l_' + str(i)]), activation=parameters['activation_l_' + str(i)]))
        # The dropout rate is a fraction; it must be read as a float, because
        # int() would truncate any rate below 1 to 0 and disable dropout.
        model.add(Dropout(float(parameters['dropout_l_' + str(i)])))
    model.add(Dense(2, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        # `learning_rate` is the current argument name; `lr` is a deprecated alias.
        optimizer=tf.keras.optimizers.Adam(learning_rate=float(parameters['lr'])),
        metrics=['accuracy']
    )
    return model

In [28]:
# Build the baseline network from the hand-picked initial parameters.
model = keras_classifier(initial_keras_params)

In [29]:
# y is the target (Survived, one-hot encoded into 2 classes); x holds the
# features, i.e. train without the Survived column.
y = train['Survived']
y = tf.keras.utils.to_categorical(y, num_classes=2, dtype='float32')
# Cabin_T is excluded from the features (presumably because that dummy column
# exists only in the training set - verify against test.columns).
x = train.drop(['Survived', 'Cabin_T'], axis=1)
# Give the test features exactly the training feature columns, in the same
# order: dummy columns missing from test are filled with 0 and columns the
# model was never trained on are discarded.
x_test = test.reindex(columns=x.columns, fill_value=0)

# Hold out the last 20% of the rows (shuffle=False) for validation.
x, x_val, y, y_val = train_test_split(x, y, random_state=0, test_size=0.2, shuffle=False)

In [30]:
# Train the baseline model for EPOCHS epochs; validation_split additionally
# sets aside 20% of x/y for validation during training (x_val is kept for the
# evaluation below).
model.fit(x, y, validation_split=0.2, epochs=EPOCHS, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2b35be74310>

In [31]:
# Evaluate the baseline model on the held-out validation rows: the one-hot
# targets and the predicted probabilities are both turned into class indices.
val_labels = np.argmax(y_val, axis=1)
preds = np.argmax(model.predict(x_val), axis=1)

print('accuracy: ', accuracy_score(val_labels, preds))
print('f1-score: ', f1_score(val_labels, preds))

accuracy:  0.8435754189944135
f1-score:  0.7666666666666666


In [32]:
def create_model(trial):
    """Build and compile a classifier whose architecture is sampled from an Optuna trial.

    Sampled parameters (the names match the keys read by keras_classifier):
        layers_number    - 1 or 2 hidden Dense layers
        n_units_l_{i}    - 2 to 16 units in hidden layer i
        activation_l_{i} - 'relu', 'sigmoid', 'tanh' or 'elu'
        dropout_l_{i}    - dropout rate between 0.1 and 0.4
        lr               - Adam learning rate, sampled log-uniformly in [1e-5, 1e-1]

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        A compiled Sequential model with a 2-unit softmax output.
    """
    n_layers = trial.suggest_int("layers_number", 1, 2)
    model = Sequential()
    for i in range(n_layers):
        num_hidden = trial.suggest_int("n_units_l_{}".format(i), 2, 16)
        activation = trial.suggest_categorical('activation_l_{}'.format(i), ['relu', 'sigmoid', 'tanh', 'elu'])
        model.add(Dense(num_hidden, activation=activation))
        # suggest_float replaces the deprecated suggest_uniform.
        dropout = trial.suggest_float("dropout_l_{}".format(i), 0.1, 0.4)
        model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))

    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)

    model.compile(
        loss='categorical_crossentropy',
        # `learning_rate` is the current argument name; `lr` is a deprecated alias.
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        metrics=['accuracy']
    )

    return model

In [33]:
def objective(trial):
    """Optuna objective: train a sampled model and score it on the validation split.

    Besides the architecture sampled by create_model, the number of epochs
    (3 to 20) and the batch size (1 up to a quarter of the training rows)
    are sampled from the trial. Reads the notebook-level x, y, x_val, y_val.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the trained model on (x_val, y_val).
    """
    model = create_model(trial)

    epochs = trial.suggest_int("epochs", 3, 20)
    # Integer division: suggest_int takes integer bounds, and `/` would pass a float.
    batch = trial.suggest_int("batch", 1, x.shape[0] // 4)

    model.fit(
        x,
        y,
        batch_size=batch,
        epochs=epochs,
        verbose=0
    )
    preds = model.predict(x_val)
    return accuracy_score(np.argmax(y_val, axis=1), np.argmax(preds, axis=1))

In [34]:
def optimize():
    """Run a 10-trial Optuna search that maximizes `objective`.

    A TPE sampler seeded with 666 drives the search.

    Returns:
        The parameter dict of the best trial.
    """
    study = optuna.create_study(
        sampler=TPESampler(seed=666),
        direction="maximize",
    )
    study.optimize(objective, n_trials=10)
    best_parameters = study.best_params
    return best_parameters

In [35]:
# Run the hyperparameter search and keep the best trial's parameters.
params = optimize()

[32m[I 2021-01-20 23:24:14,744][0m A new study created in memory with name: no-name-0a78bd8a-98f6-43dd-8bac-de3052f8048f[0m
[32m[I 2021-01-20 23:24:15,871][0m Trial 0 finished with value: 0.6424581005586593 and parameters: {'layers_number': 1, 'n_units_l_0': 4, 'activation_l_0': 'sigmoid', 'dropout_l_0': 0.16339983197267577, 'lr': 3.2802532080132065e-05, 'epochs': 7, 'batch': 92}. Best is trial 0 with value: 0.6424581005586593.[0m
[32m[I 2021-01-20 23:24:16,997][0m Trial 1 finished with value: 0.6759776536312849 and parameters: {'layers_number': 1, 'n_units_l_0': 5, 'activation_l_0': 'relu', 'dropout_l_0': 0.11464383814000001, 'lr': 2.510234217788663e-05, 'epochs': 19, 'batch': 47}. Best is trial 1 with value: 0.6759776536312849.[0m
[32m[I 2021-01-20 23:24:17,875][0m Trial 2 finished with value: 0.7430167597765364 and parameters: {'layers_number': 1, 'n_units_l_0': 9, 'activation_l_0': 'sigmoid', 'dropout_l_0': 0.15786760091217514, 'lr': 0.0063588562697108894, 'epochs': 8, '

In [36]:
params

{'layers_number': 2,
 'n_units_l_0': 15,
 'activation_l_0': 'sigmoid',
 'dropout_l_0': 0.1238026380562101,
 'n_units_l_1': 10,
 'activation_l_1': 'elu',
 'dropout_l_1': 0.3938006747405197,
 'lr': 0.04461194337544787,
 'epochs': 18,
 'batch': 148}

In [37]:
# Split the best trial's parameters into the training settings (epochs, batch)
# and the architecture settings passed to keras_classifier. A filtered copy is
# built instead of deleting keys from `params`, so this cell can be re-run
# without raising a KeyError.
epochs = params['epochs']
batch = params['batch']
model_params = {name: value for name, value in params.items() if name not in ('epochs', 'batch')}

opt_model = keras_classifier(model_params)
opt_model.fit(x, y, validation_split=0.2, epochs=epochs, batch_size=batch)

Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<tensorflow.python.keras.callbacks.History at 0x2b360832550>

In [38]:
# Evaluate the tuned model on the held-out validation rows: the one-hot
# targets and the predicted probabilities are both turned into class indices.
val_labels = np.argmax(y_val, axis=1)
preds = np.argmax(opt_model.predict(x_val), axis=1)
print('accuracy: ', accuracy_score(val_labels, preds))
print('f1-score: ', f1_score(val_labels, preds))

accuracy:  0.8491620111731844
f1-score:  0.7804878048780488


In [39]:
# Predicted class index for every test passenger, stored as 16-bit integers.
test_probabilities = opt_model.predict(x_test)
preds = np.argmax(test_probabilities, axis=1).astype(np.int16)

In [40]:
# Use the sample submission file as a template, overwrite its Survived column
# with the model's predictions and write the result to submission.csv.
submission = pd.read_csv("../Kaggle_Titanic_Optimization/Input/gender_submission.csv")
submission = submission.assign(Survived=preds)
submission.to_csv('submission.csv', index=False)

In [41]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
