### Importing and getting dataset

In [408]:
import numpy as np
import pandas as pd

In [409]:
train_dataset = pd.read_csv('./data/train.csv')

In [410]:
X = train_dataset.copy().drop('Survived', axis=1)
y = train_dataset['Survived']

In [411]:
X = X.drop(['Fare', 'Ticket'], axis=1)
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,C148,C


---
### Replacing NANs

In [412]:
girls_df = X.copy()[X['Name'].str.contains('Miss')]
fill_na = np.median(girls_df['Age'].value_counts()[girls_df.copy()['Age'].value_counts() > 3].index.sort_values())
girls_df['Age'] = girls_df['Age'].fillna(value=fill_na)
fill_na

18.0

In [413]:
boys_df = X.copy()[X['Name'].str.contains('Master')]
fill_na = np.median(boys_df['Age'].value_counts()[boys_df['Age'].value_counts() > 3].index.sort_values())
boys_df['Age'] = boys_df['Age'].fillna(value=fill_na)
fill_na

3.0

In [414]:
women_df = X.copy()[~X['Name'].str.contains('Miss')]
women_df = women_df.copy()[women_df['Sex'] == 'female']
fill_na = np.median(women_df['Age'].value_counts()[women_df['Age'].value_counts() > 3].index.sort_values())
women_df['Age'] = women_df['Age'].fillna(value=fill_na)
fill_na

35.0

In [415]:
men_df = X.copy()[~X['Name'].str.contains('Master')]
men_df = men_df.copy()[men_df['Sex'] == 'male']
fill_na = np.median(men_df['Age'].value_counts()[men_df['Age'].value_counts() > 3].index.sort_values())
men_df['Age'] = men_df['Age'].fillna(value=fill_na)
fill_na

33.0

In [416]:
new_dataset = pd.concat([girls_df, boys_df, women_df, men_df])

In [417]:
value_for_embarked_na = new_dataset['Embarked'].value_counts().index[0]
new_dataset['Embarked'] = new_dataset['Embarked'].fillna(value_for_embarked_na)

In [418]:
pclass_1 = new_dataset.copy()[new_dataset['Pclass'] == 1]
pclass_other = new_dataset.copy()[new_dataset['Pclass'] != 1]

In [419]:
to_na_1 = pclass_1['Cabin'].str[0].value_counts().index[0]
to_na_1

'C'

In [420]:
to_na_other = 'Unknown'

In [421]:
pclass_1['Cabin'] = pclass_1['Cabin'].fillna(to_na_1)
pclass_other['Cabin'] = pclass_other['Cabin'].fillna(to_na_other)

In [422]:
df_ = pd.concat([pclass_1, pclass_other])
df_

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Cabin,Embarked
11,12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,C103,S
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,B28,S
88,89,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,C23 C25 C27,S
136,137,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,D47,S
177,178,1,"Isham, Miss. Ann Elizabeth",female,50.0,0,0,C49,C
...,...,...,...,...,...,...,...,...,...
881,882,3,"Markun, Mr. Johann",male,33.0,0,0,Unknown,S
883,884,2,"Banfield, Mr. Frederick James",male,28.0,0,0,Unknown,S
884,885,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,Unknown,S
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,Unknown,S


In [423]:
cabin_dataset = df_.copy()[df_['Cabin'] != 'Unknown']
unknown_dataset = df_.copy()[df_['Cabin'] == 'Unknown']

str_cabin = cabin_dataset.copy()['Cabin'].str[0]
num_of_cabins = cabin_dataset.copy()['Cabin'].str.split().str.len()

cabin_dataset['Cabin Class'] = str_cabin
cabin_dataset['Num Of Cabins'] = num_of_cabins

unknown_dataset['Cabin Class'] = 'none'
unknown_dataset['Num Of Cabins'] = 0

In [424]:
cabin_data = pd.concat([cabin_dataset, unknown_dataset])
cabin_data = cabin_data.sort_index()

In [425]:
new_dataset = cabin_data.copy()
new_dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Cabin,Embarked,Cabin Class,Num Of Cabins
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,Unknown,S,none,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C85,C,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,Unknown,S,none,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,C123,S,C,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,Unknown,S,none,0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,Unknown,S,none,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,B42,S,B,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,18.0,1,2,Unknown,S,none,0
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,C148,C,C,1


In [426]:
def name_extract(word):
    return word.split(',')[1].split('.')[0].strip()

In [446]:
new_dataset['Salutation'] = new_dataset.copy()['Name'].apply(name_extract)
new_dataset.loc[~new_dataset.copy()['Salutation'].isin(['Mr', 'Miss', 'Mrs', 'Master']), 'Salutation'] = 'Other'
new_dataset.count()

PassengerId      891
Pclass           891
Name             891
Sex              891
Age              891
SibSp            891
Parch            891
Cabin            891
Embarked         891
Cabin Class      891
Num Of Cabins    891
Salutation       891
dtype: int64

In [428]:
new_dataset['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

---
### Encoding Features

In [447]:
from sklearn.preprocessing import OneHotEncoder

In [448]:
X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation', 'Cabin Class']]
new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch', 'Num Of Cabins']]

In [450]:
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X_str)

categories = list(encoder.categories_[0])
categories.extend(encoder.categories_[1])
categories.extend(encoder.categories_[2])
categories.extend(encoder.categories_[3])
categories

['female',
 'male',
 'C',
 'Q',
 'S',
 'Master',
 'Miss',
 'Mr',
 'Mrs',
 'Other',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'T',
 'none']

In [451]:
new_X[categories] = X_encoded
new_X = new_X.sort_index()
new_X

Unnamed: 0,Pclass,Age,SibSp,Parch,Num Of Cabins,female,male,C,Q,S,...,Mrs,Other,A,B,D,E,F,G,T,none
0,3,22.0,1,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,38.0,1,0,1,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,26.0,0,0,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,1,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,35.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1,19.0,0,0,1,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
888,3,18.0,1,2,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
889,1,26.0,0,0,1,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
### Getting Polynomial Features

In [452]:
from sklearn.preprocessing import PolynomialFeatures

In [453]:
poly = PolynomialFeatures(2)

In [454]:
X_poly = poly.fit_transform(new_X)
columns = poly.get_feature_names()
poly_df = pd.DataFrame(X_poly, columns=columns)

In [455]:
new_X = poly_df.copy()
new_X

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x19^2,x19 x20,x19 x21,x19 x22,x20^2,x20 x21,x20 x22,x21^2,x21 x22,x22^2
0,1.0,3.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,3.0,26.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,35.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,3.0,35.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,1.0,2.0,27.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1.0,1.0,19.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,1.0,3.0,18.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
889,1.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
### Scaling

In [456]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 5))
X_scaled = scaler.fit_transform(new_X)

---
### Feature Selection

In [457]:
from sklearn.tree import DecisionTreeClassifier

In [458]:
'''xgb = XGBRegressor(n_estimators=1000)
xgb.fit(X_scaled, y)'''

'xgb = XGBRegressor(n_estimators=1000)\nxgb.fit(X_scaled, y)'

In [459]:
'''xgb.feature_importances_'''

'xgb.feature_importances_'

In [460]:
dt = DecisionTreeClassifier(random_state=9, criterion='entropy', max_depth=50)
dt.fit(X_scaled, y)

DecisionTreeClassifier(criterion='entropy', max_depth=50, random_state=9)

In [461]:
importance = dt.feature_importances_
#df = pd.DataFrame(importance, index=columns, columns=['importance'])
#columns_keep = df[df['importance'] > 0].sort_values(by='importance', ascending=False).index.tolist()

In [603]:
# Feature Importance
fi_col = []
fi = []

for i, column in enumerate(new_X):
    fi_col.append(column)
    fi.append(dt.feature_importances_[i])

fi_df = zip(fi_col, fi)
fi_df = pd.DataFrame(fi_df, columns=['feature', 'feature_importance'])

fi_df = fi_df.sort_values('feature_importance', ascending=False).reset_index()

columns_to_keep = fi_df[fi_df['feature_importance'] >= 0.001]['feature']
columns_to_keep

0       x1 x6
1          x0
2       x0 x1
3       x1 x9
4      x1 x22
5       x2 x3
6       x1 x2
7        x1^2
8          x1
9      x1 x12
10      x1 x5
11     x1 x13
12      x1 x3
13     x1 x11
14     x1 x15
15      x10^2
16      x1 x4
17      x0 x2
18      x1 x8
19     x8 x11
20     x1 x14
21     x2 x16
22    x11 x22
23      x3 x4
24     x1 x17
25     x6 x18
26      x3 x9
27      x3 x7
28      x0 x3
29      x0 x4
30      x2 x9
31    x11 x20
32      x2 x8
33      x3 x5
34       x9^2
35     x1 x16
36      x1 x7
37      x2 x4
38     x3 x11
39         x9
40      x2 x5
41      x4 x7
42       x3^2
43     x6 x15
44     x2 x22
45         x2
46      x2 x6
47     x1 x10
48      x2 x7
49      x0 x8
Name: feature, dtype: object

In [604]:
new_X_keep = new_X.copy()[columns_to_keep]
new_X_keep

Unnamed: 0,x1 x6,x0,x0 x1,x1 x9,x1 x22,x2 x3,x1 x2,x1^2,x1,x1 x12,...,x2 x5,x4 x7,x3^2,x6 x15,x2 x22,x2,x2 x6,x1 x10,x2 x7,x0 x8
0,22.0,3.0,66.0,22.0,22.0,0.0,22.0,484.0,22.0,22.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1,0.0,1.0,38.0,0.0,0.0,0.0,38.0,1444.0,38.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,3.0,78.0,26.0,26.0,0.0,0.0,676.0,26.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,35.0,35.0,0.0,0.0,35.0,1225.0,35.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,35.0,3.0,105.0,35.0,35.0,0.0,0.0,1225.0,35.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,2.0,54.0,27.0,27.0,0.0,0.0,729.0,27.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,0.0,1.0,19.0,19.0,0.0,0.0,0.0,361.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,0.0,3.0,54.0,18.0,18.0,2.0,18.0,324.0,18.0,0.0,...,1.0,0.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
889,26.0,1.0,26.0,0.0,0.0,0.0,0.0,676.0,26.0,26.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
### Model

In [605]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, classification_report

In [606]:
X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=0.2, random_state=13)
#X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=None)

#### LogisticRegression

In [607]:
from sklearn.linear_model import LogisticRegression

In [608]:
model = LogisticRegression(max_iter=1e6, C=0.7e-2, solver='lbfgs', random_state=None, n_jobs=-1)
model.fit(X_train, y_train)

y_train_predicted = model.predict(X_train)
print('Train score:', model.score(X_train, y_train) * 100)
print('Train Logistic Loss', log_loss(y_train, y_train_predicted))

y_test_predicted = model.predict(X_test)
print('Test score:', model.score(X_test, y_test) * 100)
print('Test Logistic Loss', log_loss(y_test, y_test_predicted))

'''y_valid_predicted = model.predict(X_valid)
print('Valid score:', model.score(X_valid, y_valid) * 100)
print('Valid Logistic Loss', log_loss(y_valid, y_valid_predicted))'''

Train score: 83.28651685393258
Train Logistic Loss 5.77268647706604
Test score: 86.03351955307262
Test Logistic Loss 4.823896122050665


"y_valid_predicted = model.predict(X_valid)\nprint('Valid score:', model.score(X_valid, y_valid) * 100)\nprint('Valid Logistic Loss', log_loss(y_valid, y_valid_predicted))"

#### SVM

In [609]:
from sklearn.svm import LinearSVC

#### Neural network

In [610]:
from sklearn.neural_network import MLPClassifier

In [None]:
max_test_score, max_train_score, max_score, our_r_s = 0, 0, 0., -1
for r_s in range(0, 1100):
    X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=0.21, random_state=1054)
    mlp = MLPClassifier(hidden_layer_sizes=(82), activation='logistic', alpha=0.9e-5,
                        epsilon=1e-7, solver='adam', max_iter=100000000000, random_state=r_s)
    mlp.fit(X_train, y_train.values.ravel())

    #predictions_train = mlp.predict(X_train)
    #print(classification_report(y_train, predictions_train))
    
    train_score = mlp.score(X_train, y_train)
    test_score = mlp.score(X_test, y_test)
    score = (1 / 2) * (train_score + test_score)
    if score > max_score or max_test_score < test_score:
        if max_test_score < test_score:
            max_test_score = test_score 
        our_r_s = r_s
        if max_score < score:
            max_score = score 
        print(train_score, test_score, score, our_r_s)
    #predictions_test = mlp.predict(X_test)
    #print(classification_report(y_test, predictions_test))

0.8691322901849218 0.8670212765957447 0.8680767833903332 0
0.8790896159317212 0.8617021276595744 0.8703958717956478 1
0.8762446657183499 0.8723404255319149 0.8742925456251325 2
0.8819345661450925 0.8723404255319149 0.8771374958385036 3
0.8776671408250356 0.8776595744680851 0.8776633576465603 6


In [571]:
# X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation', 'Cabin Class']]
# new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch', 'Num Of Cabins']]

# 0.8790896159317212 0.898936170212766 0.8890128930722436

X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=0.21, random_state=1054)
mlp = MLPClassifier(hidden_layer_sizes=(60), activation='logistic', alpha=0.85e-5, 
                    epsilon=1.2e-7, solver='adam', max_iter=100000000000, random_state=335)

mlp.fit(X_train, y_train.values.ravel())


predictions_train = mlp.predict(X_train)
y_train_predicted = mlp.predict(X_train)
print(classification_report(y_train, predictions_train))
print('Train Logistic Loss', log_loss(y_train, y_train_predicted))

predictions_test = mlp.predict(X_test)
y_test_predicted = mlp.predict(X_test)
print(classification_report(y_test, predictions_test))
print('Test Logistic Loss', log_loss(y_test, y_test_predicted))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91       434
           1       0.93      0.74      0.83       269

    accuracy                           0.88       703
   macro avg       0.90      0.85      0.87       703
weighted avg       0.89      0.88      0.88       703

Train Logistic Loss 4.126982093224069
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       115
           1       0.92      0.77      0.84        73

    accuracy                           0.88       188
   macro avg       0.89      0.86      0.87       188
weighted avg       0.89      0.88      0.88       188

Test Logistic Loss 4.041792971676522


In [525]:
# X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation', 'Cabin Class']]
# new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch', 'Num Of Cabins']]

# 0.8492176386913229  0.8829787234042553  0.8660981810477891  train_random_state = 41
# 0.8406827880512091  0.898936170212766   0.8698094791319875  train_random_state = 1054

# 0.8520625889046942 0.9042553191489362 0.8781589540268152 hidden_layer_sizes = 38

# 0.844950213371266 0.9148936170212766 0.8799219151962713 nn_random_state = 615

# 0.8406827880512091 0.9202127659574468 0.880447777004328  epsilon=1e-7

# 0.8421052631578947 0.9202127659574468 0.8811590145576708  alpha=0.3e-5

X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=0.21, random_state=1054)
mlp = MLPClassifier(hidden_layer_sizes=(38), activation='relu', alpha=0.2e-5, 
                    epsilon=1e-8, solver='adam', max_iter=100000000000, random_state=615)
mlp.fit(X_train, y_train.values.ravel())


predictions_train = mlp.predict(X_train)
y_train_predicted = mlp.predict(X_train)
print(classification_report(y_train, predictions_train))
print('Train Logistic Loss', log_loss(y_train, y_train_predicted))

predictions_test = mlp.predict(X_test)
y_test_predicted = mlp.predict(X_test)
print(classification_report(y_test, predictions_test))
print('Test Logistic Loss', log_loss(y_test, y_test_predicted))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       434
           1       0.78      0.69      0.73       269

    accuracy                           0.81       703
   macro avg       0.80      0.79      0.79       703
weighted avg       0.81      0.81      0.81       703

Train Logistic Loss 6.632683346201013
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       115
           1       0.87      0.81      0.84        73

    accuracy                           0.88       188
   macro avg       0.88      0.86      0.87       188
weighted avg       0.88      0.88      0.88       188

Test Logistic Loss 4.225526880105413


In [501]:
# X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation']]
# new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch']]

# 0.9210526315789473 

X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=17, random_state=1054)
mlp = MLPClassifier(hidden_layer_sizes=(23), activation='relu', alpha=1.1e-5, 
                    epsilon=1e-8, solver='adam', max_iter=100000000000, random_state=245)
mlp.fit(X_train, y_train.values.ravel())

predictions_train = mlp.predict(X_train)
print(classification_report(y_train, predictions_train))

predictions_test = mlp.predict(X_test)
print(classification_report(y_test, predictions_test))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89       538
           1       0.92      0.66      0.77       336

    accuracy                           0.85       874
   macro avg       0.87      0.81      0.83       874
weighted avg       0.86      0.85      0.84       874

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.83      0.91         6

    accuracy                           0.94        17
   macro avg       0.96      0.92      0.93        17
weighted avg       0.95      0.94      0.94        17



In [48]:
# X_str = new_dataset.copy()[['Sex', 'Embarked']]
# new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch']]

# 0.85

X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=0.17, random_state=568)

mlp = MLPClassifier(hidden_layer_sizes=(15), activation='relu', alpha=1e-5, 
                        epsilon=1e-8, solver='adam', max_iter=100000000000, random_state=402)
mlp.fit(X_train, y_train.values.ravel())

predictions_train = mlp.predict(X_train)
print(classification_report(y_train, predictions_train))

predictions_test = mlp.predict(X_test)
print(classification_report(y_test, predictions_test))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87       452
           1       0.84      0.73      0.78       287

    accuracy                           0.84       739
   macro avg       0.84      0.82      0.83       739
weighted avg       0.84      0.84      0.84       739

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        97
           1       0.80      0.80      0.80        55

    accuracy                           0.86       152
   macro avg       0.84      0.84      0.84       152
weighted avg       0.86      0.86      0.86       152



---
### Getting survivors from test dataset

In [49]:
test_dataset = pd.read_csv('./data/test.csv')