In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer

In [5]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

for train_indices, test_indices in sss.split(data, data[['Survived', 'Pclass', 'Sex']]):
    train_set, test_set = data.loc[train_indices], data.loc[test_indices]
    
train_set.head()

y_train, y_test = train_set['Survived'], test_set['Survived']

In [6]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean'))
])

categoric_pipeline = Pipeline(steps=[
    ('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

column_transformer = ColumnTransformer(transformers=[
    ('impute-numeric', numeric_pipeline, ['Age']),
    ('one-hot-encode', categoric_pipeline, ['Sex']),
    ('keep', 'passthrough' , ['Pclass', 'SibSp', 'Parch', 'Fare'])
])

scaler = StandardScaler()

In [7]:
X_train = scaler.fit_transform(column_transformer.fit_transform(train_set))
X_test = scaler.fit_transform(column_transformer.fit_transform(test_set))

X_train = pd.DataFrame(data=X_train)
X_test = pd.DataFrame(data=X_test)
X_train.describe()
X_test.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,179.0,179.0,179.0,179.0,179.0,179.0,179.0
mean,1.535705e-15,-1.290092e-16,-9.675687e-17,-3.101182e-17,4.465702e-17,-1.9847560000000003e-17,-7.442836000000001e-17
std,1.002805,1.002805,1.002805,1.002805,1.002805,1.002805,1.002805
min,-2.453097,-0.7460038,-1.340476,-1.55751,-0.4787702,-0.4559118,-0.7128721
25%,-0.5121284,-0.7460038,-1.340476,-0.3660814,-0.4787702,-0.4559118,-0.5243406
50%,1.462194e-15,-0.7460038,0.7460038,0.8253472,-0.4787702,-0.4559118,-0.3967013
75%,0.4139063,1.340476,0.7460038,0.8253472,0.4950919,-0.4559118,-0.06330732
max,3.33606,1.340476,0.7460038,0.8253472,7.312127,6.251612,5.431967


In [38]:
from sklearn.neural_network import MLPClassifier

In [39]:
mlp = MLPClassifier(random_state=0)
mlp.fit(X_train, y_train)
y_result = mlp.predict(X_test)



In [44]:
def get_false_indices(y_result, y_test):
    false_indices = []
    for index, prediction in enumerate(y_result):
        if(prediction != y_test.to_numpy()[index]):
            false_indices.append(index)

    return false_indices

print(mlp.score(X_test, y_test))

0.8100558659217877


In [49]:
X_final = scaler.fit_transform(column_transformer.fit_transform(data))
y_final = data['Survived']

y_result_final = mlp.predict(X_final)

In [50]:
print(y_result_final)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0
 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1
 0 0 0 0 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1
 1 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0
 0 1 0 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0
 0 1 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0
 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1
 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 0 0 0 

In [53]:
false_indices = get_false_indices(y_result_final, y_final)
data['Correct'] = np.nan

for index, row in data.iterrows():
    if(index in false_indices):
        row['Correct']=False
    else:
        row['Correct']=True

data['Survived'] = y_result_final

data.to_csv('final.csv', index=False)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Correct
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,
