In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_extraction import DictVectorizer as DV

In [186]:
classifier = MLPClassifier(hidden_layer_sizes = 20, max_iter = 1000, solver='adam')

In [236]:
X = pd.read_csv('train.csv')
columns_to_save = [X.columns]
y = X['Survived']
X = X.drop('Survived', 1)
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [237]:
X.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [238]:
numeric_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_cols = list(set(X.columns.values.tolist()) - set(numeric_cols))

In [239]:
def calculate_means(numeric_data):
    means = np.zeros(numeric_data.shape[1])
    for j in range(numeric_data.shape[1]):
        to_sum = numeric_data.iloc[:,j]
        indices = np.nonzero(~numeric_data.iloc[:,j].isnull())[0]
        correction = np.amax(to_sum[indices])
        to_sum /= correction
        for i in indices:
            means[j] += to_sum[i]
        means[j] /= indices.size
        means[j] *= correction
    return pd.Series(means, numeric_data.columns)

In [240]:
X_means = calculate_means(X[numeric_cols])
for i in X[numeric_cols].columns:
    X[numeric_cols] = X[numeric_cols].replace(np.nan, X_means[i])
X.head()

  return getattr(obj, method)(*args, **kwds)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [241]:
X[numeric_cols].shape

(891, 6)

In [242]:
X[numeric_cols].dropna().shape

(891, 6)

In [243]:
X_cat = X[categorical_cols]
X_cat.head()

Unnamed: 0,Sex,Ticket,Embarked,Cabin,Name
0,male,A/5 21171,S,,"Braund, Mr. Owen Harris"
1,female,PC 17599,C,C85,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,female,STON/O2. 3101282,S,,"Heikkinen, Miss. Laina"
3,female,113803,S,C123,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,male,373450,S,,"Allen, Mr. William Henry"


In [244]:
X_cat = X_cat.replace(np.nan, 'NaN')
X_cat.dropna().shape

(891, 5)

In [245]:
X_cat.shape

(891, 5)

In [246]:
X_cat.dropna().shape

(891, 5)

In [247]:
encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_cat.T.to_dict().values())

In [248]:
X = np.hstack([X[numeric_cols], X_cat_oh])

In [249]:
X.shape

(891, 1732)

In [250]:
X[:5]

array([[ 1.,  3., 22., ...,  0.,  0.,  0.],
       [ 2.,  1., 38., ...,  0.,  0.,  0.],
       [ 3.,  3., 26., ...,  0.,  0.,  0.],
       [ 4.,  1., 35., ...,  0.,  0.,  0.],
       [ 5.,  3., 35., ...,  0.,  0.,  0.]])

In [251]:
# preprocess dataset, split into training and test part
# X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.4, random_state=42)

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

In [252]:
score

0.7955182072829131

In [253]:
X_test = pd.read_csv('test.csv')
X_train = pd.read_csv('train.csv')
y = X_train['Survived']
X_train = X_train.drop('Survived', 1)

In [254]:
X_test.shape

(418, 11)

In [255]:
X_train.shape

(891, 11)

In [256]:
X = X_train.merge(X_test, how='outer')
X.shape

(1309, 11)

In [257]:
X_means = calculate_means(X[numeric_cols])
for i in X[numeric_cols].columns:
    X[numeric_cols] = X[numeric_cols].replace(np.nan, X_means[i])

  return getattr(obj, method)(*args, **kwds)


In [258]:
X = X.replace(np.nan, 'NaN')

In [259]:
X_cat = X[categorical_cols]

In [260]:
X_cat_oh = encoder.fit_transform(X_cat.T.to_dict().values())

In [261]:
X = np.hstack([X[numeric_cols], X_cat_oh])
X.shape

(1309, 2435)

In [262]:
# preprocess dataset, split into training and test part
# X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X[:891], y, test_size=.4, random_state=42)

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

In [337]:
prediction = classifier.predict(X[891:])

In [338]:
prediction = pd.DataFrame(data=[X[891:,0], prediction]).T
prediction.columns = ['PassengerId', 'Survived']
prediction = prediction.astype('int')
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [339]:
prediction.to_csv('predict.csv', index=False)