In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# machine learning
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training and test tables from the working directory.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Both frames in one list so the same feature step can be looped over each.
combine = [train_df, test_df]

In [3]:
# Fill missing test-set fares with the median of the observed test fares.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained in-place assignment, which newer pandas warns
# about and does not guarantee to write through to the frame.
# median() already skips NaN, so no dropna() is needed.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

In [4]:
# Collapse Fare into two bands: 0 for fares below 20, 1 for fares of 20 or more.
for frame in combine:
    fare = frame['Fare']
    # Both masks are evaluated on the original fares; a missing fare is left as
    # NaN and makes the integer cast fail, as before.
    frame['Fare'] = fare.mask(fare < 20, 0).mask(fare >= 20, 1).astype(int)

In [6]:
# Flag passengers without a recorded cabin: 1 when Cabin is missing, else 0.
for frame in combine:
    frame['Cabin_not_present'] = frame['Cabin'].isna().astype(int)

In [7]:
# The raw Cabin text is no longer needed once the missing-cabin flag exists.
train_df, test_df = (frame.drop(columns=['Cabin']) for frame in (train_df, test_df))
# drop() returned new frames, so the shared list has to be rebuilt.
combine = [train_df, test_df]

In [8]:
# Pull the honorific out of Name: the run of letters that follows a space and
# is terminated by a period (e.g. the "Mr" in ", Mr.").
# The pattern is a raw string so that "\." reaches the regex engine without
# Python flagging it as an invalid string escape.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

In [9]:
# Honorifics folded into the single 'Rare' bucket.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants mapped onto the title they are treated as.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for frame in combine:
    frame['Title'] = (
        frame['Title']
        .replace(rare_titles, 'Rare')
        .replace(title_synonyms)
    )

In [10]:
# Integer code for each consolidated title; any title not listed becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for frame in combine:
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)

In [11]:
# Encode Sex as an integer: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for frame in combine:
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

In [12]:
# Stack train and test so that family groups can span both files.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Row
# labels are kept as they are, so the same label can occur once in the train
# half and once in the test half.
data_df = pd.concat([train_df, test_df])
data_df['LastName'] = data_df.Name.str.extract('([A-Za-z]+),', expand=False)
# Assign the filled column back instead of using fillna(inplace=True) on a
# column selection (chained in-place assignment).
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

# Value used when nothing is known about the survival of a passenger's relatives.
DEFAULT_SURVIVAL_VALUE = 0.5
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

# Passengers sharing a last name and a Fare value are treated as one family.
# NOTE(review): Fare was already collapsed to 0/1 by an earlier cell, so this key
# is effectively "last name + fare band" - confirm that this is intended.
for _, family in data_df.groupby(['LastName', 'Fare']):
    if len(family) == 1:
        # Nobody to infer from; keep the default.
        continue

    for pass_id in family['PassengerId']:
        # Everyone in the family except this passenger. Selecting by PassengerId
        # rather than dropping by row label avoids also discarding a relative
        # who happens to carry the same label in the other file.
        others = family.loc[family['PassengerId'] != pass_id, 'Survived']
        is_passenger = data_df['PassengerId'] == pass_id

        # max()/min() skip missing labels; if every relative's label is missing,
        # both are NaN, neither test passes and the default stays.
        if others.max() == 1.0:
            # At least one known relative survived.
            data_df.loc[is_passenger, 'Family_Survival'] = 1
        elif others.min() == 0.0:
            # No known survivor, but at least one relative known to have died.
            data_df.loc[is_passenger, 'Family_Survival'] = 0

  data_df = train_df.append(test_df)


Number of passengers with family survival information: 514


In [13]:
# Copy the engineered feature back by position: data_df was built as the train
# rows followed by the test rows. Splitting on len(train_df) removes the
# hard-coded row count, and using the raw values avoids depending on row-label
# alignment between the frames.
n_train = len(train_df)
family_survival = data_df['Family_Survival'].to_numpy()
train_df['Family_Survival'] = family_survival[:n_train]
test_df['Family_Survival'] = family_survival[n_train:]

In [14]:
# Name is dropped from both frames. PassengerId is dropped from the training
# frame only; the test frame keeps it because the submission cell reads it.
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])
combine = [train_df, test_df]
train_df.shape, test_df.shape

((891, 12), (418, 12))

In [15]:
# Pairwise correlations between the numeric columns. The frame still contains
# text columns (Ticket, Embarked) at this point, which pandas 2.x no longer
# drops silently, so the numeric columns are selected explicitly.
train_df.select_dtypes(include='number').corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_not_present,Title,Family_Survival
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.255496,-0.316912,0.407753,0.266836
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.652835,0.725541,-0.173929,-0.259495
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.187864,-0.140391,0.502713,0.013458
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.127828,-0.249732,-0.104766,0.058051
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.359494,0.04046,0.269623,-0.174454
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.376642,-0.036987,0.315784,-0.008292
Fare,0.255496,-0.652835,0.187864,0.127828,0.359494,0.376642,1.0,-0.49712,0.285035,0.103335
Cabin_not_present,-0.316912,0.725541,-0.140391,-0.249732,0.04046,-0.036987,-0.49712,1.0,-0.13339,-0.200943
Title,0.407753,-0.173929,0.502713,-0.104766,0.269623,0.315784,0.285035,-0.13339,1.0,0.044016
Family_Survival,0.266836,-0.259495,0.013458,0.058051,-0.174454,-0.008292,0.103335,-0.200943,0.044016,1.0


In [16]:
# Columns that are not fed to the models.
unused_columns = ['Parch', 'SibSp', 'Age', 'Embarked', 'Ticket']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [17]:
# Target and feature matrices. PassengerId is not a feature, so it is left out
# of X_test.
Y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
X_test = test_df.drop(columns="PassengerId").copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

In [18]:
# Standardise the features: the scaler is fitted on the training data only and
# then applied unchanged to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Support Vector Machines (default settings)

svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
# Accuracy on the training data itself, as a percentage with two decimals.
acc_svc = round(100 * svc.score(X_train, Y_train), 2)

In [20]:
# Start the vote matrix: one row per test passenger, one column per model.
A = Y_pred.reshape(-1, 1).copy()

In [21]:
# KNN (3 nearest neighbours)

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
# Accuracy on the training data itself, as a percentage with two decimals.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)

In [22]:
# Append the latest predictions as a new column of the vote matrix.
A = np.hstack((A, Y_pred.reshape(-1, 1)))

In [23]:
# Gaussian Naive Bayes

gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
# Accuracy on the training data itself, as a percentage with two decimals.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)

In [24]:
# Append the latest predictions as a new column of the vote matrix.
A = np.hstack((A, Y_pred.reshape(-1, 1)))

In [25]:
# Linear SVC

# random_state is fixed so repeated runs give the same model; the solver may
# shuffle the data, which otherwise makes the result vary between runs.
# The seed value matches the one already used for the MLP classifier.
linear_svc = LinearSVC(random_state=1)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
# Accuracy on the training data itself, as a percentage with two decimals.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

In [26]:
# Append the latest predictions as a new column of the vote matrix.
A = np.hstack((A, Y_pred.reshape(-1, 1)))

In [27]:
# Random Forest

# A forest is built from random bootstrap samples and feature subsets, so the
# seed is fixed to make the predictions (and the final submission) reproducible.
# The seed value matches the one already used for the MLP classifier.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=1)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on the training data itself, as a percentage with two decimals.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

In [28]:
# Append the latest predictions as a new column of the vote matrix.
A = np.hstack((A, Y_pred.reshape(-1, 1)))

In [33]:
# Neural network with sklearn: four hidden layers (8, 16, 16, 8), L-BFGS solver,
# fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 16, 16, 8),
    max_iter=10000,
    solver='lbfgs',
    random_state=1,
).fit(X_train, Y_train)
Y_pred = mlp.predict(X_test)
# Accuracy on the training data itself, as a percentage with two decimals.
acc_mlp = round(100 * mlp.score(X_train, Y_train), 2)

In [30]:
# Append the latest predictions as a new column of the vote matrix.
A = np.hstack((A, Y_pred.reshape(-1, 1)))

In [31]:
# Combine the models: average each row of the vote matrix and round to the
# nearest integer. Rounding is half-to-even, so a row whose mean is exactly 0.5
# (an even split between 0 and 1 votes) becomes 0.
vote_share = A.mean(axis=1)
avg = np.rint(vote_share).astype(int)

In [32]:
# Two-column submission table: the test passenger ids and the ensemble's
# prediction for each, written without the index column.
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"]}).assign(Survived=avg)
submission.to_csv('improved.csv', index=False)