In [1]:
import csv
import warnings

import numpy as np
import pandas
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore') # turn off warnings

Load the dataset and split it into train and test sets

In [2]:
# Labelled passengers: features plus the "Survived" column.
dataset = pandas.read_csv("datasets/Titanic_train.csv")
# set without target values
eval_set = pandas.read_csv("datasets/Titanic_test.csv")

# Pull the label out first, then keep only the feature columns.
target = dataset['Survived'].copy()
dataset = dataset.drop(columns="Survived")

# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    target,
    test_size=0.2,
    random_state=42,
)

Prepare the data: fill missing values, encode categorical columns as integers, and add engineered features

In [3]:
def data_preparation(X):
    """Return a model-ready copy of a raw passenger frame.

    The input frame is left untouched; a new DataFrame is returned.

    Transformations applied:
      * ``Has_cabin``   - 1 when ``Cabin`` holds a string, otherwise 0.
      * ``Embarked``    - missing values become ``'S'``, then S/C/Q map to 0/1/2.
      * ``Fare``/``Age``- missing values are replaced by the median of the
                          column *in the frame passed in* (each split therefore
                          uses its own median).
      * ``Sex``         - 1 for ``"male"``, 0 for anything else.
      * ``Name_length`` - number of characters in ``Name``.
      * ``PassengerId``, ``Name``, ``Ticket``, ``Cabin`` and ``SibSp`` are dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns Cabin, Embarked, Fare, Age, Sex, Name,
        PassengerId, Ticket and SibSp.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining original columns (in their original
        order) followed by ``Has_cabin`` and ``Name_length``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``Embarked`` contains a value other than S, C, Q or missing; the
        unmapped entry becomes NaN and cannot be cast to int.
    """
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
    dropped_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']

    # ``assign`` builds a new frame, so the caller's DataFrame is never mutated.
    # Existing columns keep their position; new ones are appended in the order
    # given here (Has_cabin before Name_length).
    prepared = X.assign(
        Has_cabin=X["Cabin"].apply(lambda cabin: int(isinstance(cabin, str))),
        Embarked=X["Embarked"].fillna('S').map(embarked_codes).astype(int),
        Fare=X["Fare"].fillna(X["Fare"].median()),
        Age=X["Age"].fillna(X["Age"].median()),
        Sex=(X["Sex"] == "male").astype(int),
        Name_length=X["Name"].apply(len),
    )

    return prepared.drop(columns=dropped_columns)

# Run the same preparation on both splits, each from its own copy of the raw frame.
X_train, X_test = (
    data_preparation(split.copy()) for split in (X_train, X_test)
)

# Preview the prepared training features.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Has_cabin,Name_length
331,1,1,45.5,0,28.5,0,1,19
733,2,1,23.0,0,13.0,0,0,26
382,3,1,32.0,0,7.925,0,0,18
704,3,1,26.0,0,7.8542,0,0,23
813,3,0,6.0,2,31.275,0,0,34


Function that returns the list of named models used by the ensemble

In [4]:
def models_list(random_state=42):
    """Build the named base estimators for the voting ensemble.

    Every estimator receives the same ``random_state`` so that repeated runs
    of the notebook produce the same fitted models. Previously only the two
    forest models were seeded; the SVC (with ``probability=True``), AdaBoost
    and gradient-boosting models were not.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed forwarded to every estimator. The default matches the seed the
        forest models already used. Pass ``None`` for unseeded models.

    Returns
    -------
    list of (str, estimator) tuples
        Unfitted estimators paired with short names, in the form accepted by
        ``VotingClassifier(estimators=...)``.
    """
    rf = RandomForestClassifier(
        n_estimators=1000, max_leaf_nodes=16, n_jobs=-1, random_state=random_state
    )
    ex_tree = ExtraTreesClassifier(random_state=random_state)
    # probability=True is what gives the SVC a predict_proba for soft voting.
    svc = SVC(probability=True, random_state=random_state)
    # Depth-1 trees (decision stumps) as the boosted weak learner.
    ada = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1), learning_rate=0.5, random_state=random_state
    )
    gboost = GradientBoostingClassifier(
        max_depth=2, n_estimators=3, random_state=random_state
    )

    return [("rf", rf), ("ex_tree", ex_tree), ("svc", svc), ("ada", ada), ("gboost", gboost)]

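Fit a soft voting classifier and measure its accuracy on the held-out test split (`GridSearchCV` is imported, but no grid search is run in this cell)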

In [5]:
# Build the ensemble from the base models and fit it in one expression
# (``fit`` returns the fitted estimator itself).
voting_clf = VotingClassifier(
    estimators=models_list(),
    voting='soft',
).fit(X_train, y_train)

# Predict the held-out split and report the share of correct predictions.
y_pred = voting_clf.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.8324022346368715

Run the model on the unlabeled evaluation set and save the predictions to a CSV file

In [6]:
# Prepare a copy of the unlabeled evaluation set with the same function used for
# the training data, so ``eval_set`` keeps its PassengerId column.
eval_set_modified = eval_set.copy().pipe(data_preparation)

# Predictions from the fitted ensemble, one per evaluation row.
y_pred = voting_clf.predict(X=eval_set_modified)

In [8]:
# Pair every evaluation passenger id with its predicted label.
submission_rows = zip(eval_set['PassengerId'], y_pred)

# newline='' hands line-ending control to the csv module (otherwise rows end in
# '\r\r\n' on Windows), and the with-block closes the file even if a write fails.
with open('datasets/submit.csv', 'w', newline='') as submit_file:
    writer = csv.writer(submit_file)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(submission_rows)