In [1356]:
import numpy as np
import pandas as pd
import os

from sklearn.neighbors import KNeighborsClassifier

TITANIC_PATH = os.path.join("datasets", "titanic")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [1357]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [1358]:
print(test_data.info())
print(train_data.shape, test_data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None
((891, 12), (418, 11))


In [1359]:
from sklearn import preprocessing
def preprocess(data):
    le = preprocessing.LabelEncoder()
    cols = ['Ticket', 'Cabin']
    data = data.drop(cols, axis=1)
    data["Age"]=le.fit_transform(data["Age"])
    data["Sex"]=le.fit_transform(data["Sex"])
    data["Embarked"]=le.fit_transform(data["Embarked"])
    data["Fare"] = le.fit_transform(data["Fare"])
    data['Title']=0
    for pers in data:
        data['Title'] = data.Name.str.extract('([A-Za-z]+)\.')
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    #data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    print(data.Title.unique())
    data['Title'] = le.fit_transform(data['Title'])
    data = data.drop('Name', axis=1)
    return data

train_data = preprocess(train_data)
test_data = preprocess(test_data)

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Major' 'Lady' 'Sir' 'Col'
 'Capt' 'Countess' 'Jonkheer']
['Mr' 'Mrs' 'Miss' 'Master' 'Col' 'Rev' 'Dr' 'Dona']


In [1360]:
train_data.Title.unique()

array([10, 11,  9,  8,  3, 12,  4,  7,  6, 13,  1,  0,  2,  5])

In [1361]:
X_test  = test_data.drop('PassengerId',axis=1)
X_train = train_data.drop(['Survived','PassengerId'], axis=1)
y_train = train_data['Survived'].values

In [1362]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.2)
all_features = X_train1.columns
all_transformer = Pipeline(steps = [
    ('stdscaler', StandardScaler())
])

all_preprocess = ColumnTransformer(
    transformers = [
        ('allfeatures', all_transformer, all_features),
    ]
)

classifiers = [LogisticRegression(),RandomForestClassifier(random_state=42),KNeighborsClassifier()]
first_round_scores = {}
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', all_preprocess),
                      ('classifier', classifier)])
    pipe.fit(X_train1, y_train1)
    print(classifier)
    score = pipe.score(X_test1, y_test1)
    first_round_scores[classifier.__class__.__name__[:10]] = score
    print("model score: %.3f" % score)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
model score: 0.816
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
model score: 0.810
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
model score: 0.771


In [1363]:
BestClassifier = classifiers[0]
BestClassifier.fit(X_train,y_train)
y_pred = BestClassifier.predict(X_test)

In [1364]:
submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": y_pred
    })
submission.to_csv('./output/submission.csv', index=False)

In [1365]:
correct_answers = load_titanic_data('gender_submission.csv',os.path.join("output"))

In [1366]:
a = submission[submission.eq(correct_answers).all(axis=1) == True]
print (a.shape[0])
print(float(a.shape[0])/submission.shape[0])

408
0.976076555024
