In [21]:
import pandas

# Load the Kaggle Titanic training and test tables, in that order.
titanic, test = (
    pandas.read_csv(csv_path)
    for csv_path in ("kaggle_titanic/train.csv", "kaggle_titanic/test.csv")
)

In [22]:
# preprocessing

import re
from sklearn import preprocessing

def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Args:
        name: Passenger name string, e.g. "Braund, Mr. Owen Harris".

    Returns:
        The matched title without the trailing period, or "" when the name
        contains no such pattern.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence,
    # which newer Python versions warn about.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

def get_age_group(age):
    """Bucket an age into one of three ordinal groups.

    Returns 0 for ages below 15, 1 for ages from 15 up to (but excluding) 50,
    and 2 for everything else. A value that compares False against both
    bounds (such as NaN) also ends up in group 2.
    """
    for group, upper_bound in enumerate((15, 50)):
        if age < upper_bound:
            return group
    return 2

# Categorical columns that are replaced by integer codes at the end of this cell.
train_columns_to_encode = ['Sex', 'Embarked', 'Title']
columns_labels = preprocessing.LabelEncoder()

# Missing ages become the median age; every age is then bucketed by get_age_group.
titanic["Age"] = (
    titanic["Age"]
    .fillna(titanic["Age"].median())
    .apply(get_age_group)
)
# Missing embarkation values are filled with "S".
titanic["Embarked"] = titanic["Embarked"].fillna("S")

# new features
titanic["FamilySize"] = titanic["SibSp"].add(titanic["Parch"])
titanic["NameLength"] = titanic["Name"].apply(len)
titanic["Title"] = titanic["Name"].apply(get_title)

# The one encoder object is refit on each column in turn, so after the loop it
# only holds the classes of the last column in the list.
for col in train_columns_to_encode:
    encoded_values = columns_labels.fit_transform(titanic[col])
    titanic[col] = encoded_values
    

In [23]:
# feature extraction

# `sklearn.cross_validation` was removed from scikit-learn; its replacement,
# `sklearn.model_selection`, offers the same `cross_val_score` function.
# Bind whichever is available to the name `cross_validation` so the rest of
# the notebook works on both old and current scikit-learn releases.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, f_classif

# Columns fed to the classifiers.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title"]

# Univariate feature selection, currently disabled:
# selector = SelectKBest(f_classif, k=5)
# selector.fit(titanic[predictors], titanic["Survived"])


In [34]:
# classification and cross validation

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
import numpy as np
from matplotlib import pyplot as plt

# alg = LogisticRegression(random_state=1)

# Sweep the ensemble size and record the mean cross-validation score (cv=3)
# obtained for each value.
n_estimators_grid = list(range(10, 200, 10))
score_means = []
for i in n_estimators_grid:
    # Fixed seed so the sweep yields the same scores on every run.
    alg = AdaBoostClassifier(n_estimators=i, random_state=1)
#     alg = KNeighborsClassifier(n_neighbors=i)
#     alg = RandomForestClassifier(random_state=0, n_estimators=i, min_samples_split=4, min_samples_leaf=2)
    scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    score_means.append(scores.mean())

# `alg` is used after this cell to fit and predict. Leave it configured with
# the best-scoring setting instead of whatever the loop happened to end on
# (ties resolve to the smallest ensemble).
best_n_estimators = n_estimators_grid[score_means.index(max(score_means))]
alg = AdaBoostClassifier(n_estimators=best_n_estimators, random_state=1)

# Plot against the actual n_estimators values rather than the list index.
plt.plot(n_estimators_grid, score_means)
plt.xlabel("n_estimators")
plt.ylabel("mean CV score (cv=3)")
print(score_means)


[0.80695847362514039, 0.80808080808080807, 0.81593714927048255, 0.8125701459034792, 0.8136924803591471, 0.81032547699214375, 0.81032547699214363, 0.80583613916947261, 0.80695847362514028, 0.80808080808080807, 0.80920314253647596, 0.81144781144781142, 0.80808080808080807, 0.81144781144781142, 0.80695847362514039, 0.81032547699214375, 0.81032547699214375, 0.81032547699214375, 0.8125701459034792]


In [12]:
# processing test data 

# NOTE: like the training preprocessing, this cell rewrites columns in place
# and is meant to be run once on the freshly loaded `test` frame.

# Fill missing values. Age uses the test set's own median because
# titanic["Age"] has already been replaced by age-group codes at this point.
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Embarked"] = test["Embarked"].fillna("S")
test["Fare"] = test["Fare"].fillna(titanic["Fare"].median())

# new features -- mirror the training-set feature engineering
test["FamilySize"] = test["SibSp"] + test["Parch"]
test["NameLength"] = test["Name"].apply(len)
test["Title"] = test["Name"].apply(get_title)
# The model is trained on age-group codes, so the test ages must be bucketed
# the same way before prediction.
test["Age"] = test["Age"].apply(get_age_group)

# Encode categoricals with the *training* vocabulary. Refitting the encoder on
# the test data would assign different integers to the same label whenever the
# two sets do not contain exactly the same values (e.g. rare titles). The
# training columns have already been overwritten with codes, so the raw labels
# are rebuilt from the CSV using the same fill/derive steps as for training.
train_raw = pandas.read_csv("kaggle_titanic/train.csv")
train_raw["Embarked"] = train_raw["Embarked"].fillna("S")
train_raw["Title"] = train_raw["Name"].apply(get_title)

for col in train_columns_to_encode:
    train_encoder = preprocessing.LabelEncoder().fit(train_raw[col])
    code_by_label = {label: code for code, label in enumerate(train_encoder.classes_)}
    # Labels never seen in training have no code; mark them with -1.
    test[col] = test[col].map(code_by_label).fillna(-1).astype(int)


In [13]:
# predict test data

# Fit on the full training set, then label every test passenger.
predictions = alg.fit(titanic[predictors], titanic["Survived"]).predict(test[predictors])

# Two-column submission frame: passenger id and predicted label.
submission = pandas.DataFrame(
    {"PassengerId": test["PassengerId"], "Survived": predictions}
)
submission.to_csv("kaggle_titanic/submission.csv", index=False)