# Titanic: Machine Learning from Disaster

* Загрузим необходимые данные
* Выбрасывать данные с отсутствующими значениями — это не для нас, поэтому заполним пропуски:
  * Пропущенную цену билета заменим средней ценой по классу пассажира
  * Пропущенный порт посадки заменим модой
  * Пропущенные значения возраста восстановим с помощью регрессии (Random Forest)
  * Отбросим колонку с номером билета
* Введем дополнительную колонку с титулом человека
* Введем колонку «размер семьи» как сумму числа братьев/сестер/супругов (SibSp) и числа детей/родителей (Parch)
* Отнормируем числовые колонки

После всевозможных экспериментов лучше всего показал себя SVM с формулой

Survived ~ C(Pclass) + C(Sex) + FamilySize_scaled  + C(Embarked) + Fare_scaled + Age_scaled + C(Title)

дав на тестовой выборке результат 0.78947.

Вне конкурса, так как по заданию нельзя :), идет Random Forest с результатом в 0.79426

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from patsy import dmatrices
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.decomposition import PCA
import sklearn.preprocessing as preprocessing

In [2]:
# Load the Titanic training and test CSV files (relative paths).
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")
# Display the first rows of the training set.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [16]:
def setMissingAges(age_df):
    """Fill missing ``Age`` values in place with random-forest predictions.

    A regression of Age on class, sex, sibling/spouse and parent/child
    counts, port, fare, cabin letter and title is fitted on the rows whose
    age is known and then used to predict the rows whose age is missing.

    Parameters
    ----------
    age_df : pandas.DataFrame
        Must contain ``Age`` plus every column named in the formula below.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``age_df`` object, returned for call chaining.
    """
    # Track missing rows with an explicit mask instead of a magic value, so a
    # genuine age of 0 is never mistaken for "missing".
    missing = age_df['Age'].isnull().values
    if not missing.any():
        # Nothing to impute: skip the expensive fit and avoid calling
        # predict() on an empty matrix.
        return age_df

    formula = 'Age ~ C(Pclass) + C(Sex) + SibSp  + C(Embarked) + Fare + Parch + C(CabinLetter) + C(Title)'

    # Work on a positionally indexed copy: the caller's frame may carry
    # duplicated index labels (e.g. after concatenating train and test).
    work = age_df.reset_index(drop=True)
    # Placeholder so that patsy does not drop the rows we want to predict.
    work.loc[missing, 'Age'] = 0
    y, X = dmatrices(formula, data=work, return_type='dataframe')

    # Rows that survived patsy's NA handling, as positions in ``age_df``.
    rows = np.asarray(X.index)
    to_predict = missing[rows]
    if not to_predict.any():
        return age_df
    known_ages = np.asarray(y).ravel()

    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    rtr.fit(X[~to_predict], known_ages[~to_predict])

    age_col = age_df.columns.get_loc('Age')
    age_df.iloc[rows[to_predict], age_col] = rtr.predict(X[to_predict])
    return age_df

def prepare_data(dataFrame, scale=False):
    """Clean the passenger table and derive the modelling features.

    Steps, in order: drop ``Ticket``; fill missing ``Fare`` with the mean
    fare of the passenger's class; fill missing ``Embarked`` with the most
    frequent port; derive ``CabinLetter`` (integer code of the cabin's
    leading letters, unknown cabins are treated as ``'U0'``); derive
    ``Title`` from ``Name`` and merge rare titles; derive ``FamilySize`` as
    ``Parch + SibSp``; impute missing ages via ``setMissingAges``.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Needs the columns ``Ticket``, ``Fare``, ``Pclass``, ``Embarked``,
        ``Cabin``, ``Name``, ``Parch``, ``SibSp`` and whatever
        ``setMissingAges`` requires. The caller's frame is not modified
        (``drop`` returns a new frame).
    scale : bool, optional
        When true, also add standardised ``Age_scaled``, ``Fare_scaled``
        and ``FamilySize_scaled`` columns.

    Returns
    -------
    pandas.DataFrame
        The prepared frame.
    """
    dataFrame = dataFrame.drop("Ticket", axis=1)

    # Missing fare -> mean fare of the passenger's class. transform() yields
    # one value per row, so no lookup into a pivot table is needed; np.where
    # assigns positionally and is therefore safe with duplicated labels.
    class_mean_fare = dataFrame.groupby('Pclass')['Fare'].transform('mean')
    dataFrame['Fare'] = np.where(dataFrame['Fare'].isnull(),
                                 class_mean_fare, dataFrame['Fare'])

    # Missing port -> most frequent port. Take a single scalar so that ties
    # between modes cannot cause a length mismatch on assignment.
    embarked_modes = dataFrame['Embarked'].dropna().mode()
    if len(embarked_modes) > 0:
        dataFrame['Embarked'] = dataFrame['Embarked'].fillna(embarked_modes.iloc[0])

    # Unknown cabins get the placeholder 'U0', i.e. cabin letter 'U'.
    dataFrame['Cabin'] = dataFrame['Cabin'].fillna('U0')
    cabin_letters = re.compile(r"([a-zA-Z]+)")
    dataFrame['CabinLetter'] = dataFrame['Cabin'].map(
        lambda cabin: cabin_letters.search(cabin).group())
    dataFrame['CabinLetter'] = pd.factorize(dataFrame['CabinLetter'])[0]

    # The title is the text between ", " and the first "." in the name.
    title_pattern = re.compile(r", (.*?)\.")
    dataFrame['Title'] = dataFrame['Name'].map(
        lambda name: title_pattern.findall(name)[0])
    # Merge rare titles into a handful of groups.
    title_groups = {
        'Jonkheer': 'Master',
        'Ms': 'Miss', 'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Col': 'Sir',
        'Dona': 'Lady', 'the Countess': 'Lady',
    }
    dataFrame['Title'] = dataFrame['Title'].replace(title_groups)

    dataFrame['FamilySize'] = dataFrame.Parch + dataFrame.SibSp
    dataFrame = setMissingAges(dataFrame)

    if scale:
        for column in ('Age', 'Fare', 'FamilySize'):
            # StandardScaler expects a 2-D (n_samples, n_features) array;
            # each column is standardised independently.
            values = dataFrame[[column]].astype(float).values
            scaled = preprocessing.StandardScaler().fit_transform(values)
            dataFrame[column + '_scaled'] = scaled.ravel()

    return dataFrame

In [260]:
# Tag the test rows with a -1 label so train and test can be prepared together
# and separated again afterwards.
df_test['Survived'] = -1
full_df = pd.concat([df, df_test])
# prepare_data() already imputes the missing ages; wrapping it in a second
# setMissingAges() call would only refit the age model with nothing to predict.
prepared_df = prepare_data(full_df)

formula = 'Survived ~ C(Pclass) + C(Sex) + FamilySize  + C(Embarked) + Fare + Age + C(Title)'
y,X = dmatrices(formula, data=prepared_df, return_type='dataframe')

parameter_grid = {
#     "criterion": ["gini", "entropy"],
    "max_depth": [None,3,5,10],
    'max_features': [0.5, 0.7, 1.],
    'min_samples_leaf': [5,7,10,15,20]
}

# 10-fold cross-validated grid search over the labelled (training) rows only.
classifier = DecisionTreeClassifier()
grid_search = GridSearchCV(classifier, parameter_grid, n_jobs=-1, cv=10)
grid_search.fit(X[y.Survived != -1], np.asarray(y[y.Survived != -1]).ravel())
# Ten best parameter sets, ordered by mean validation score (element 1 of each entry).
best_params = sorted(grid_search.grid_scores_, key=lambda x: x[1], reverse=True)[:10]
best_params

[mean: 0.82716, std: 0.02364, params: {'max_features': 0.7, 'max_depth': 3, 'min_samples_leaf': 7},
 mean: 0.82379, std: 0.03196, params: {'max_features': 0.5, 'max_depth': 5, 'min_samples_leaf': 5},
 mean: 0.82379, std: 0.03973, params: {'max_features': 0.5, 'max_depth': 5, 'min_samples_leaf': 7},
 mean: 0.82379, std: 0.03531, params: {'max_features': 0.7, 'max_depth': 5, 'min_samples_leaf': 10},
 mean: 0.82267, std: 0.02460, params: {'max_features': 0.5, 'max_depth': 3, 'min_samples_leaf': 15},
 mean: 0.82267, std: 0.02338, params: {'max_features': 0.7, 'max_depth': 3, 'min_samples_leaf': 5},
 mean: 0.82267, std: 0.02169, params: {'max_features': 1.0, 'max_depth': 3, 'min_samples_leaf': 5},
 mean: 0.82155, std: 0.03214, params: {'max_features': 0.7, 'max_depth': 3, 'min_samples_leaf': 10},
 mean: 0.82155, std: 0.03199, params: {'max_features': 0.5, 'max_depth': 10, 'min_samples_leaf': 15},
 mean: 0.82043, std: 0.03415, params: {'max_features': 0.7, 'max_depth': 5, 'min_samples_leaf':

In [20]:
# Tag the test rows with a -1 label so train and test share one preparation pass.
df_test['Survived'] = -1
full_df = pd.concat([df, df_test])
prepared_df = prepare_data(full_df, scale=True)

formula = 'Survived ~ C(Pclass) + C(Sex) + FamilySize_scaled  + C(Embarked) + Fare_scaled + Age_scaled + C(Title)'
y, X = dmatrices(formula, data=prepared_df, return_type='dataframe')

# Candidate SVM settings: three kernels crossed with two gamma values.
parameter_grid = dict(kernel=["linear", "poly", "rbf"], gamma=[0.0, 3.0])

# 10-fold cross-validated search, fitted on the labelled (training) rows only.
classifier = svm.SVC()
grid_search = GridSearchCV(classifier, parameter_grid, n_jobs=-1, cv=10)
is_labelled = y.Survived != -1
grid_search.fit(X[is_labelled], np.asarray(y[is_labelled]).ravel())

# Top ten parameter sets by mean validation score (element 1 of each entry).
ranked_scores = sorted(grid_search.grid_scores_, key=lambda entry: entry[1], reverse=True)
best_params = ranked_scores[:10]
best_params

[mean: 0.83614, std: 0.03195, params: {'kernel': 'rbf', 'gamma': 0.0},
 mean: 0.82828, std: 0.03070, params: {'kernel': 'poly', 'gamma': 0.0},
 mean: 0.82716, std: 0.02975, params: {'kernel': 'linear', 'gamma': 0.0},
 mean: 0.82716, std: 0.02975, params: {'kernel': 'linear', 'gamma': 3.0},
 mean: 0.80022, std: 0.04349, params: {'kernel': 'rbf', 'gamma': 3.0},
 mean: 0.79798, std: 0.02805, params: {'kernel': 'poly', 'gamma': 3.0}]

In [24]:
# Tag the test rows with a -1 label so train and test share one preparation pass.
df_test['Survived'] = -1
full_df = pd.concat([df, df_test])
prepared_df = prepare_data(full_df, scale=True)

formula = 'Survived ~ C(Pclass) + C(Sex) + FamilySize_scaled  + C(Embarked) + Fare_scaled + Age_scaled + C(Title)'
y, X = dmatrices(formula, data=prepared_df, return_type='dataframe')

# Train on the labelled rows, then predict the rows still carrying the -1 label.
is_labelled = y.Survived != -1
classifier = svm.SVC(kernel="rbf", gamma=0.0)
classifier.fit(X[is_labelled], np.asarray(y[is_labelled]).ravel())
result = classifier.predict(X[~is_labelled])

# Replace the -1 labels in the test frame with the predictions and preview them.
df_test['Survived'] = result
df_test[['PassengerId','Survived']].head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [26]:
# Save the predictions as a two-column CSV with integer values and no index column.
submission = df_test[['PassengerId', 'Survived']].astype(int)
submission.to_csv('titanic_SVM.csv', index=False)

In [220]:
# Disabled experiment (kept commented out): PCA-transform the design matrix of
# the labelled rows, then grid-search a decision tree on the transformed features.
# df_test['Survived'] = -1
# full_df = pd.concat([df, df_test])
# prepared_df = setMissingAges(prepare_data(full_df))

# formula = 'Survived ~ C(Pclass) + C(Sex) + SibSp  + C(Embarked) + Fare + Age + Parch + C(CabinLetter) + C(Title)'
# y,X = dmatrices(formula, data=prepared_df, return_type='dataframe')

# pca = PCA()
# X_transformed = pca.fit_transform(X[y.Survived != -1],y[y.Survived != -1])
# # pcaDataFrame = d.DataFrame(X_transformed)

# parameter_grid = {
# #     "criterion": ["gini", "entropy"],
#     "max_depth": [None, 3, 5, 10],
#     'max_features': [0.5, 1.],
#     'min_samples_leaf': [1,3,5,7,10,15,20]
# }

# classifier = DecisionTreeClassifier()                                               
# grid_search = GridSearchCV(classifier, parameter_grid, n_jobs=-1, cv=10)
# grid_search.fit(X_transformed, np.asarray(y[y.Survived != -1]).ravel())
# best_params = sorted(grid_search.grid_scores_, key=lambda x: x[1], reverse=True)[:10]
# best_params