In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re

In [37]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Store our passenger ID for easy access
PassengerId = test['PassengerId']

In [38]:
full_data = [train, test]

# Gives the length of the name
train['Name_length'] = train['Name'].apply(len)
test['Name_length'] = test['Name'].apply(len)

In [39]:
# Credit to Sina for her Titanic Best Working Classifier!

# create family size variable
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

# If family size == 1, a person is traveling by his/herself and for that is counted as 'Alone'
for dataset in full_data:
    dataset['IsAlone'] = 0 # default to 0, meaning not alone
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1 # Check for no companions
    
# If no information about embarkment, assume they joined in S
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

# Use median imputation method for Fare column
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Impute missing Age data
for dataset in full_data:
    avg_age = dataset['Age'].mean()
    std_age = dataset['Age'].std()
    null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(avg_age - std_age, avg_age + std_age, size=null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)

# define regular expression function to extract a person's title
def extract_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # if a title was found:
    if title_search:
        return title_search.group(1)
    return ""

# Create a new Column for Title
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(extract_title)
    
# since there are quite a lot of rare titles (e.g. 'Countess'), we'll group them as 'rare'
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss') # Also replace french titles with common english abr.
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

# Now we have to transform everything categorical into numerical values by mapping
for dataset in full_data:
    
    # Transform gender into 0 for female and 1 for male
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    
    # Transform Age to a few categories
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[ (dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age' ] = 1
    dataset.loc[ (dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age' ] = 2
    dataset.loc[ (dataset['Age'] > 38) & (dataset['Age'] <= 64), 'Age' ] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age' ] = 4
    
    # Map titles to values
    title_map = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_map)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Map City of Embarkment to value
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [40]:
# Now we'll drop any variables we don't need for our predictions 
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
test  = test.drop(drop_elements, axis = 1)

In [41]:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier

In [42]:
y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
X_train = train.values # Creates an array of the train data
X_test = test.values # Creats an array of the test data

In [51]:
# Initialize classifiers
clf1 = RandomForestClassifier(n_estimators=500)
clf2 = ExtraTreesClassifier(n_estimators=500)
clf3 = AdaBoostClassifier(n_estimators=500, learning_rate=0.75)
clf4 = GradientBoostingClassifier(n_estimators=500)
svc = SVC(kernel='linear', C=0.025)

# initialize stacking classifier with all 3 classifiers + meta classifier (logistic regression)
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4],
                          meta_classifier=svc)

In [52]:
print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, clf4, sclf], 
                      ['Random Forest', 
                       'ExtraTrees', 
                       'AdaBoost',
                       'GradientBoost',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X_train, y_train, 
                                              cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.81 (+/- 0.05) [Random Forest]
Accuracy: 0.80 (+/- 0.04) [ExtraTrees]
Accuracy: 0.82 (+/- 0.02) [AdaBoost]
Accuracy: 0.82 (+/- 0.04) [GradientBoost]
Accuracy: 0.81 (+/- 0.05) [StackingClassifier]
