In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the train and test CSVs, forcing the Age column to float64 in both.
age_dtype = {"Age": np.float64}
train_data = pd.read_csv("../ML Titanic/train.csv", dtype=age_dtype)
test_data = pd.read_csv("../ML Titanic/test.csv", dtype=age_dtype)

In [3]:
import re

from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor

# Functions
def binarizer(x, one, zero):
    """Map ``x`` to 1 if it equals ``one``, to 0 if it equals ``zero``, otherwise None."""
    # ``one`` is compared first, so it wins when ``one`` and ``zero`` are equal.
    for label, bit in ((one, 1), (zero, 0)):
        if x == label:
            return bit
    return None

def cabin_count(x):
    """Count the single-space-separated tokens of ``x`` that are not the literal '0'."""
    return sum(1 for token in x.split(' ') if token != '0')

def age_impute(row, reference=None):
    """Return the row's Age, or a group mean when the age is missing.

    An age counts as missing when it is NaN/None or exactly 0. In that case the
    mean of the non-null ages of all ``reference`` rows sharing the row's
    ``Pclass`` and ``Sex`` is returned.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Age', 'Pclass' and 'Sex'.
    reference : pandas.DataFrame, optional
        Frame with 'Age', 'Pclass' and 'Sex' columns used for the group mean.
        Defaults to the module-level ``train_data``.

    Returns
    -------
    float
        The original age, or the group mean (NaN if the group has no known ages).
    """
    age = row['Age']
    # NaN != 0 evaluates to True, so a bare "!= 0" check hands NaN straight back
    # and never imputes; test for null explicitly as well as for the 0 sentinel.
    if not (pd.isnull(age) or age == 0):
        return age

    if reference is None:
        reference = train_data

    same_group = (reference.Age.notnull()
                  & (reference.Pclass == row['Pclass'])
                  & (reference.Sex == row['Sex']))
    return reference.Age[same_group].mean()

def get_title(name):
    """Extract the title (e.g. 'Mr', 'Countess') from a passenger name.

    A title is the first run of ASCII letters that is preceded by a space and
    followed by a period. Returns an empty string when no such run exists.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

In [4]:
# Stack the train and test rows so every transform below is applied to both.
combined2 = pd.concat([train_data, test_data], axis=0)

# Assign the filled columns back to the frame. Calling fillna(inplace=True) or item
# assignment on an attribute-accessed column is chained assignment: it raises
# SettingWithCopyWarning and may leave combined2 itself unchanged.
combined2['Embarked'] = combined2['Embarked'].fillna('S')

# Missing fares take the median of the known fares (Series.median skips NaN).
combined2['Fare'] = combined2['Fare'].fillna(combined2['Fare'].median())

combined2['Title'] = combined2['Name'].apply(get_title)
# Integer category per title; several titles deliberately share a code.
# Titles missing from this mapping become NaN in TitleCat.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 7, "Dona":10, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 7, "Capt": 7, "Ms": 2}
combined2['TitleCat'] = combined2['Title'].map(title_mapping)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# Preview the first rows after the Embarked/Fare fills and the Title/TitleCat columns.
combined2.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,TitleCat
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Mr,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Mrs,3
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Miss,2
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Mrs,3
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Mr,1


In [6]:
# Inspect the integer category codes of each cabin's first character
# ('0' stands in for a missing cabin).
pd.Categorical(combined2.Cabin.fillna('0').apply(lambda x: x[0])).codes

array([0, 3, 0, ..., 0, 0, 0], dtype=int8)

In [7]:
# Fill missing cabins with the placeholder '0' by assigning the column back; calling
# fillna(inplace=True) on combined2.Cabin is chained assignment and may leave the
# frame unchanged.
combined2['Cabin'] = combined2['Cabin'].fillna('0')

# First character of the cabin string ('0' when the cabin was missing), stored both
# as an integer category code and as the raw character.
cabin_block = combined2['Cabin'].str[0]
combined2['CabinCat'] = pd.Categorical(cabin_block).codes
combined2['CabinBlock'] = cabin_block

combined2['EmbarkedCat'] = pd.Categorical(combined2['Embarked']).codes

# errors='ignore' keeps the cell re-runnable once Ticket has already been removed.
combined2 = combined2.drop(['Ticket'], axis=1, errors='ignore')

In [8]:
# Preview after the cabin/embarked encodings were added and Ticket was dropped.
combined2.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Title,TitleCat,CabinCat,CabinBlock,EmbarkedCat
0,22.0,0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,Mr,1,0,0,2
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,Mrs,3,3,C,0
2,26.0,0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,Miss,2,0,0,2
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,Mrs,3,3,C,2
4,35.0,0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,Mr,1,0,0,2


In [9]:
# One-hot encode the categorical columns (prefixing each dummy with its source column
# name) and append them to combined2, with Survived moved to the last position.
dummy_frames = [
    pd.get_dummies(combined2[column], prefix=column)
    for column in ('Sex', 'Embarked', 'Pclass', 'TitleCat', 'CabinBlock')
]
full_data = pd.concat(
    [combined2.drop(['Survived'], axis=1)] + dummy_frames + [combined2['Survived']],
    axis=1
)

In [10]:
# Show only the first ten rows instead of rendering the entire frame.
full_data.head(10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,CabinBlock_0,CabinBlock_A,CabinBlock_B,CabinBlock_C,CabinBlock_D,CabinBlock_E,CabinBlock_F,CabinBlock_G,CabinBlock_T,Survived
0,22.0,0,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,...,1,0,0,0,0,0,0,0,0,0.0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,0,0,1,0,0,0,0,0,1.0
2,26.0,0,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,...,1,0,0,0,0,0,0,0,0,1.0
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,0,0,0,1,0,0,0,0,0,1.0
4,35.0,0,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,...,1,0,0,0,0,0,0,0,0,0.0
5,,0,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,...,1,0,0,0,0,0,0,0,0,0.0
6,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,...,0,0,0,0,0,1,0,0,0,0.0
7,2.0,0,S,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,male,3,...,1,0,0,0,0,0,0,0,0,0.0
8,27.0,0,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,...,1,0,0,0,0,0,0,0,0,1.0
9,14.0,0,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,...,1,0,0,0,0,0,0,0,0,1.0


In [11]:
# FamilySize is SibSp + Parch; the passenger's own row is not added to the count.
full_data['FamilySize'] = full_data['SibSp'] + full_data['Parch']
# Length in characters of the raw Name string.
full_data['NameLength'] = full_data['Name'].apply(len)

full_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,CabinBlock_B,CabinBlock_C,CabinBlock_D,CabinBlock_E,CabinBlock_F,CabinBlock_G,CabinBlock_T,Survived,FamilySize,NameLength
0,22.0,0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,0,0,0,0,0,0,0,0.0,1,23
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,1,0,0,0,0,0,1.0,1,51
2,26.0,0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,0,0,0,0,0,0,1.0,0,22
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,0,1,0,0,0,0,0,1.0,1,44
4,35.0,0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,...,0,0,0,0,0,0,0,0.0,0,24


In [12]:
import operator

# Maps "<surname><FamilySize>" keys to sequential integer ids; filled lazily by
# get_family_id as new keys are encountered.
family_id_mapping = {}

def get_family_id(row):
    """Return the integer family id for a row, allocating a new id on first sight.

    The key is the surname (the text before the first comma in ``row["Name"]``)
    concatenated with ``row["FamilySize"]``. Ids start at 1 and increase by one
    for every previously unseen key. Mutates ``family_id_mapping``.
    """
    last_name = row["Name"].split(",")[0]
    family_id = "{0}{1}".format(last_name, row["FamilySize"])

    if family_id not in family_id_mapping:
        # Ids are handed out as 1, 2, 3, ... so the next free id is always
        # len(mapping) + 1; this avoids rescanning the whole mapping for its
        # maximum every time a new family appears.
        family_id_mapping[family_id] = len(family_id_mapping) + 1
    return family_id_mapping[family_id]

# Compute a family id for every row (row-wise apply; fills family_id_mapping as a side effect).
family_ids = full_data.apply(get_family_id, axis=1)

In [13]:
# Number of distinct family ids that were assigned.
len(family_ids.unique())

928

In [14]:
# There are a lot of family ids, so every row whose FamilySize (SibSp + Parch) is
# below 3 is collapsed into the single code -1. FamilySize does not count the
# passenger, so this covers groups of up to three people including the passenger.
family_ids[full_data["FamilySize"] < 3] = -1

full_data["FamilyId"] = family_ids

In [15]:
# Ages strictly below this threshold are labelled 'child'.
child_age = 14

def get_person(passenger):
    """
    Classify an (age, sex) pair as 'child', 'female_adult' or 'male_adult'.

    Anyone younger than ``child_age`` is a 'child' regardless of sex. A missing
    (NaN) age compares False against the threshold, so such passengers are
    labelled by sex alone.
    """
    age, sex = passenger

    if age < child_age:
        return 'child'
    return 'female_adult' if sex == 'female' else 'male_adult'
    
# Label every passenger from (Age, Sex). Plain column assignment overwrites 'person'
# when the cell is re-run, whereas concatenating a new frame would append a second,
# duplicate 'person' column each time.
# NOTE(review): Age is only imputed further down the notebook, so rows whose Age is
# still missing here are labelled by sex alone.
full_data['person'] = full_data[['Age', 'Sex']].apply(get_person, axis=1)


In [16]:
# List every column currently present in full_data.
full_data.columns.values

array(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Title', 'TitleCat', 'CabinCat',
       'CabinBlock', 'EmbarkedCat', 'Sex_female', 'Sex_male', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'TitleCat_1', 'TitleCat_2', 'TitleCat_3', 'TitleCat_4',
       'TitleCat_5', 'TitleCat_6', 'TitleCat_7', 'TitleCat_8',
       'TitleCat_10', 'CabinBlock_0', 'CabinBlock_A', 'CabinBlock_B',
       'CabinBlock_C', 'CabinBlock_D', 'CabinBlock_E', 'CabinBlock_F',
       'CabinBlock_G', 'CabinBlock_T', 'Survived', 'FamilySize',
       'NameLength', 'FamilyId', 'person'], dtype=object)

In [17]:
# One-hot encode the person label; the new columns are named after the label values.
dummies = pd.get_dummies(full_data.person)
full_data = pd.concat((full_data, dummies), axis=1)
full_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,CabinBlock_G,CabinBlock_T,Survived,FamilySize,NameLength,FamilyId,person,child,female_adult,male_adult
0,22.0,0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,0,0,0.0,1,23,-1,male_adult,0,0,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,0,1.0,1,51,-1,female_adult,0,1,0
2,26.0,0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,0,1.0,0,22,-1,female_adult,0,1,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,0,0,1.0,1,44,-1,female_adult,0,1,0
4,35.0,0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,...,0,0,0.0,0,24,-1,male_adult,0,0,1


In [18]:
def process_surname(nm):
    """Return the lower-cased text before the first comma of ``nm`` (all of ``nm`` if there is no comma)."""
    surname, _, _ = nm.partition(',')
    return surname.lower()

# Lower-cased surname of every passenger, taken from the Name column.
full_data['surname'] = full_data.Name.map(process_surname)

In [19]:
# Distinct surnames of rows flagged female_adult with Survived == 0 and
# either Parch > 0 or SibSp > 0.
set(full_data[(full_data.female_adult == 1.0) &
                                     (full_data.Survived == 0.0) &
                                     ((full_data.Parch > 0) | (full_data.SibSp > 0))]['surname'].values)

{'ahlin',
 'allison',
 'andersson',
 'arnold-franchi',
 'barbara',
 'boulos',
 'bourke',
 'caram',
 'carter',
 'danbom',
 'ford',
 'goodwin',
 'ilmakangas',
 'johnston',
 'jussila',
 'lahtinen',
 'lefebre',
 'lobb',
 'palsson',
 'panula',
 'rice',
 'robins',
 'rosblom',
 'sage',
 'skoog',
 'strom',
 'turpin',
 'van impe',
 'vander planke',
 'zabour'}

In [20]:
def _surnames_matching(person_flag, survived):
    """Distinct surnames of rows with ``person_flag`` == 1, the given Survived value, and Parch > 0 or SibSp > 0."""
    with_relatives = (full_data.Parch > 0) | (full_data.SibSp > 0)
    selected = (full_data[person_flag] == 1.0) & (full_data.Survived == survived) & with_relatives
    return list(set(full_data[selected]['surname'].values))


# NOTE(review): both flags below are derived from Survived and are then set on the
# same training rows whose labels produced them, so they leak the target into the
# training features.
perishing_female_surnames = _surnames_matching('female_adult', 0.0)

def perishing_mother_wife(passenger):
    """Return 1.0 when the passenger's surname is in ``perishing_female_surnames``, else 0.0."""
    # Pclass and person are part of the passed triple but do not affect the result.
    surname, _pclass, _person = passenger
    return float(surname in perishing_female_surnames)

full_data['perishing_mother_wife'] = full_data[['surname', 'Pclass', 'person']].apply(perishing_mother_wife, axis=1)


surviving_male_surnames = _surnames_matching('male_adult', 1.0)

def surviving_father_husband(passenger):
    """Return 1.0 when the passenger's surname is in ``surviving_male_surnames``, else 0.0."""
    # Pclass and person are part of the passed triple but do not affect the result.
    surname, _pclass, _person = passenger
    return float(surname in surviving_male_surnames)

full_data['surviving_father_husband'] = full_data[['surname', 'Pclass', 'person']].apply(surviving_father_husband, axis=1)


In [27]:
# Distribution of the surviving_father_husband flag over all rows.
full_data['surviving_father_husband'].value_counts()

0.0    1253
1.0      56
Name: surviving_father_husband, dtype: int64

In [28]:
# Distribution of the perishing_mother_wife flag over all rows.
full_data['perishing_mother_wife'].value_counts()

0.0    1187
1.0     122
Name: perishing_mother_wife, dtype: int64

In [21]:
# Feature columns used to predict a missing Age.
classers = ['male_adult', 'female_adult', 'child','perishing_mother_wife','surviving_father_husband','Fare','Parch','Pclass','SibSp','TitleCat','CabinCat','Sex_female','Sex_male', 'EmbarkedCat', 'FamilySize', 'NameLength', 'FamilyId']

# Fixed random_state so the imputed ages (and everything trained on them) are
# reproducible across kernel restarts.
age_et = ExtraTreesRegressor(n_estimators=200, random_state=0)

# Rows with a known Age train the regressor; rows without one are predicted.
age_known = full_data.Age.notnull()
X_train = full_data.loc[age_known,classers]
Y_train = full_data.loc[age_known,['Age']]
X_test = full_data.loc[~age_known,classers]

In [22]:
# Flatten the single-column Age target into a 1-D array to inspect it.
np.ravel(Y_train)

array([ 22. ,  38. ,  26. , ...,  28. ,  39. ,  38.5])

In [23]:
# Fit the age regressor on the rows with a known Age, then predict the missing ones.
age_et.fit(X_train, Y_train['Age'].values)
age_preds = age_et.predict(X_test)

In [24]:
# Write the predicted ages into the rows whose Age is still missing.
full_data.loc[full_data['Age'].isnull(), 'Age'] = age_preds

In [25]:
# Feature columns fed to the survival classifier.
model_dummys = ['Age','male_adult', 'female_adult', 'child','perishing_mother_wife','surviving_father_husband','Fare','Parch','Pclass','SibSp','TitleCat','CabinCat','Sex_female','Sex_male', 'EmbarkedCat', 'FamilySize', 'NameLength', 'FamilyId']

# Despite its name, model_gb is a random forest. random_state makes the predictions
# reproducible; n_jobs=-1 builds the 20000 trees on all available cores.
model_gb = RandomForestClassifier(n_estimators=20000, min_samples_leaf=4,
                                  class_weight={0:0.69,1:0.31},
                                  random_state=0, n_jobs=-1)

# train_data was stacked first when the frames were combined, so the first
# len(train_data) rows are the labelled ones; avoids hard-coding the row count.
n_train = len(train_data)

X_data = full_data.iloc[:n_train,:]
X_train = X_data.loc[:,model_dummys]

Y_data = X_data
Y_train = Y_data.loc[:,['Survived']]

X_t_data = full_data.iloc[n_train:,:]
X_test = X_t_data.loc[:,model_dummys]


model_gb.fit(X_train, np.ravel(Y_train))
# Predictions come back as float class labels; the submission file wants integers.
model_results = model_gb.predict(X_test).astype(int)

submission = pd.DataFrame(
    {'Survived': model_results},
    index=pd.Index(X_t_data['PassengerId'].values, name='PassengerId'),
)

submission.to_csv('submission_x.csv')

# Last expression of the cell, so the preview is actually displayed.
submission.head(3)
