In [1]:
# This is a machine learning project for the titanic Kaggle competition. 
# https://www.kaggle.com/c/titanic 
# The goal is to predict whether a passenger survived using the competition data which are mostly socioeconomical. 
# The training set must be used to train the machine learning instance while the test set is used to obtain the final score.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display  # This allow to display dataframe even if they are not the last thing of their cell.

# Read data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine_df = train_df.append(test_df)

# Print features
print(list(combine_df))
# Show small sample of data
combine_df.head()

['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket']


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [2]:
# See missing data and data type
train_df.info()
print('_'*40)
test_df.info()

# There are 891 training cases and 418 test cases. Age and cabin have a large number of NaNs. 
# Embarked has 2 NaN in the training data and fare has 1 in the testing data.
# All those NaN represents unknown values, XGBoost (the most used machine learning tools in Kaggle competition) can handdle them
# by itself, but other machine learning classifier needs that those values be replaced or removed.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null

In [3]:
# information about the distribution of rhw numerical data
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# information about categorical data
train_df.describe(include=['O'])
# Ticket and cabin have a very large number of unique cases. They can't be used at is. Embarked has 3 cases, thus can be used
# as dummy variables, while Sex can be one dummy variable.

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Mangan, Miss. Mary",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [8]:
# Study ticket feature
tmp = train_df.set_index('Ticket').sort_index()
print(tmp['Fare'])

# Ticket numbers and their fare are repeated. The best explanation is that when someone buy places for a group of people, there
# is only one ticket, which has only one fare which is for the group. Thus, a more representative informartion might be the fare
# per person. It is also possible that giving different ponderation for men, women and children would give a better predictor.

Ticket
110152               86.5000
110152               86.5000
110152               86.5000
110413               79.6500
110413               79.6500
110413               79.6500
110465               52.0000
110465               52.0000
110564               26.5500
110813               75.2500
111240               33.5000
111320               38.5000
111361               57.9792
111361               57.9792
111369               30.0000
111426               26.5500
111427               26.5500
111428               26.5500
112050                0.0000
112052                0.0000
112053               30.0000
112058                0.0000
112059                0.0000
112277               31.0000
112379               39.6000
113028               26.5500
113043               28.5000
113050               26.5500
113051               27.7500
113055               26.5500
                      ...   
STON/O 2. 3101274     7.1250
STON/O 2. 3101275     7.1250
STON/O 2. 3101280     7.9250
STON/O 

In [9]:
# Count number of people with same tickets.
combine_df = combine_df.join(combine_df.groupby(by='Ticket').size().rename('Ticket_count'),on='Ticket')
# Fare most likely means fare of the ticket rather than fare per person. 
# So I need to create this feature as it might be more useful than fare
combine_df['FarePerPerson']=combine_df['Fare']/combine_df['Ticket_count']
# The sum of Sibling, spouse, parent and children might be more useful than having Sibling and spouse, then parent and children
combine_df['Family'] = combine_df['SibSp'] + combine_df['Parch']
# One feature is enough for sex, no need to break into two when doing dummies.
combine_df['Sex'] = combine_df['Sex'].map( {'female': 1, 'male': 0} )
# PClass is not a real number, let's transform it into dummies.
combine_df['Pclass'] = combine_df['Pclass'].replace({1 : '1st',2 : '2nd',3 : '3rd'})

# Create title row. I will transform it with get dummies after that
# Titles in names are certainly useful, but there are too many instances. So it is useful to combine the rare titles together.
# Also titles related to unmarried women (Miss., Mlle.) should be together, idem for those of married women (Mrs. Mme.). 
# Ms. which is for unknown marital status may go either way.
def fun0(x):
    tmp = (x.split(',')[1]).split()[0]
    if tmp in ['Mr.','Master.','Dr.','Rev.']:
        return tmp
    elif tmp in ['Miss.', 'Mlle.', 'Ms.']:
        return 'Miss.'
    elif tmp in ['Mrs.', 'Mme.']:
        return 'Mrs.'
    else:
        return 'Other'
combine_df['title'] = combine_df['Name'].apply(fun0)

In [10]:
# Create same ticket survival
# People are more likely to die if the people who shared their ticket died.
# Especially if there are men and some women or children who shared their ticket died. It's not so true in the opposite direction
# since so many men died.
tmp1 = (combine_df.groupby(by='Ticket').count())['Survived']
tmp2 = (combine_df.groupby(by='Ticket').sum())['Survived']
combine_df['tmp1'] = combine_df['Ticket'].replace(to_replace=tmp1)   # Number of non_nan survived with same ticket
combine_df['tmp2'] = combine_df['Ticket'].replace(to_replace=tmp2)   # Sum of non_nan survived with same ticket

def f(row):
    if np.isnan(row['Survived']): # test
        if row['tmp1'] == 0 :
            val = np.nan
        else:
            val = row['tmp2'] /row['tmp1']
    else: # train
        if row['tmp1'] == 1 :
            val = np.nan
        else:
            val = (row['tmp2']-row['Survived']) / (row['tmp1']-1)
    return val
combine_df['SameTicketSurvival'] = combine_df.apply(f, axis=1)
combine_df = combine_df.drop('tmp1',axis=1).drop('tmp2',axis=1)

# Create same last name survival
# The same is true with last name, as they are likely to be in the same family.
combine_df['LastName'] = combine_df['Name'].apply(lambda x : (x.split(',')[0]))
tmp1 = (combine_df.groupby(by='LastName').count())['Survived']
tmp2 = (combine_df.groupby(by='LastName').sum())['Survived']
combine_df['tmp1'] = combine_df['LastName'].replace(to_replace=tmp1)   # Number of non_nan survived with same ticket
combine_df['tmp2'] = combine_df['LastName'].replace(to_replace=tmp2)   # Sum of non_nan survived with same ticket
combine_df['LastNameSurvival'] = combine_df.apply(f, axis=1)

display((combine_df.head())

# Remove LastName because there are two many different LastName, so it wouldn't work with get_dummies which is needed for xgboost.
# Same for ticket
# Same for Cabin, which has also too many missing values.
# PassengerID should not add any information as it is arbitrary
# Name is useless as everybody as a unique name.
combine_df = (combine_df.drop('tmp1',axis=1).drop('tmp2',axis=1).drop('LastName',axis=1).drop('Ticket',axis=1).
              drop('PassengerId',axis=1).drop('Cabin',axis=1).drop('Name',axis=1))
        
display((combine_df.head())

print(list(combine_df))

    Age Cabin Embarked     Fare  \
0  22.0   NaN        S   7.2500   
1  38.0   C85        C  71.2833   
2  26.0   NaN        S   7.9250   
3  35.0  C123        S  53.1000   
4  35.0   NaN        S   8.0500   

                                                Name  Parch  PassengerId  \
0                            Braund, Mr. Owen Harris      0            1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...      0            2   
2                             Heikkinen, Miss. Laina      0            3   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)      0            4   
4                           Allen, Mr. William Henry      0            5   

  Pclass  Sex  SibSp        ...                   Ticket Ticket_count  \
0    3rd    0      1        ...                A/5 21171            1   
1    1st    1      1        ...                 PC 17599            2   
2    3rd    1      0        ...         STON/O2. 3101282            1   
3    1st    1      1        ...         

In [74]:
# Create dummies for every string variable
combine_df = pd.get_dummies(combine_df)

# Split Other between gender
# That might be useful and the other title are mostly separated by gender.
print(list(combine_df))
combine_df['OtherFemale'] = combine_df['title_Other'] * combine_df['Sex'] + combine_df['title_Dr.'] * combine_df['Sex']
combine_df['OtherMale'] = combine_df['title_Other'] - combine_df['OtherFemale']

# Split classes between gender
# That might be useful since gender is a very important characteristic which interacts with the class since the boosted tree
# technic won't allow to always use the Sex characteristic to do a branching.
combine_df['1F'] = combine_df['Pclass_1st'] * combine_df['Sex']
combine_df['2F'] = combine_df['Pclass_2nd'] * combine_df['Sex']
combine_df['3F'] = combine_df['Pclass_3rd'] * combine_df['Sex']
combine_df['1M'] = combine_df['Pclass_1st'] - combine_df['1F']
combine_df['2M'] = combine_df['Pclass_2nd'] - combine_df['2F']
combine_df['3M'] = combine_df['Pclass_3rd'] - combine_df['3F']

['Age', 'Fare', 'Parch', 'Sex', 'SibSp', 'Survived', 'Ticket_count', 'FarePerPerson', 'Family', 'SameTicketSurvival', 'LastNameSurvival', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_1st', 'Pclass_2nd', 'Pclass_3rd', 'title_Dr.', 'title_Master.', 'title_Miss.', 'title_Mr.', 'title_Mrs.', 'title_Other', 'title_Rev.']


In [75]:
# Separate in train and test
train_df = combine_df[pd.notnull(combine_df['Survived'])]
test_df = combine_df[False == pd.notnull(combine_df['Survived'])]

x = train_df.drop('Survived',axis=1)
y = train_df['Survived']

In [11]:
# XGBoost is known to be the best machine learning algorithm for this kind of task.
# Hyperopt is a good hyperparameter optimizer.

# XGBoost.cv selects the best number of estimators and gives a good cross validation scores.
# The 1st step is to find the best parameters using hyperopt and XGBoost.cv. Then, one uses those parameters to train a 
# classifier with all the data. However, an important difference between the use of XGBoost.cv and the final fitting is that
# with XGBoost.cv we split the data in 4/5 training set and 1/5 cross-validation set. Thus, the parameters may not be the best
# for the final training. It is particularly the case of n_estimators which has a use uncertainty.

import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from time import time
from random import randint

nrepeat = 20
repeat = range(nrepeat)

from sklearn.model_selection import KFold

dmatrix = xgb.DMatrix(x.values, y.values)

def objective(space):
    dmatrix = xgb.DMatrix(x.values, y.values)
    param={'learning_rate' : space['learning_rate'],
     'max_depth' : int(space['max_depth']),
     'min_child_weight' : space['min_child_weight'],
        'gamma' : space['gamma'],
     'subsample' : space['subsample'],
     'colsample_bytree' :space['colsample_bytree'],
     'nthread' : -1,
     'colsample_bylevel' :space['colsample_bylevel'],
        'reg_alpha' :space['reg_alpha'],
     'reg_lambda' :space['reg_lambda']  
          }
    error = 0
    for i in repeat:
        seed = randint(0, 100000)
        cvresult = xgb.cv(param, dmatrix, num_boost_round=1000, nfold=5,
                metrics='error', early_stopping_rounds=10, shuffle = True, seed=seed)
        error += cvresult['test-error-mean'].tail(1).values[0]
    error /= nrepeat

    return{'loss':error, 'status': STATUS_OK }


space ={
        'max_depth': hp.quniform ('max_depth', 1, 14, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 1.),
        'learning_rate' : hp.uniform ('learning_rate', 0.01, 0.5),
       'colsample_bytree': hp.uniform ('colsample_bytree', 0.5, 1.),
        'colsample_bylevel': hp.uniform ('colsample_bylevel', 0.5, 1.),
        'gamma': hp.uniform ('gamma', 0., 1.),
    'reg_alpha': hp.uniform ('reg_alpha', 0., 1.),
    'reg_lambda' : hp.uniform ('reg_lambda', 0., 1.)
    }

trials = Trials()
start_time = time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print("--- %s seconds ---" % (time() - start_time))

best['max_depth'] = int(best['max_depth'])
print (best)

import numpy as np
start_time = time()
n_estimators = []
Error = []
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(best, dmatrix, num_boost_round=1000, nfold=5,
                metrics='error', early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimators.append(cvresult.shape[0])
    Error.append(cvresult['test-error-mean'].tail(1).values[0])
print("--- %s seconds ---" % (time() - start_time))
n_estimators_mean = np.mean(n_estimators)
n_estimators_std = np.std(n_estimators)
Error_mean = np.mean(Error)
Error_std = np.std(Error)
print(n_estimators_mean, n_estimators_std, Error_mean, Error_std)

best1 = best
best1['n_estimators'] = int(round(n_estimators_mean))
clf = xgb.XGBClassifier(**best1)
prediction1 = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction1)
result_df = result_df.set_index('PassengerId').astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic')

--- 522.3922550678253 seconds ---
{'colsample_bylevel': 0.666283305643945, 'colsample_bytree': 0.96025711198482, 'gamma': 0.4538320367589998, 'learning_rate': 0.1672241327029708, 'max_depth': 10, 'min_child_weight': 1.0, 'reg_alpha': 0.22779378353622617, 'reg_lambda': 0.6926529168023405, 'subsample': 0.9995550469608545}
--- 102.16380286216736 seconds ---
11.4 12.7121988657 0.148246962 0.00511404120458
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1
897                 0
898                 1
899                 0
900                 1
901                 0
902                 0
903                 0
904                 1
905                 0
906                 1
907                 1
908                 0
909                 0
910                 0
911                 0
912                 0
913                 0
914                 1
915                 0
916                 1
9

In [19]:
# Try again with logloss rather than accuracy
evalmetrics = 'logloss'
nrepeat = 20
repeat = range(nrepeat)

from sklearn.model_selection import KFold

dmatrix = xgb.DMatrix(x.values, y.values)

def objective(space):
    dmatrix = xgb.DMatrix(x.values, y.values)
    param={'learning_rate' : space['learning_rate'],
     'max_depth' : int(space['max_depth']),
     'min_child_weight' : space['min_child_weight'],
        'gamma' : space['gamma'],
     'subsample' : space['subsample'],
     'colsample_bytree' :space['colsample_bytree'],
     'nthread' : -1,
     'colsample_bylevel' :space['colsample_bylevel'],
        'reg_alpha' :space['reg_alpha'],
     'reg_lambda' :space['reg_lambda']  
          }
    error = 0
    for i in repeat:
        seed = randint(0, 100000)
        cvresult = xgb.cv(param, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=10, shuffle = True, seed=seed)
        error += cvresult['test-logloss-mean'].tail(1).values[0]
    error /= nrepeat

    return{'loss':1-error, 'status': STATUS_OK }


space ={
        'max_depth': hp.quniform ('max_depth', 1, 14, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 1.),
        'learning_rate' : hp.uniform ('learning_rate', 0.01, 0.5),
       'colsample_bytree': hp.uniform ('colsample_bytree', 0.5, 1.),
        'colsample_bylevel': hp.uniform ('colsample_bylevel', 0.5, 1.),
        'gamma': hp.uniform ('gamma', 0., 1.),
    'reg_alpha': hp.uniform ('reg_alpha', 0., 1.),
    'reg_lambda' : hp.uniform ('reg_lambda', 0., 1.)
    }

trials = Trials()
start_time = time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print("--- %s seconds ---" % (time() - start_time))

best['max_depth'] = int(best['max_depth'])
print (best)

import numpy as np
start_time = time()
n_estimators = []
Error = []
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(best, dmatrix, num_boost_round=1000, nfold=5,
                metrics='logloss', early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimators.append(cvresult.shape[0])
    Error.append(cvresult['test-logloss-mean'].tail(1).values[0])
print("--- %s seconds ---" % (time() - start_time))
n_estimators_mean = np.mean(n_estimators)
n_estimators_std = np.std(n_estimators)
Error_mean = np.mean(Error)
Error_std = np.std(Error)
print(n_estimators_mean, n_estimators_std, Error_mean, Error_std)

best2 = best
best2['n_estimators'] = int(round(n_estimators_mean))
clf = xgb.XGBClassifier(**best2)
prediction2 = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction2)
result_df = result_df.set_index('PassengerId').astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic2')

--- 541.779257774353 seconds ---
{'colsample_bylevel': 0.8793854140464347, 'colsample_bytree': 0.5284021992184958, 'gamma': 0.4877440792405143, 'learning_rate': 0.44616929141410355, 'max_depth': 4, 'min_child_weight': 5.0, 'reg_alpha': 0.40433603353443726, 'reg_lambda': 0.2570910174307357, 'subsample': 0.10055864508343322}
--- 35.055745363235474 seconds ---
3.59 0.939095309327 0.368803328667 0.156626686268
             Survived
PassengerId          
892                 0
893                 1
894                 0
895                 0
896                 1
897                 0
898                 1
899                 0
900                 1
901                 0
902                 0
903                 0
904                 1
905                 0
906                 1
907                 1
908                 0
909                 0
910                 1
911                 1
912                 0
913                 0
914                 1
915                 0
916               

In [21]:
# Try again with AUC
evalmetrics = 'auc'
nrepeat = 20
repeat = range(nrepeat)

from sklearn.model_selection import KFold

dmatrix = xgb.DMatrix(x.values, y.values)

def objective(space):
    dmatrix = xgb.DMatrix(x.values, y.values)
    param={'learning_rate' : space['learning_rate'],
     'max_depth' : int(space['max_depth']),
     'min_child_weight' : space['min_child_weight'],
        'gamma' : space['gamma'],
     'subsample' : space['subsample'],
     'colsample_bytree' :space['colsample_bytree'],
     'nthread' : -1,
     'colsample_bylevel' :space['colsample_bylevel'],
        'reg_alpha' :space['reg_alpha'],
     'reg_lambda' :space['reg_lambda']  
          }
    error = 0
    for i in repeat:
        seed = randint(0, 100000)
        cvresult = xgb.cv(param, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=10, shuffle = True, seed=seed)
        error += cvresult['test-auc-mean'].tail(1).values[0]
    error /= nrepeat

    return{'loss':1-error, 'status': STATUS_OK }


space ={
        'max_depth': hp.quniform ('max_depth', 1, 14, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 1.),
        'learning_rate' : hp.uniform ('learning_rate', 0.01, 0.5),
       'colsample_bytree': hp.uniform ('colsample_bytree', 0.5, 1.),
        'colsample_bylevel': hp.uniform ('colsample_bylevel', 0.5, 1.),
        'gamma': hp.uniform ('gamma', 0., 1.),
    'reg_alpha': hp.uniform ('reg_alpha', 0., 1.),
    'reg_lambda' : hp.uniform ('reg_lambda', 0., 1.)
    }

trials = Trials()
start_time = time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print("--- %s seconds ---" % (time() - start_time))

best['max_depth'] = int(best['max_depth'])
print (best)

import numpy as np
start_time = time()
n_estimators = []
Error = []
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(best, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimators.append(cvresult.shape[0])
    Error.append(cvresult['test-auc-mean'].tail(1).values[0])
print("--- %s seconds ---" % (time() - start_time))
n_estimators_mean = np.mean(n_estimators)
n_estimators_std = np.std(n_estimators)
Error_mean = np.mean(Error)
Error_std = np.std(Error)
print(n_estimators_mean, n_estimators_std, Error_mean, Error_std)

best3 = best
best3['n_estimators'] = int(round(n_estimators_mean))
clf = xgb.XGBClassifier(**best3)
prediction3 = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction3)
result_df = result_df.set_index('PassengerId').astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic3')

--- 633.4523627758026 seconds ---
{'colsample_bylevel': 0.8227348764961984, 'colsample_bytree': 0.6545074153813044, 'gamma': 0.2883553402434034, 'learning_rate': 0.09387314161078095, 'max_depth': 5, 'min_child_weight': 2.0, 'reg_alpha': 0.17679068520207764, 'reg_lambda': 0.7885048250495174, 'subsample': 0.8616149029297578}
--- 76.12603735923767 seconds ---
30.14 38.6872123576 0.50046164 0.265336393349
             Survived
PassengerId          
892                 0
893                 1
894                 0
895                 0
896                 1
897                 0
898                 1
899                 0
900                 1
901                 0
902                 0
903                 0
904                 1
905                 0
906                 1
907                 1
908                 0
909                 0
910                 0
911                 1
912                 0
913                 0
914                 1
915                 0
916                 1
9

In [27]:
# Compare predictions

prediction1 = pd.read_csv('Result_Titanic')
prediction2 = pd.read_csv('Result_Titanic2')
prediction3 = pd.read_csv('Result_Titanic3')

# The output is the number of different result
print((prediction1 - prediction2).abs().sum()['Survived'])
print((prediction1 - prediction3).abs().sum()['Survived'])
print((prediction2 - prediction3).abs().sum()['Survived'])

# I got 0.79904 for 'Result_Titanic', that is 334/418
# I got 0.7751 for 'Result_Titanic2', that is 324/418    
    # Thus I got 10 less in 40 different, that means 15 good changes and 25 bad changes
# I got 0.80383, for 'Result_Titanic3', that is 336/418
    # Thus I got 2 more in 16 different, that means 9 good changes and 7 bad changes
    # Thus I got 12 more in 32 different, that means 22 good changes and 10 bad changes
    
# AUC is slightly better than accuracy, but it might just be luck. LogLoss is significantly less good.

40
16
32


In [49]:
# Try AUC with more nestimators     We have n_estimators_mean=30.14 n_estimators_std=38.6872123576 
# It might be better to be more precise and to go higher than the mean since the standard deviation is so large, thus we don't
# know if the mean is close to the good number for the final training. One might hope that more estimators is better than less
# and we have all the data for the final training while only 4/5 for the cv, so it is normal to believe that more estimators
# could be good.

evalmetrics = 'auc'
start_time = time()
best30 = best3
if 'n_estimators' in best30: del best30['n_estimators']
n_estimators = []
Error = []
for i in range(1000):
    seed = randint(0, 100000)
    cvresult = xgb.cv(best, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimators.append(cvresult.shape[0])
    Error.append(cvresult['test-auc-mean'].tail(1).values[0])
print("--- %s seconds ---" % (time() - start_time))
n_estimators_mean = np.mean(n_estimators)
n_estimators_std = np.std(n_estimators)
Error_mean = np.mean(Error)
Error_std = np.std(Error)
print(n_estimators_mean, n_estimators_std, Error_mean, Error_std)
# Somehow, n_estimators_std do not decrease when range(1000) increases. That means it's not gaussian.

best30['n_estimators'] = int(round(n_estimators_mean))
clf = xgb.XGBClassifier(**best30)
prediction30 = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction30)
result_df = result_df.set_index('PassengerId').astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic30')
# I got 0.80383


# Try with more n_estimators
best31 = best3
best31['n_estimators'] = int(round(n_estimators_mean + n_estimators_std))
clf = xgb.XGBClassifier(**best31)
prediction31 = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction31)
result_df = result_df.set_index('PassengerId').astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic31')
# I got 0.78947

# It seems it's not the case.

--- 909.685076713562 seconds ---
38.553 43.7054595102 0.8954749328 0.0038582568371
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1
897                 0
898                 1
899                 0
900                 1
901                 0
902                 0
903                 0
904                 1
905                 0
906                 1
907                 1
908                 0
909                 0
910                 0
911                 1
912                 0
913                 0
914                 1
915                 0
916                 1
917                 0
918                 1
919                 0
920                 0
921                 0
...               ...
1280                0
1281                0
1282                0
1283                1
1284                1
1285                0
1286                0
1287                1
1288           

In [29]:
help(clf.fit)

Help on method fit in module xgboost.sklearn:

fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True) method of xgboost.sklearn.XGBClassifier instance
    Fit gradient boosting classifier
    
    Parameters
    ----------
    X : array_like
        Feature matrix
    y : array_like
        Labels
    sample_weight : array_like
        Weight for each instance
    eval_set : list, optional
        A list of (X, y) pairs to use as a validation set for
        early-stopping
    eval_metric : str, callable, optional
        If a str, should be a built-in evaluation metric to use. See
        doc/parameter.md. If callable, a custom evaluation metric. The call
        signature is func(y_predicted, y_true) where y_true will be a
        DMatrix object such that you may need to call the get_label
        method. It must return a str, value pair where the str is a name
        for the evaluation and value is the value of the evaluation
      

In [38]:
# Maybe always use early stopping
# It has however the bad effect to make that we can't use all the data.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
eval_set=[(X_test, y_test)]
bestES=best3 # Use the best set of parameter
bestES['n_estimators'] = 1000 # Need to be large so the stopping is by early stopping
clf = xgb.XGBClassifier(**bestES)
clf.fit(X_train,y_train, eval_set=eval_set,early_stopping_rounds=50, eval_metric='auc', verbose=False)
prediction4 = clf.predict(test_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction4)
result_df = result_df.set_index('PassengerId').astype(int)

result_df.to_csv(path_or_buf='Result_Titanic4')

prediction4 = pd.read_csv('Result_Titanic4')
print((prediction4 - prediction3).abs().sum()['Survived'])

# I got 0.78469

12


In [10]:
# Try to separate first between male and female since they are very different samples.

import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from time import time
from random import randint
import numpy as np

trainM_df = train_df[train_df['Sex']==0]
trainF_df = train_df[train_df['Sex']==1]
testM_df = test_df[test_df['Sex']==0]
testF_df = test_df[test_df['Sex']==1]

xM = trainM_df.drop('Survived',axis=1)
yM = trainM_df['Survived']
xF= trainF_df.drop('Survived',axis=1)
yF = trainF_df['Survived']

dmatrixM = xgb.DMatrix(xM.values, yM.values)
dmatrixF = xgb.DMatrix(xF.values, yF.values)

evalmetrics = 'auc'
nrepeat = 20
repeat = range(nrepeat)
def objectiveM(space):
    param={'learning_rate' : space['learning_rate'],
     'max_depth' : int(space['max_depth']),
     'min_child_weight' : space['min_child_weight'],
        'gamma' : space['gamma'],
     'subsample' : space['subsample'],
     'colsample_bytree' :space['colsample_bytree'],
     'nthread' : -1,
     'colsample_bylevel' :space['colsample_bylevel'],
        'reg_alpha' :space['reg_alpha'],
     'reg_lambda' :space['reg_lambda']  
          }
    error = 0
    for i in repeat:
        seed = randint(0, 100000)
        cvresult = xgb.cv(param, dmatrixM, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=10, shuffle = True, seed=seed)
        error += cvresult['test-auc-mean'].tail(1).values[0]
    error /= nrepeat

    return{'loss':1-error, 'status': STATUS_OK }
def objectiveF(space):
    param={'learning_rate' : space['learning_rate'],
     'max_depth' : int(space['max_depth']),
     'min_child_weight' : space['min_child_weight'],
        'gamma' : space['gamma'],
     'subsample' : space['subsample'],
     'colsample_bytree' :space['colsample_bytree'],
     'nthread' : -1,
     'colsample_bylevel' :space['colsample_bylevel'],
        'reg_alpha' :space['reg_alpha'],
     'reg_lambda' :space['reg_lambda']  
          }
    error = 0
    for i in repeat:
        seed = randint(0, 100000)
        cvresult = xgb.cv(param, dmatrixF, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=10, shuffle = True, seed=seed)
        error += cvresult['test-auc-mean'].tail(1).values[0]
    error /= nrepeat

    return{'loss':1-error, 'status': STATUS_OK }

space ={
        'max_depth': hp.quniform ('max_depth', 1, 14, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 1.),
        'learning_rate' : hp.uniform ('learning_rate', 0.01, 0.5),
       'colsample_bytree': hp.uniform ('colsample_bytree', 0.5, 1.),
        'colsample_bylevel': hp.uniform ('colsample_bylevel', 0.5, 1.),
        'gamma': hp.uniform ('gamma', 0., 1.),
    'reg_alpha': hp.uniform ('reg_alpha', 0., 1.),
    'reg_lambda' : hp.uniform ('reg_lambda', 0., 1.)
    }

trials = Trials()
start_time = time()
bestM = fmin(fn=objectiveM,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
bestF = fmin(fn=objectiveF,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print("--- %s seconds ---" % (time() - start_time))

bestM['max_depth'] = int(bestM['max_depth'])
bestF['max_depth'] = int(bestF['max_depth'])
print (bestM)
print (bestF)

start_time = time()
n_estimators = []
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(bestM, dmatrixM, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimators.append(cvresult.shape[0])
n_estimators_meanM = np.mean(n_estimators)
    
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(bestF, dmatrixF, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimators.append(cvresult.shape[0])
n_estimators_meanF = np.mean(n_estimators)
    
print("--- %s seconds ---" % (time() - start_time))

bestM['n_estimators'] = int(round(n_estimators_meanM))
bestF['n_estimators'] = int(round(n_estimators_meanF))
clf = xgb.XGBClassifier(**bestM)
predictionM = clf.fit(xM,yM).predict(testM_df.drop('Survived',axis=1))
clf = xgb.XGBClassifier(**bestF)
predictionF = clf.fit(xF,yF).predict(testF_df.drop('Survived',axis=1))

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = result_df['Sex']
result_df['Survived'].loc[result_df['Sex']=='male'] = predictionM
result_df['Survived'].loc[result_df['Sex']=='female'] = predictionF
result_df = result_df.set_index('PassengerId').drop('Sex',axis=1).astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic6')

--- 412.9174773693085 seconds ---
{'colsample_bylevel': 0.9235200762181645, 'colsample_bytree': 0.822453160665294, 'gamma': 0.7791936249481772, 'learning_rate': 0.1955450438296506, 'max_depth': 6, 'min_child_weight': 1.0, 'reg_alpha': 0.7308280629929902, 'reg_lambda': 0.4987932979468526, 'subsample': 0.8200814329518703}
{'colsample_bylevel': 0.9235200762181645, 'colsample_bytree': 0.822453160665294, 'gamma': 0.7791936249481772, 'learning_rate': 0.1955450438296506, 'max_depth': 6, 'min_child_weight': 1.0, 'reg_alpha': 0.7308280629929902, 'reg_lambda': 0.4987932979468526, 'subsample': 0.8200814329518703}
--- 155.65548515319824 seconds ---


IndexingError: Unalignable boolean Series key provided

In [34]:
result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = result_df['Sex']
result_df['Survived'].loc[result_df['Sex']=='male'] = predictionM
result_df['Survived'].loc[result_df['Sex']=='female'] = predictionF
result_df = result_df.set_index('PassengerId').drop('Sex',axis=1).astype(int)
print(result_df)

result_df.to_csv(path_or_buf='Result_Titanic6')

# I got 0.79426
# So it wasn't a good idea.

             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1
897                 0
898                 1
899                 0
900                 1
901                 0
902                 0
903                 0
904                 1
905                 0
906                 1
907                 1
908                 0
909                 0
910                 0
911                 0
912                 0
913                 0
914                 1
915                 0
916                 1
917                 0
918                 1
919                 0
920                 1
921                 0
...               ...
1280                0
1281                0
1282                0
1283                1
1284                1
1285                0
1286                0
1287                1
1288                0
1289                1
1290                0
1291                0
1292      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [70]:
# Add Cabin features

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine_df = train_df.append(test_df)

combine_df = combine_df.join(combine_df.groupby(by='Ticket').size().rename('Ticket_count'),on='Ticket')
combine_df['FarePerPerson']=combine_df['Fare']/combine_df['Ticket_count']
combine_df['Family'] = combine_df['SibSp'] + combine_df['Parch']
combine_df['Sex'] = combine_df['Sex'].map( {'female': 1, 'male': 0} )
# PClass is not a real number, let's transform it into dummies.
combine_df['Pclass'] = combine_df['Pclass'].replace({1 : '1st',2 : '2nd',3 : '3rd'})

# Create title row.
def fun0(x):
    tmp = (x.split(',')[1]).split()[0]
    if tmp in ['Mr.','Master.','Dr.','Rev.']:
        return tmp
    elif tmp in ['Miss.', 'Mlle.', 'Ms.']:
        return 'Miss.'
    elif tmp in ['Mrs.', 'Mme.']:
        return 'Mrs.'
    else:
        return 'Other'
combine_df['title'] = combine_df['Name'].apply(fun0)

# Create same ticket survival
tmp1 = (combine_df.groupby(by='Ticket').count())['Survived']
tmp2 = (combine_df.groupby(by='Ticket').sum())['Survived']
combine_df['tmp1'] = combine_df['Ticket'].replace(to_replace=tmp1)   # Number of non_nan survived with same ticket
combine_df['tmp2'] = combine_df['Ticket'].replace(to_replace=tmp2)   # Sum of non_nan survived with same ticket

def f(row):
    if np.isnan(row['Survived']): # test
        if row['tmp1'] == 0 :
            val = np.nan
        else:
            val = row['tmp2'] /row['tmp1']
    else: # train
        if row['tmp1'] == 1 :
            val = np.nan
        else:
            val = (row['tmp2']-row['Survived']) / (row['tmp1']-1)
    return val
combine_df['SameTicketSurvival'] = combine_df.apply(f, axis=1)
combine_df = combine_df.drop('tmp1',axis=1).drop('tmp2',axis=1)

# Create same last name survival
combine_df['LastName'] = combine_df['Name'].apply(lambda x : (x.split(',')[0]))
tmp1 = (combine_df.groupby(by='LastName').count())['Survived']
tmp2 = (combine_df.groupby(by='LastName').sum())['Survived']
combine_df['tmp1'] = combine_df['LastName'].replace(to_replace=tmp1)   # Number of non_nan survived with same ticket
combine_df['tmp2'] = combine_df['LastName'].replace(to_replace=tmp2)   # Sum of non_nan survived with same ticket
combine_df['LastNameSurvival'] = combine_df.apply(f, axis=1)
combine_df = (combine_df.drop('tmp1',axis=1).drop('tmp2',axis=1).drop('PassengerId',axis=1).drop('Name',axis=1))

# Use only first cabin
combine_df['Cabin'] = combine_df['Cabin'].apply(lambda x: x.split()[0] if x == x else x)

# Fill cabin with last name
for s in combine_df.groupby(by='Ticket')['Cabin']:
    s1=s[1]  # s[0] is the Last Name
    s1.fillna(method='ffill', inplace=True)
    s1.fillna(method='bfill', inplace=True)
    
# Fill cabin with ticket
for s in combine_df.groupby(by='Ticket')['Cabin']:
    s1=s[1]  # s[0] is the ticket number
    s1.fillna(method='ffill', inplace=True)
    s1.fillna(method='bfill', inplace=True)
    
# Split cabin into letter and integer
combine_df['CabinLetter'] = combine_df['Cabin'].apply(lambda x: x[0] if x == x else x)
def tmp(x):
    if x != x :
        return x
    elif len(x)>1:
        return float(x[1:])
    else:
        return np.nan
combine_df['CabinNumber'] = combine_df['Cabin'].apply(tmp)

# Drop features with too many categories
combine_df = combine_df.drop('LastName',axis=1).drop('Ticket',axis=1).drop('Cabin',axis=1)
    
# Create dummies for every string variable
combine_df = pd.get_dummies(combine_df)

# Split Other between gender
combine_df['OtherFemale'] = combine_df['title_Other'] * combine_df['Sex'] + combine_df['title_Dr.'] * combine_df['Sex']
combine_df['OtherMale'] = combine_df['title_Other'] - combine_df['OtherFemale']

# Split classes between gender
combine_df['1F'] = combine_df['Pclass_1st'] * combine_df['Sex']
combine_df['2F'] = combine_df['Pclass_2nd'] * combine_df['Sex']
combine_df['3F'] = combine_df['Pclass_3rd'] * combine_df['Sex']
combine_df['1M'] = combine_df['Pclass_1st'] - combine_df['1F']
combine_df['2M'] = combine_df['Pclass_2nd'] - combine_df['2F']
combine_df['3M'] = combine_df['Pclass_3rd'] - combine_df['3F']

# Separate in train and test
train_df = combine_df[pd.notnull(combine_df['Survived'])]
test_df = combine_df[False == pd.notnull(combine_df['Survived'])]
x = train_df.drop('Survived',axis=1)
y = train_df['Survived']

nrepeat = 20
repeat = range(nrepeat)

from sklearn.model_selection import KFold

dmatrix = xgb.DMatrix(x.values, y.values)

def objective(space):
    param={'learning_rate' : space['learning_rate'],
     'max_depth' : int(space['max_depth']),
     'min_child_weight' : space['min_child_weight'],
        'gamma' : space['gamma'],
     'subsample' : space['subsample'],
     'colsample_bytree' :space['colsample_bytree'],
     'nthread' : -1,
     'colsample_bylevel' :space['colsample_bylevel'],
        'reg_alpha' :space['reg_alpha'],
     'reg_lambda' :space['reg_lambda']  
          }
    error = 0
    for i in repeat:
        seed = randint(0, 100000)
        cvresult = xgb.cv(param, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=10, shuffle = True, seed=seed)
        if evalmetrics == 'auc':
            error += 1-cvresult['test-auc-mean'].tail(1).values[0]
        elif evalmetrics == 'error':
            error += cvresult['test-error-mean'].tail(1).values[0]
    error /= nrepeat

    return error


space ={
        'max_depth': hp.quniform ('max_depth', 1, 14, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 1.),
        'learning_rate' : hp.uniform ('learning_rate', 0.01, 0.5),
       'colsample_bytree': hp.uniform ('colsample_bytree', 0.5, 1.),
        'colsample_bylevel': hp.uniform ('colsample_bylevel', 0.5, 1.),
        'gamma': hp.uniform ('gamma', 0., 1.),
    'reg_alpha': hp.uniform ('reg_alpha', 0., 1.),
    'reg_lambda' : hp.uniform ('reg_lambda', 0., 1.)
    }

start_time = time()
evalmetrics = 'auc'
bestAUC = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100)
evalmetrics = 'error'
bestError = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100)
print("--- %s seconds ---" % (time() - start_time))

bestAUC['max_depth'] = int(bestAUC['max_depth'])
print (bestAUC)
bestError['max_depth'] = int(bestError['max_depth'])
print (bestError)

import numpy as np
start_time = time()
n_estimatorsAUC = []
n_estimatorsError = []
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(bestAUC, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimatorsAUC.append(cvresult.shape[0])
for i in range(100):
    seed = randint(0, 100000)
    cvresult = xgb.cv(bestError, dmatrix, num_boost_round=1000, nfold=5,
                metrics=evalmetrics, early_stopping_rounds=50, shuffle = True, seed=seed)
    n_estimatorsError.append(cvresult.shape[0])
print("--- %s seconds ---" % (time() - start_time))
n_estimatorsAUC_mean = np.mean(n_estimatorsAUC)
n_estimatorsError_mean = np.mean(n_estimatorsError)
print(n_estimatorsAUC_mean, n_estimatorsError_mean)

bestAUC['n_estimators'] = int(round(n_estimatorsAUC_mean))
clf = xgb.XGBClassifier(**bestAUC)
predictionAUC = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))
result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=predictionAUC)
result_df = result_df.set_index('PassengerId').astype(int)
result_df.to_csv(path_or_buf='Result_TitanicAUC')

bestError['n_estimators'] = int(round(n_estimatorsError_mean))
clf = xgb.XGBClassifier(**bestError)
predictionError = clf.fit(x,y).predict(test_df.drop('Survived',axis=1))
result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=predictionError)
result_df = result_df.set_index('PassengerId').astype(int)
result_df.to_csv(path_or_buf='Result_TitanicError')

# Differences between AUC and Error
print((pd.Series(data=predictionAUC) - pd.Series(data=predictionError)).abs().sum())

# I got 0.77990 for AUC
# I got 0.79426 for Error

# Still less than initial answer.

--- 1369.4690580368042 seconds ---
{'colsample_bylevel': 0.822266637784425, 'colsample_bytree': 0.7688159874607244, 'gamma': 0.24141815239315229, 'learning_rate': 0.1519318808587794, 'max_depth': 5, 'min_child_weight': 8.0, 'reg_alpha': 0.8767988943536921, 'reg_lambda': 0.29774614494309326, 'subsample': 0.7056111963447809}
{'colsample_bylevel': 0.8762569525817361, 'colsample_bytree': 0.7754800031185707, 'gamma': 0.2571815526593193, 'learning_rate': 0.20786990229752467, 'max_depth': 6, 'min_child_weight': 5.0, 'reg_alpha': 0.5395288974302214, 'reg_lambda': 0.3170365728183162, 'subsample': 0.5669572593505213}
--- 9916.568351268768 seconds ---
35.21 32.69
17.0


In [82]:
# Stack classifier
# Rather than find the n_estimators_mean and then use it on all data. 
# I train 5 classifier with 20% data as CV to do early stopping. Then they vote.

bestES={'colsample_bylevel': 0.8227348764961984, 'colsample_bytree': 0.6545074153813044, 'gamma': 0.2883553402434034, 'learning_rate': 0.09387314161078095, 'max_depth': 5, 'min_child_weight': 2.0, 'reg_alpha': 0.17679068520207764, 'reg_lambda': 0.7885048250495174, 'subsample': 0.8616149029297578}
# Use the best set of parameter which is the first one with AUC
bestES['n_estimators'] = 1000 # Need to be large so the stopping is by early stopping

kf = KFold(n_splits=5)
prediction = np.zeros(len(test_df))
for train_index, cv_index in kf.split(x):
    eval_set=[(x.iloc[cv_index], y.iloc[cv_index])]
    clf = xgb.XGBClassifier(**bestES)
    clf.fit(x.iloc[train_index],y.iloc[train_index], eval_set=eval_set,early_stopping_rounds=50, eval_metric='error', verbose=False)
    prediction += np.array(clf.predict(test_df.drop('Survived',axis=1)))
prediction = (prediction/5).round().astype(int)

result_df = pd.read_csv('test.csv').drop('Pclass',axis=1).drop('Name',axis=1).drop('Sex',axis=1).drop('Age',axis=1).drop('SibSp',axis=1).drop('Parch',axis=1).drop('Ticket',axis=1).drop('Fare',axis=1).drop('Cabin',axis=1).drop('Embarked',axis=1)
result_df['Survived'] = pd.Series(data=prediction)
result_df = result_df.set_index('PassengerId').astype(int)
result_df.to_csv(path_or_buf='Result_Titanic7')

# I got 0.78947

In [None]:
# It seems I can't do better than my initial 0.80383.
# This result is good considering that there is no reason to believe that survival is perfectly determined by socio-economical
# status.
# There are a lot of people who got better results, but some of them, and certainly all the 100%, cheated. Which is easy since
# one can directly check on the net any name on the list to know their survival status.