In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import re
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import (KFold, train_test_split)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

stdsc = StandardScaler()

In [5]:
def create_model_data(filename):
    titanic=pd.read_csv(filename,header=0,index_col=False)
    
    titanic=titanic.join(pd.get_dummies(titanic['Pclass'],drop_first=False))
    titanic.rename(columns={1:'1st',2:'2nd',3:'3rd'},inplace=True)
    
    #Age_Known column with 1 value if age was originally known and 0 if unknown
    age_known=titanic['Age'].isnull().tolist()
    for index in range(0,len(age_known)):
        if age_known[index]==False:
            age_known[index]=0
        else:
            age_known[index]=1
    titanic['Age_Known']=age_known
    
    #Fill Age column with Avg age as per Sex and PassengerClass
    fem_avg_age_1=titanic['Age'][(titanic['Sex']=='female') & (titanic['Pclass']==1)].mean()
    fem_avg_age_2=titanic['Age'][(titanic['Sex']=='female') & (titanic['Pclass']==2)].mean()
    fem_avg_age_3=titanic['Age'][(titanic['Sex']=='female') & (titanic['Pclass']==3)].mean()
    male_avg_age_3=titanic['Age'][(titanic['Sex']=='male') & (titanic['Pclass']==3)].mean()
    male_avg_age_2=titanic['Age'][(titanic['Sex']=='male') & (titanic['Pclass']==2)].mean()
    male_avg_age_1=titanic['Age'][(titanic['Sex']=='male') & (titanic['Pclass']==1)].mean()
    for index in list(titanic.index):
        if titanic.loc[index,'Sex']=='female' and titanic['Age'].isnull()[index]==True:
            if titanic.loc[index,'Pclass']==1:
                titanic.loc[index,'Age']=fem_avg_age_1
            if titanic.loc[index,'Pclass']==2:
                titanic.loc[index,'Age']=fem_avg_age_2
            if titanic.loc[index,'Pclass']==3:
                titanic.loc[index,'Age']=fem_avg_age_3
        elif titanic.loc[index,'Sex']=='male' and titanic['Age'].isnull()[index]==True:
            if titanic.loc[index,'Pclass']==3:
                titanic.loc[index,'Age']=male_avg_age_3
            if titanic.loc[index,'Pclass']==2:
                titanic.loc[index,'Age']=male_avg_age_2
            if titanic.loc[index,'Pclass']==1:
                titanic.loc[index,'Age']=male_avg_age_1

    #Create a column with 1 value if Passenger's cabin is known and 0 is unknown
    cabin_known=[]
    for index in list(titanic.index):
        if titanic['Cabin'].isnull()[index]==True:
            cabin_known.append(0)
        else:
            cabin_known.append(1)
    titanic['cabin_known']=cabin_known
    titanic.drop('Cabin',axis=1,inplace=True)
    
    #Split the Sex column into the specific columns for Males and Females
    titanic=titanic.join(pd.get_dummies(titanic['Sex'],drop_first=True))

    #Use the SiblingSpoouse and ParentsChildren columns to find the people with any immediate family on-board and those alone.
    fam_on_board=[]
    alone=[]
    for index in list(titanic.index):
        fam_on_board.append(titanic.loc[index,'SibSp']+titanic.loc[index,'Parch'])
        if (titanic.loc[index,'SibSp']+titanic.loc[index,'Parch'])!=0:
            alone.append(0)
        else:
            alone.append(1)
    titanic['Fam_on_board']=fam_on_board
    titanic['Alone']=alone

    #Calculate the average cost per ticket for every individual to get a better idea of their wealth.
    cost_per_ticketholder=[]
    for index in list(titanic.index):
        if titanic.loc[index,'Fam_on_board']==0:
            cost_per_ticketholder.append(titanic.loc[index,'Fare'])
        else:
            cost_per_ticketholder.append(titanic.loc[index,'Fare']/titanic.loc[index,'Fam_on_board'])
    titanic['Cost_per_ticketholder']=cost_per_ticketholder

    #Get a separate column for each title in an individual's name to check for any correlation with their wealth and their chances of survival.
    title=[]
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4}
    for name in titanic['Name'].tolist():
        short=name.split(',')[1].split('.')[0].replace(" ",'')
        try:
            title.append(title_mapping[short])
        except:
            title.append(5)
    titanic['Title']=title
    
    #Create the column 'Child' with entry as 1 if the age is less than 16 and as 0 if greater than 16 years. 
    Child=[]
    for index in list(titanic.index):
        if int(titanic.loc[index,'Age'])<16:
            Child.append(1)
        else:
            Child.append(0)
    titanic['Child']=Child
    
    #Split the embarked column into the specific columns for the 3 locations from where the passengers boarded the Titanic
    titanic['Embarked'].fillna('S',axis=0,inplace=True)
    titanic=titanic.join(pd.get_dummies(titanic['Embarked'],drop_first=False))
    titanic.drop(['Embarked','Q'],axis=1,inplace=True)
    
    #Drop the extra columns
    titanic.drop(['Pclass','Name','Sex','Ticket'],axis=1,inplace=True)
    
    return titanic

In [6]:
#Create Training and Testing Data

In [10]:
model_train=create_model_data('train.csv').drop(['PassengerId','Cost_per_ticketholder','Fam_on_board','1st','Child','SibSp','Parch', 'Age_Known','cabin_known'],axis=1)
model_train.columns

Index(['Survived', 'Age', 'Fare', '2nd', '3rd', 'male', 'Alone', 'Title', 'C',
       'S'],
      dtype='object')

In [None]:
model_test=create_model_data('test.csv')
passenger_ID=model_test['PassengerId'].tolist()
model_test.drop(['PassengerId','Cost_per_ticketholder','Fam_on_board','1st','Child','SibSp','Parch', 'Age_Known','cabin_known'],axis=1,inplace=True)
model_test.columns

In [None]:
train=model_train
test=model_test

In [8]:
ntrain=train.shape[0]
ntest=test.shape[0]
SEED=0
NFOLDS=5
kf=KFold(n_splits=NFOLDS,random_state=SEED)

#create a class to extend the Sklearn Regressors
class Sklearnhelper(object):
    def __init__(self,clf,seed=0,params=None):
        params['random_state']=seed
        self.clf=clf(**params)
    def train(self,x_train,y_train):
        self.clf.train(x_train,y_train)
    def predict(self, x):
        return self.clf.predict(x)
    def fit(self,x,y):
        return self.clf.fit(x,y)
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [9]:
## FUNCTION TO GET THE OUT OF FOLD PREDICTIONS WHICH ARE USED AS NEW FEATURES TO THE SECOND LEVEL PREDICTION ##

def get_oof(clf,x_train,y_train,x_test):
    oof_train=np.zeros((ntrain,))
    oof_test=np.zeros((ntest,))
    oof_test_skf=np.empty((NFOLDS,ntest))
    
    for i,(train_index,test_index) in enumerate(kf.split(x_train)):
        x_tr=x_train[train_index]
        x_te=x_train[test_index]
        y_tr=y_train[train_index]
        
        clf.fit(x_tr,y_tr)
        
        oof_train[test_index]=clf.predict(x_te)
        oof_test_skf[i, :]=clf.predict(x_test)
    oof_test[:]=oof_test_skf.mean(axis=0)
    
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)


In [28]:
#Declare the parameters that are used as input in the 5 regressors.
rf_params={'n_jobs':-1,'n_estimators':500,'warm_start':True,'max_depth':6,'min_samples_leaf':2,'max_features':'sqrt','verbose':0}
et_params={'n_jobs':-1,'n_estimators':500,'max_depth':6,'min_samples_leaf':2,'verbose':0}
ada_params={'n_estimators':500,'learning_rate':0.75}
gb_params={'n_estimators':500,'max_depth':5,'min_samples_leaf':2,'verbose':0}
svc_params={'kernel':'linear','C':0.025}

#Create the Regressor objects to represent our model
rf=Sklearnhelper(clf=RandomForestClassifier,seed=SEED,params=rf_params)
et=Sklearnhelper(clf=ExtraTreesClassifier,seed=SEED,params=et_params)
gb=Sklearnhelper(clf=GradientBoostingClassifier,seed=SEED,params=gb_params)
ada=Sklearnhelper(clf=AdaBoostClassifier,seed=SEED,params=ada_params)
svc=Sklearnhelper(clf=SVC,seed=SEED,params=svc_params)

#CREATE NUMPY ARRAYS TO BE USED AS INPUTS
y_train=train['Survived'].ravel()
x_train=train.drop('Survived',axis=1).values
x_test=test.values

# Create our OOF train and test predictions. These base results will be used as new features
rf_oof_train, rf_oof_test=get_oof(rf,x_train,y_train,x_test)
et_oof_train, et_oof_test=get_oof(et,x_train,y_train,x_test)
ada_oof_train, ada_oof_test=get_oof(ada,x_train,y_train,x_test)
gb_oof_train, gb_oof_test=get_oof(gb,x_train,y_train,x_test)
svc_oof_train, svc_oof_test=get_oof(svc,x_train,y_train,x_test)

In [29]:
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

[0.11131913 0.19804143 0.02655306 0.11206939 0.2078013  0.02279361
 0.28333042 0.02220792 0.01588374]
[0.0321816  0.04769147 0.03279797 0.16096247 0.43943391 0.03213274
 0.20346789 0.02502365 0.02630831]
[0.16  0.724 0.014 0.034 0.014 0.008 0.03  0.004 0.012]
[0.17308484 0.30640741 0.01151856 0.08157408 0.02208819 0.0069698
 0.37886418 0.00477733 0.01471561]


In [259]:
#Copy-Paste the Feature Importances

rf_features=[0.09293744, 0.0916828,  0.01569767, 0.02648281, 0.05161642, 0.19368762,
 0.05686529, 0.01160524, 0.1740301,  0.28539461]
et_features=[0.13174811, 0.02616867, 0.01129091, 0.02579529, 0.09254832, 0.43862129,
 0.03836753, 0.02556649, 0.03009955, 0.17979385]
ada_features=[0.01,  0.184, 0.024, 0.016, 0.014, 0.014, 0.052, 0.008, 0.64,  0.038]
gb_features=[0.06535086, 0.15975638, 0.00579944, 0.01609369, 0.01991735, 0.01796385,
 0.08568087, 0.00328791, 0.24858379, 0.37756586]

In [260]:
#CREATE DATAFRAME OF THE FEATURE IMPORTANCES OF THE 4 MODELS

feature_df=pd.DataFrame({'features':list(train.drop('Survived',axis=1).columns),'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features})
feature_df['mean']=feature_df.mean(axis=1)
feature_df

Unnamed: 0,AdaBoost feature importances,Extra Trees feature importances,Gradient Boost feature importances,Random Forest feature importances,features,mean
0,0.01,0.131748,0.065351,0.092937,Pclass,0.075009
1,0.184,0.026169,0.159756,0.091683,Age,0.115402
2,0.024,0.011291,0.005799,0.015698,Parch,0.014197
3,0.016,0.025795,0.016094,0.026483,Embarked,0.021093
4,0.014,0.092548,0.019917,0.051616,cabin_known,0.044521
5,0.014,0.438621,0.017964,0.193688,male,0.166068
6,0.052,0.038368,0.085681,0.056865,Fam_on_board,0.058228
7,0.008,0.025566,0.003288,0.011605,Alone,0.012115
8,0.64,0.0301,0.248584,0.17403,Cost_per_ticketholder,0.273178
9,0.038,0.179794,0.377566,0.285395,Title,0.220189


In [30]:
## Second Level Predictions ##
### USE OF THE FIRST LEVEL OUTPUTS AS NEW FEATURES AND USE IT FOR THE SECOND LEVEL PREDICTIONS USING XGB Classifier##

base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })

#Making the New Training & Testing Sets
x_train=np.concatenate((et_oof_train,rf_oof_train,ada_oof_train,gb_oof_train,svc_oof_train),axis=1)
x_test=np.concatenate((et_oof_test,rf_oof_test,ada_oof_test,gb_oof_test,svc_oof_test),axis=1)

gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)

In [31]:
## Create Submission file ##

predictions=gbm.predict(x_test)
Submission=pd.DataFrame({ 'PassengerId': passenger_ID,'Survived': predictions})
Submission.to_csv("Submission.csv", index=False)