In [1]:
import numpy as np
import pandas as pd 
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
test.shape

(418, 11)

In [5]:
train.shape

(891, 12)

* Feature engg
* Missing values
* One Hot encoding

In [6]:
combined = pd.concat([train,test],ignore_index=True)

In [7]:
combined.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
combined.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [9]:
combined.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [10]:
cabins = ['C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30',
       'E121', 'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36',
       'B102', 'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42',
       'C148', 'B45', 'B36', 'A21', 'D34', 'A9', 'C31', 'B61', 'C53',
       'D43', 'C130', 'C132', 'C55 C57', 'C116', 'F', 'A29', 'C6', 'C28',
       'C51', 'C97', 'D22', 'B10', 'E45', 'E52', 'A11', 'B11', 'C80',
       'C89', 'F E46', 'B26', 'F E57', 'A18', 'E60', 'E39 E41',
       'B52 B54 B56', 'C39', 'B24', 'D40', 'D38', 'C105']

In [11]:
def cabin_labels(x):
    if x in cabins:
        return('Cabin_Avbl')
    else:
        return('Missing')

In [12]:
combined['Cabin_cat'] = combined.Cabin.apply(cabin_labels)

In [13]:
# pd.crosstab(combined.Survived,combined.Cabin_cat).plot(kind='bar')

In [14]:
# Name:
titles = []
for i in combined.Name:
    titles.append(i.split(",")[1].split(".")[0])

In [15]:
combined['Titles'] = pd.Series(titles)

In [16]:
combined.Titles.unique()

array([' Mr', ' Mrs', ' Miss', ' Master', ' Don', ' Rev', ' Dr', ' Mme',
       ' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer', ' Dona'], dtype=object)

In [17]:
titles_ignore = [' Don', ' Rev', ' Dr', ' Mme',
                ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer', ' Dona']

In [18]:
def notitle(x):
    if x in titles_ignore:
        return('others')
    else:
        return(x)

In [19]:
combined['Titles']=combined.Titles.apply(notitle)

In [20]:
combined.drop(['PassengerId','Name','Cabin','Ticket'],axis=1,inplace=True)

In [21]:
combined['Family'] = combined.SibSp+combined.Parch+1

In [22]:
def parivar(x):
    if x==1:
        return('Solo')
    elif x==2:
        return('Duo')
    elif x<=4:
        return('Small')
    else:
        return('Large')

In [23]:
combined['Family_cat'] = combined['Family'].apply(parivar)

In [24]:
# pd.crosstab(combined.Family_cat,combined.Survived)

In [25]:
# Missing values

In [26]:
combined.groupby('Titles')['Age'].unique()

Titles
 Master    [2.0, 7.0, 11.0, 4.0, nan, 0.83, 12.0, 1.0, 9....
 Miss      [26.0, 4.0, 58.0, 14.0, 15.0, 8.0, nan, 18.0, ...
 Mr        [22.0, 35.0, nan, 54.0, 20.0, 39.0, 34.0, 28.0...
 Mrs       [38.0, 35.0, 27.0, 14.0, 55.0, 31.0, nan, 40.0...
 Ms                                              [28.0, nan]
others     [40.0, 42.0, 51.0, 44.0, 54.0, 24.0, 23.0, 52....
Name: Age, dtype: object

In [27]:
missing_titles = combined.loc[combined.Age.isnull(),'Titles'].unique()

In [28]:
for i in missing_titles:
    combined.loc[combined.Age.isnull(),'Age'] =\
    combined.loc[combined.Titles==i,'Age'].median()

In [29]:
combined.loc[combined.Embarked.isnull(),'Embarked']=combined.Embarked.mode()[0]

In [30]:
combined.loc[combined.Fare.isnull(),'Fare']=combined.Fare.median()

In [31]:
# Split the data back in train and test

newtrain = combined.loc[0:train.shape[0]-1,]

newtest = combined.loc[train.shape[0]:,]

newtest.shape  ,newtrain.shape

((418, 12), (891, 12))

In [32]:
x = newtrain.drop(['Survived'],axis=1)
y = newtrain.Survived.astype(int)

newtest.drop(['Survived'],axis=1,inplace=True)

### Model Building


In [33]:
submission = pd.DataFrame({'PassengerId':test.PassengerId,
                          'Survived':0})
submission.to_csv('basemodel.csv',index=False)

In [34]:
# dummyfy the data
newx = pd.get_dummies(x,drop_first=True)
newtest1 = pd.get_dummies(newtest,drop_first=True)

In [35]:
from sklearn.linear_model import LogisticRegression

lg =LogisticRegression()
pred = lg.fit(newx,y).predict(newtest1)

In [36]:
submission = pd.DataFrame({'PassengerId':test.PassengerId,
                          'Survived':pred})
submission.to_csv('logistic.csv',index=False)

In [37]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [38]:
vif = []
for i in range(newx.shape[1]):
    vif.append(variance_inflation_factor(newx.values,i))
pd.DataFrame({'columns':newx.columns,'vif':vif})

Unnamed: 0,columns,vif
0,Pclass,2.887608
1,Age,1.564016
2,SibSp,386.127423
3,Parch,208.163779
4,Fare,1.716397
5,Family,1541.844876
6,Sex_male,45.570269
7,Embarked_Q,1.546592
8,Embarked_S,1.49512
9,Cabin_cat_Missing,2.228914


In [39]:
import statsmodels.api as sma

In [40]:
model = sma.Logit(y,newx).fit()
model.summary()

         Current function value: 0.397365
         Iterations: 35




0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,17.0
Date:,"Fri, 19 Jan 2024",Pseudo R-squ.:,0.4033
Time:,09:46:44,Log-Likelihood:,-354.05
converged:,False,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,6.198e-91

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pclass,-0.8026,0.187,-4.292,0.000,-1.169,-0.436
Age,-0.0234,0.009,-2.505,0.012,-0.042,-0.005
SibSp,-30.2309,1.25e+05,-0.000,1.000,-2.46e+05,2.46e+05
Parch,-30.1232,1.25e+05,-0.000,1.000,-2.46e+05,2.46e+05
Fare,0.0034,0.003,1.314,0.189,-0.002,0.009
Family,30.1511,1.25e+05,0.000,1.000,-2.46e+05,2.46e+05
Sex_male,-25.4061,1.25e+05,-0.000,1.000,-2.46e+05,2.46e+05
Embarked_Q,0.0330,0.401,0.082,0.935,-0.754,0.819
Embarked_S,-0.3189,0.254,-1.256,0.209,-0.816,0.179


In [41]:
newx.dtypes

Pclass                 int64
Age                  float64
SibSp                  int64
Parch                  int64
Fare                 float64
Family                 int64
Sex_male               uint8
Embarked_Q             uint8
Embarked_S             uint8
Cabin_cat_Missing      uint8
Titles_ Miss           uint8
Titles_ Mr             uint8
Titles_ Mrs            uint8
Titles_ Ms             uint8
Titles_others          uint8
Family_cat_Large       uint8
Family_cat_Small       uint8
Family_cat_Solo        uint8
dtype: object

p value is high i.e features are insignificant
lot of multicollinearity so remove it.

In [42]:
subset = newx.drop(columns=['Parch','SibSp','Sex_male'],axis=0)


In [43]:
vif = []
for i in range(newx.shape[1]):
    vif.append(variance_inflation_factor(newx.values,i))
pd.DataFrame({'columns':newx.columns,'vif':vif}).sort_values(by='vif',ascending=False)

Unnamed: 0,columns,vif
5,Family,1541.844876
2,SibSp,386.127423
3,Parch,208.163779
6,Sex_male,45.570269
10,Titles_ Miss,38.104557
12,Titles_ Mrs,29.624656
11,Titles_ Mr,8.477798
15,Family_cat_Large,8.368818
17,Family_cat_Solo,3.209859
16,Family_cat_Small,2.982278


In [44]:
model = sma.Logit(y,subset).fit()
model.summary()

         Current function value: 0.421190
         Iterations: 35




0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,876.0
Method:,MLE,Df Model:,14.0
Date:,"Fri, 19 Jan 2024",Pseudo R-squ.:,0.3675
Time:,09:46:47,Log-Likelihood:,-375.28
converged:,False,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,3.084e-84

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pclass,-0.2969,0.168,-1.773,0.076,-0.625,0.031
Age,-0.0131,0.009,-1.504,0.133,-0.030,0.004
Fare,0.0097,0.003,3.240,0.001,0.004,0.016
Family,0.5728,0.170,3.372,0.001,0.240,0.906
Embarked_Q,0.1156,0.394,0.293,0.769,-0.657,0.888
Embarked_S,-0.0706,0.241,-0.293,0.769,-0.542,0.401
Cabin_cat_Missing,-0.7748,0.314,-2.468,0.014,-1.390,-0.159
Titles_ Miss,0.8299,0.449,1.847,0.065,-0.051,1.711
Titles_ Mr,-1.9628,0.468,-4.195,0.000,-2.880,-1.046


#### Now remove the high pvalues

In [45]:
feats = []

for i in model.pvalues:
    if i < 0.05:
        feats.append(model.pvalues.index)
        

In [46]:
localdf = pd.DataFrame(model.pvalues,columns=['Pvalues']).reset_index()

In [47]:
feats = list(localdf.loc[localdf['Pvalues']<0.05,'index'])

In [48]:
feats

['Fare',
 'Family',
 'Cabin_cat_Missing',
 'Titles_ Mr',
 'Titles_ Mrs',
 'Family_cat_Large',
 'Family_cat_Solo']

## Modelling:

### Model of important features

#### Logistic Regression:

In [49]:
newfeats = subset.loc[:,feats]

model = sma.Logit(y,newfeats).fit()

model.summary()

Optimization terminated successfully.
         Current function value: 0.434542
         Iterations 6


0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,884.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 19 Jan 2024",Pseudo R-squ.:,0.3474
Time:,09:46:51,Log-Likelihood:,-387.18
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,6.329e-86

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Fare,0.0109,0.003,4.200,0.000,0.006,0.016
Family,0.4258,0.098,4.363,0.000,0.235,0.617
Cabin_cat_Missing,-0.9245,0.232,-3.977,0.000,-1.380,-0.469
Titles_ Mr,-2.5722,0.203,-12.698,0.000,-2.969,-2.175
Titles_ Mrs,0.8667,0.293,2.958,0.003,0.292,1.441
Family_cat_Large,-4.4873,0.639,-7.018,0.000,-5.740,-3.234
Family_cat_Solo,0.9603,0.219,4.387,0.000,0.531,1.389


In [50]:
model.params

Fare                 0.010875
Family               0.425778
Cabin_cat_Missing   -0.924455
Titles_ Mr          -2.572236
Titles_ Mrs          0.866743
Family_cat_Large    -4.487282
Family_cat_Solo      0.960324
dtype: float64

In [51]:
newtest1.loc[:, feats].head(1)

Unnamed: 0,Fare,Family,Cabin_cat_Missing,Titles_ Mr,Titles_ Mrs,Family_cat_Large,Family_cat_Solo
891,7.8292,1,1,1,0,0,1


In [52]:
# rop = 1/(1+np.exp(-logit))   # probability
model.predict(newtest1.loc[:,feats].head(1))

891    0.116557
dtype: float64

In [53]:
# rop[891]

In [54]:
model.summary()

0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,884.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 19 Jan 2024",Pseudo R-squ.:,0.3474
Time:,09:46:54,Log-Likelihood:,-387.18
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,6.329e-86

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Fare,0.0109,0.003,4.200,0.000,0.006,0.016
Family,0.4258,0.098,4.363,0.000,0.235,0.617
Cabin_cat_Missing,-0.9245,0.232,-3.977,0.000,-1.380,-0.469
Titles_ Mr,-2.5722,0.203,-12.698,0.000,-2.969,-2.175
Titles_ Mrs,0.8667,0.293,2.958,0.003,0.292,1.441
Family_cat_Large,-4.4873,0.639,-7.018,0.000,-5.740,-3.234
Family_cat_Solo,0.9603,0.219,4.387,0.000,0.531,1.389


In [55]:
np.exp(-2.5722)

0.0763673523042484

In [56]:
mcfadden = 1 - (model.llf/model.llnull)
print('Mcfadden R2:',mcfadden)

Mcfadden R2: 0.3474488787775447


In [57]:
L0 = np.exp(model.llnull)
L1 = np.exp(model.llf)

cox_snell = 1 - (L0/L1)**(2/newx.shape[0])

In [58]:
nagalkerke = cox_snell/(1-L0**(2/newx.shape[0]))

print('McFadden',mcfadden)
print('Cox-Snell',cox_snell)
print('Nagalkerke',nagalkerke)

McFadden 0.3474488787775447
Cox-Snell 0.37044416823032167
Nagalkerke 0.5033176723723485


In [59]:
aic = (2*(newfeats.shape[1])) - (2*(model.llf)) # Akaike information criteria...

In [60]:
model.aic

788.3531401238849

In [61]:
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier,GradientBoostingClassifier,StackingClassifier,VotingClassifier
from sklearn.metrics import classification_report,cohen_kappa_score,confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score,roc_curve

In [62]:
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [63]:
y.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [64]:
x = newtrain.drop(['Survived'],axis=1)
y = newtrain.Survived.astype(int)
x1 = pd.get_dummies(x,drop_first=True)

trainx , testx , trainy, testy = train_test_split(x1,y,train_size=0.80,random_state=10,stratify=y)

In [65]:
from scipy.stats import chi2_contingency
cat_cols = x.drop(columns=['Age','Fare'])
for column in cat_cols:
    contingency_table = pd.crosstab(x[column], y)
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    print(f"Chi-square test for {column}: p-value = {p_value}")

Chi-square test for Pclass: p-value = 4.549251711298793e-23
Chi-square test for Sex: p-value = 1.1973570627755645e-58
Chi-square test for SibSp: p-value = 1.5585810465902147e-06
Chi-square test for Parch: p-value = 9.703526421039997e-05
Chi-square test for Embarked: p-value = 2.3008626481449577e-06
Chi-square test for Cabin_cat: p-value = 6.7419704360811776e-21
Chi-square test for Titles: p-value = 1.9783487591671835e-59
Chi-square test for Family: p-value = 3.579668975443533e-14
Chi-square test for Family_cat: p-value = 2.747307908074899e-16


In [66]:
from scipy.stats import ttest_ind
num_cols = ['Age','Fare']
for column in num_cols:
    group_0 = x[y == 0][column]
    group_1 = x[y == 1][column]
    
    t_statistic, p_value = ttest_ind(group_0, group_1)
    print(f"t-test for {column}: p-value = {p_value}")

t-test for Age: p-value = 0.04300040105433479
t-test for Fare: p-value = 6.120189341924198e-15


In [67]:
perf_table = pd.DataFrame(columns=['Model','Test_Data','Accuracy','Precision','Recall','F1 Score','Cohen Kappa Score',])

def model_perf(name,data,model,trainx,trainy,testx,testy):
    m = model
    m.fit(trainx,trainy)
    ypred = m.predict(testx)
    a = cohen_kappa_score(testy,ypred)
    b = accuracy_score(testy,ypred)
    c = recall_score(testy,ypred)
    d = precision_score(testy,ypred)
    e = f1_score(testy,ypred)
    global perf_table
    perf_table = perf_table.append({'Model':name,
                                    'Test_Data':data,
                                    'Accuracy':b,
                                    'Recall':c,
                                    'Precision':d,
                                    'F1 Score':e,
                                    'Cohen Kappa Score':a},
                                   ignore_index = True)

In [68]:
model_perf(name='Decision Tree',data='Train',model=DecisionTreeClassifier(max_depth=5),
          trainx=trainx,trainy=trainy,testx=trainx,testy=trainy)

In [69]:
model_perf(name='Decision Tree',data='Test',model=DecisionTreeClassifier(max_depth=5),
          trainx=trainx,trainy=trainy,testx=testx,testy=testy)

In [70]:
model_perf(name='Naive Bayes',data='Train',model=GaussianNB(),
          trainx=trainx,trainy=trainy,testx=trainx,testy=trainy)

In [71]:
model_perf(name='Naive Bayes',data='Test',model=GaussianNB(),
          trainx=trainx,trainy=trainy,testx=testx,testy=testy)

In [72]:
model_perf(name='Random Forest',data='Train',model=RandomForestClassifier(n_estimators=200,max_depth=7),
          trainx=trainx,trainy=trainy,testx=trainx,testy=trainy)

In [73]:
model_perf(name='Random Forest',data='Test',model=RandomForestClassifier(n_estimators=200,max_depth=7),
          trainx=trainx,trainy=trainy,testx=testx,testy=testy)

In [74]:
model_perf(name='Adaboost',data='Train',model=AdaBoostClassifier(n_estimators=200),
          trainx=trainx,trainy=trainy,testx=trainx,testy=trainy)

In [75]:
model_perf(name='Adaboost',data='Test',model=AdaBoostClassifier(n_estimators=200),
          trainx=trainx,trainy=trainy,testx=testx,testy=testy)

In [76]:
model_perf(name='Gradient Boosting',data='Train',model=GradientBoostingClassifier(n_estimators=200,max_depth=5),
          trainx=trainx,trainy=trainy,testx=trainx,testy=trainy)

In [77]:
model_perf(name='Gradient Boosting',data='Test',model=GradientBoostingClassifier(n_estimators=200,max_depth=5),
          trainx=trainx,trainy=trainy,testx=testx,testy=testy)

In [78]:
model_perf(name='Extreme Gradient Boosting',data='Train',model=XGBClassifier(n_estimators=300,max_depth=5),
          trainx=trainx,trainy=trainy,testx=trainx,testy=trainy)

In [79]:
model_perf(name='Extreme Gradient Boosting',data='Test',model=XGBClassifier(n_estimators=300,max_depth=5),
          trainx=trainx,trainy=trainy,testx=testx,testy=testy)

In [81]:
base_estimators = [('Logistic Regression',LogisticRegression()),('Naive Bayes',GaussianNB()),
                  ('Decision Tree',DecisionTreeClassifier(max_depth=5)),('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=7))]
model_perf(name='Voting Classifier',data='Train',model=VotingClassifier(base_estimators),trainx=trainx,trainy=trainy,
          testx=trainx,testy=trainy)
model_perf(name='Voting Classifier',data='Test',model=VotingClassifier(base_estimators),trainx=trainx,trainy=trainy,
          testx=testx,testy=testy)

In [82]:
## Stacking algorithm
base_estimators = [('Naive Bayes',GaussianNB()),
                  ('Decision Tree',DecisionTreeClassifier(max_depth=5)),
                   
                  ('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=5))]

model_perf(name='Stacking-logreg',data='Train',model=StackingClassifier(estimators=base_estimators),trainx=trainx,trainy=trainy,
          testx=trainx,testy=trainy)
model_perf(name='Stacking-logreg',data='Test',model=StackingClassifier(estimators=base_estimators),trainx=trainx,trainy=trainy,
          testx=testx,testy=testy)

In [84]:
## Stacking algorithm
base_estimators = [('Naive Bayes',GaussianNB()),
                  ('Decision Tree',DecisionTreeClassifier(max_depth=5)),
                   
                  ('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=5))]
final_estimator = GaussianNB()
model_perf(name='Stacking-naivebayes',data='Train',model=StackingClassifier(estimators=base_estimators,final_estimator=final_estimator),trainx=trainx,trainy=trainy,
          testx=trainx,testy=trainy)
model_perf(name='Stacking-naivebayes',data='Test',model=StackingClassifier(estimators=base_estimators,final_estimator=final_estimator),trainx=trainx,trainy=trainy,
          testx=testx,testy=testy)

In [89]:
## Stacking algorithm
base_estimators = [('Naive Bayes',GaussianNB()),
                   
                  ('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=7))]

model_perf(name='Stacking-logreg',data='Train',model=StackingClassifier(estimators=base_estimators),trainx=trainx,trainy=trainy,
          testx=trainx,testy=trainy)
model_perf(name='Stacking-logreg',data='Test',model=StackingClassifier(estimators=base_estimators),trainx=trainx,trainy=trainy,
          testx=testx,testy=testy)

In [90]:
perf_table

Unnamed: 0,Model,Test_Data,Accuracy,Precision,Recall,F1 Score,Cohen Kappa Score
0,Decision Tree,Train,0.858146,0.858333,0.754579,0.803119,0.692967
1,Decision Tree,Test,0.810056,0.786885,0.695652,0.738462,0.590224
2,Naive Bayes,Train,0.787921,0.790476,0.608059,0.687371,0.530999
3,Naive Bayes,Test,0.782123,0.788462,0.594203,0.677686,0.517987
4,Random Forest,Train,0.894663,0.926724,0.787546,0.851485,0.770705
5,Random Forest,Test,0.849162,0.85,0.73913,0.790698,0.673688
6,Adaboost,Train,0.853933,0.816479,0.798535,0.807407,0.689784
7,Adaboost,Test,0.815642,0.757143,0.768116,0.76259,0.611918
8,Gradient Boosting,Train,0.978933,0.992366,0.952381,0.971963,0.955101
9,Gradient Boosting,Test,0.821229,0.784615,0.73913,0.761194,0.61854


In [92]:
def model_validation(model,trainx,trainy,testx,testy):
    m = model
    m.fit(trainx,trainy)
    ypred = m.predict(testx)
    print('1.Confusion Matrix:\n',confusion_matrix(testy,ypred))
    print('2.Classification report:\n',classification_report(testy,ypred))
    print('3.Cohen Kappa Score:',cohen_kappa_score(testy,ypred))
    print('4.Accuracy Score:',accuracy_score(testy,ypred))

In [88]:
model_validation(GaussianNB(),trainx,trainy,trainx,trainy)

1.Confusion Matrix:
 [[395  44]
 [107 166]]
2.Classification report:
               precision    recall  f1-score   support

           0       0.79      0.90      0.84       439
           1       0.79      0.61      0.69       273

    accuracy                           0.79       712
   macro avg       0.79      0.75      0.76       712
weighted avg       0.79      0.79      0.78       712

3.Cohen Kappa Score: 0.5309986215079656
4.Accuracy Score: 0.7879213483146067


In [89]:
tuned_paramaters = [{'criterion': ['entropy','gini'],
                     'max_depth': [5,10],  # 5,6,7,
                     'max_features': ["sqrt", "log2"], # it can be either square root of the feature or log of number of features
                     'min_samples_split': [2,5,8], # 1-3% of the total records
                     'min_samples_leaf': [1,5,9], # 1-3% of the min_sample_split
                     'max_leaf_nodes': [5,8]
                     }]

dt = DecisionTreeClassifier(random_state = 10)

tree_grid = GridSearchCV(estimator = dt, # The estimator can be is logistic, Linear reg, decision tree
                         param_grid = tuned_paramaters,
                         cv = 5)

In [90]:
tree_grid_model = tree_grid.fit(trainx, trainy)
print('Best parameters for decision tree classifier: ', tree_grid_model.best_params_, '\n')

Best parameters for decision tree classifier:  {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 8, 'min_samples_leaf': 1, 'min_samples_split': 2} 



In [91]:
model_validation(DecisionTreeClassifier(criterion='gini',max_depth=5,max_features='sqrt',\
                                        max_leaf_nodes=8,min_samples_leaf=1,min_samples_split=2)\
                 ,trainx,trainy,testx,testy)

1.Confusion Matrix:
 [[102   8]
 [ 17  52]]
2.Classification report:
               precision    recall  f1-score   support

           0       0.86      0.93      0.89       110
           1       0.87      0.75      0.81        69

    accuracy                           0.86       179
   macro avg       0.86      0.84      0.85       179
weighted avg       0.86      0.86      0.86       179

3.Cohen Kappa Score: 0.6978596988724597
4.Accuracy Score: 0.8603351955307262


In [92]:
model_validation(RandomForestClassifier(n_estimators=200,max_depth=7),trainx,trainy,testx,testy)

1.Confusion Matrix:
 [[101   9]
 [ 19  50]]
2.Classification report:
               precision    recall  f1-score   support

           0       0.84      0.92      0.88       110
           1       0.85      0.72      0.78        69

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179

3.Cohen Kappa Score: 0.6606635071090048
4.Accuracy Score: 0.8435754189944135


In [94]:
gb = GaussianNB()
gb.fit(x1,y)
pred = gb.predict(newtest1)

In [110]:
sm = SMOTE()
trainx_os , trainy_os = sm.fit_resample(trainx,trainy)

In [125]:
base_estimators = [('Naive Bayes',GaussianNB()),
                  ('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=6))]
model_validation(StackingClassifier(estimators=base_estimators),trainx_os,trainy_os,testx,testy)

1.Confusion Matrix:
 [[101   9]
 [ 15  54]]
2.Classification report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89       110
           1       0.86      0.78      0.82        69

    accuracy                           0.87       179
   macro avg       0.86      0.85      0.86       179
weighted avg       0.87      0.87      0.86       179

3.Cohen Kappa Score: 0.7123342707914826
4.Accuracy Score: 0.8659217877094972


In [95]:
submission = pd.DataFrame({'PassengerId':test.PassengerId,
                          'Survived':pred})
submission.to_csv('naivebayes.csv',index=False)

In [99]:
rf = RandomForestClassifier(n_estimators=200,max_depth=7)
rf.fit(x1,y)
pred = rf.predict(newtest1)

submission = pd.DataFrame({'PassengerId':test.PassengerId,
                          'Survived':pred})
submission.to_csv('randomforest.csv',index=False)

In [88]:
st = StackingClassifier(estimators=base_estimators)
st.fit(x1,y)
pred = st.predict(newtest1)
submission = pd.DataFrame({'PassengerId':test.PassengerId,
                          'Survived':pred})
submission.to_csv('stacking2.csv',index=False)

#### Apply Cat-Boost Algorithm:

In [100]:
trainx,testx,trainy,testy = train_test_split(x,y,train_size=0.80,random_state=10)

In [111]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier(cat_features=['Sex','Embarked','Cabin_cat','Titles','Family_cat'],\
                         verbose=200,n_estimators=2000,max_depth=16)
model_validation(cat,trainx,trainy,trainx,trainy)

Learning rate set to 0.00472
0:	learn: 0.6899940	total: 52.5ms	remaining: 1m 45s
200:	learn: 0.4257966	total: 54.1s	remaining: 8m 4s
400:	learn: 0.3382171	total: 3m 10s	remaining: 12m 39s
600:	learn: 0.2895074	total: 5m 53s	remaining: 13m 42s
800:	learn: 0.2586773	total: 8m 24s	remaining: 12m 34s
1000:	learn: 0.2350601	total: 11m 8s	remaining: 11m 6s
1200:	learn: 0.2154215	total: 14m 25s	remaining: 9m 35s
1400:	learn: 0.1969423	total: 17m 58s	remaining: 7m 41s
1600:	learn: 0.1772777	total: 21m 14s	remaining: 5m 17s
1800:	learn: 0.1579872	total: 24m 49s	remaining: 2m 44s
1999:	learn: 0.1387510	total: 29m 9s	remaining: 0us
1.Confusion Matrix:
 [[412  20]
 [ 53 227]]
2.Classification report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92       432
           1       0.92      0.81      0.86       280

    accuracy                           0.90       712
   macro avg       0.90      0.88      0.89       712
weighted avg       0.90      

In [112]:
cat = CatBoostClassifier(cat_features=['Sex','Embarked','Cabin_cat','Titles','Family_cat'],\
                         verbose=200,n_estimators=1200,max_depth=12)

cat.fit(x,y)
pred = cat.predict(newtest)
submission = pd.DataFrame({'PassengerId':test.PassengerId,
                          'Survived':pred})
submission.to_csv('catboost.csv',index=False)

Learning rate set to 0.008297
0:	learn: 0.6869217	total: 381ms	remaining: 7m 37s
200:	learn: 0.3531255	total: 23.2s	remaining: 1m 55s
400:	learn: 0.2840996	total: 48s	remaining: 1m 35s
600:	learn: 0.2464967	total: 1m 11s	remaining: 1m 10s
800:	learn: 0.2165758	total: 1m 37s	remaining: 48.5s
1000:	learn: 0.1892011	total: 2m	remaining: 24s
1199:	learn: 0.1640716	total: 2m 26s	remaining: 0us
