In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
data=pd.read_csv('../input/titanicdataset-traincsv/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Cardinality - The number/variety of values that a feature can assume
# For eg : City, Pincode etc

In [5]:
# Problems with high cardinality
# Varibales with too many labels tend to dominate over variables with less labels
# There is a chance that some categories of this feature maybe present only in train set and not test set and vice versa
# Introduces noise

In [6]:
len(data['Sex'].unique()),len(data['Cabin'].unique())

(2, 148)

In [7]:
# Sex - Low cardinality
# Cabin - High cardinality 

In [8]:
# Now, when cardinality ia high, we cannot do One Hot encoding, as this will lead to huge number of features

In [9]:
data[['Cabin','Sex']]

Unnamed: 0,Cabin,Sex
0,,male
1,C85,female
2,,female
3,C123,female
4,,male
...,...,...
886,,male
887,B42,female
888,,female
889,C148,male


In [10]:
# Now make, all cabin values starting with A into category A and so on

In [11]:
data['Cabin'].str[0].unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [12]:
data['Cabin'].fillna('N').str[0].unique()

array(['N', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [13]:
# We see that, the cardinality has been reduced from 148 --> 9

In [14]:
data['Cabin_reduced']=data['Cabin'].fillna('N').str[0]

In [15]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_reduced
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,N
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,N
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,N


In [16]:
len(data['Cabin_reduced'].unique()) 

9

In [17]:
use_cols = ['Cabin','Cabin_reduced','Sex']
X_train, X_test, y_train, y_test = train_test_split(data[use_cols],
                                                   data['Survived'],
                                                   test_size=0.4,
                                                   random_state=0 )

In [18]:
X_train.shape, X_test.shape

((534, 3), (357, 3))

In [19]:
train_cabin_unique=X_train['Cabin'].unique()
test_cabin_unique=X_test['Cabin'].unique()
print(len(train_cabin_unique),len(test_cabin_unique))

104 69


In [20]:
# Categories in Train and not in test
# Leads to overfitting
train_not_test=[i for i in train_cabin_unique if i not in test_cabin_unique]
print('Num of categories in train and not in test',len(train_not_test))
print(train_not_test)

Num of categories in train and not in test 80
[nan, 'B73', 'E36', 'E46', 'C111', 'E101', 'D15', 'E12', 'A32', 'B4', 'A10', 'A5', 'C95', 'E25', 'C90', 'D6', 'A36', 'D', 'D50', 'C93', 'E77', 'C101', 'D11', 'C123', 'C32', 'B35', 'C91', 'T', 'B101', 'E58', 'A23', 'B77', 'D28', 'B82 B84', 'B79', 'C45', 'C2', 'B5', 'C104', 'B20', 'A19', 'B51 B53 B55', 'B80', 'B38', 'B22', 'C22 C26', 'A16', 'D47', 'B28', 'E10', 'C46', 'B39', 'D30', 'C50', 'A34', 'C110', 'D19', 'B86', 'D35', 'C99', 'D46', 'F38', 'A24', 'D7', 'C103', 'A31', 'C82', 'F G63', 'A6', 'E50', 'B3', 'C118', 'C70', 'F E69', 'C86', 'D48', 'C49', 'A7', 'A20', 'E49']


In [21]:
# Categories in test and not in train
# Model doesn't know what to do, as it is seeing this category for first time
test_not_train=[i for i in test_cabin_unique if i not in train_cabin_unique]
print('Num of categories in train and not in test',len(test_not_train))
print(test_not_train)

Num of categories in train and not in test 45
[nan, 'B78', 'C106', 'C125', 'C7', 'C54', 'E34', 'B50', 'B41', 'C83', 'B57 B59 B63 B66', 'D45', 'D10 D12', 'C47', 'C87', 'D17', 'B42', 'D9', 'F33', 'B102', 'D49', 'F G73', 'C62 C64', 'B37', 'B58 B60', 'C85', 'B71', 'B30', 'A26', 'B69', 'E68', 'B94', 'E17', 'C128', 'A14', 'B19', 'D21', 'C148', 'C30', 'D56', 'E40', 'E31', 'E38', 'D37', 'E63']


In [22]:
# Let us see the same for Cabin_reduced

In [23]:
len([x for x in X_train['Cabin_reduced'].unique() if x not in X_test['Cabin_reduced'].unique()])

1

In [24]:
len([x for x in X_test['Cabin_reduced'].unique() if x not in X_train['Cabin_reduced'].unique()])

0

In [25]:
'''
Catergorical Encoding
1. Fill missing values
2. One hot encode the features
''' 


'\nCatergorical Encoding\n1. Fill missing values\n2. One hot encode the features\n'

In [26]:
X_train.isnull().sum(),X_test.isnull().sum()

(Cabin            407
 Cabin_reduced      0
 Sex                0
 dtype: int64,
 Cabin            280
 Cabin_reduced      0
 Sex                0
 dtype: int64)

In [27]:
X_train.fillna('0',inplace=True)
X_test.fillna('0',inplace=True)

In [28]:
X_train.isnull().sum(),X_test.isnull().sum()

(Cabin            0
 Cabin_reduced    0
 Sex              0
 dtype: int64,
 Cabin            0
 Cabin_reduced    0
 Sex              0
 dtype: int64)

In [29]:
# First combination
['Cabin','Sex']

['Cabin', 'Sex']

In [30]:
# Firstly we select only columns ['Cabin','Sex']
# Then by specifying the argument columns, we specify that only these columns must be One Hot Encoded
train = pd.get_dummies(X_train[['Cabin','Sex']],columns=['Cabin','Sex'])
test = pd.get_dummies(X_test[['Cabin','Sex']],columns=['Cabin','Sex'])

In [31]:
train.shape, test.shape

((534, 106), (357, 71))

In [32]:
# As we can see, the columns in test and train are not balanced
# Put all cols in train which are not in test into the test set

In [33]:
missing_cols=set(train.columns)-set(test.columns)
for i in missing_cols:
    test[i]=0
test.shape

(357, 150)

In [34]:
# Now select only those cols from test which are present in train

In [35]:
test=test[train.columns]

In [36]:
train.shape, test.shape

((534, 106), (357, 106))

In [37]:
# Model Building

In [47]:
def run_model(data_train, data_test, y_train, y_test):
    rfc = RandomForestClassifier(n_estimators=200,random_state=42)
    abc = AdaBoostClassifier(n_estimators=200,random_state=42)
    lrs = LogisticRegression(random_state=42)
    gbc = GradientBoostingClassifier(n_estimators=300,random_state=42)
    
    models= {
        'Random Forest Classifier' : rfc,
        'AdaBoost Classifier': abc,
        'Logistic Regression' : lrs,
        'Gradient Boosting Classsifier' : gbc
    }
    
    # One hot encoding
    train = pd.get_dummies(data_train, columns = data_train.columns)
    test = pd.get_dummies(data_test, columns = data_test.columns)
    missing_cols=set(train.columns)-set(test.columns)
    for i in missing_cols:
        test[i]=0
    test = test[train.columns]
    
    for classifier,model in models.items():
        model.fit(train,y_train)
        # Difference between predict() and predict_proba() ??
        y_train_pred = model.predict_proba(train)
        y_test_pred = model.predict_proba(test)
        
        print(classifier + ':')
        print('Train set ROC AUC Score : {0}'.format(roc_auc_score(y_train,y_train_pred[:,1])))
        print('Test set ROC AUC Score: {0}'.format(roc_auc_score(y_test,y_test_pred[:,1])))
        print()

In [48]:
# High Cardinality

In [49]:
run_model(X_train[['Cabin','Sex']],X_test[['Cabin','Sex']],y_train, y_test)

Random Forest Classifier:
Train set ROC AUC Score : 0.8729650130239166
Test set ROC AUC Score: 0.8261744743146127

AdaBoost Classifier:
Train set ROC AUC Score : 0.8699310324413924
Test set ROC AUC Score: 0.7899254724514241

Logistic Regression:
Train set ROC AUC Score : 0.8328572697134738
Test set ROC AUC Score: 0.819204152249135

Gradient Boosting Classsifier:
Train set ROC AUC Score : 0.8743858039308549
Test set ROC AUC Score: 0.8247604471652914



In [50]:
# Low Cardinality

In [51]:
run_model(X_train[['Cabin_reduced','Sex']],X_test[['Cabin_reduced','Sex']],y_train, y_test)

Random Forest Classifier:
Train set ROC AUC Score : 0.8310812810798011
Test set ROC AUC Score: 0.8101377428799574

AdaBoost Classifier:
Train set ROC AUC Score : 0.8283284986976084
Test set ROC AUC Score: 0.8166589033803567

Logistic Regression:
Train set ROC AUC Score : 0.8274109045702107
Test set ROC AUC Score: 0.8159602076124568

Gradient Boosting Classsifier:
Train set ROC AUC Score : 0.8310812810798011
Test set ROC AUC Score: 0.8101377428799574



In [None]:
# As seen from above, the overfitting of the model has reduced (the gap b/w train & test ROC AUC score has come down)