In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
train_dataset = pd.read_csv('train.csv')

In [3]:
test_dataset = pd.read_csv('test.csv')

In [4]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
print(train_dataset['Survived'].value_counts())
print(train_dataset['Sex'].value_counts())
print(train_dataset['Embarked'].value_counts())
print(train_dataset['Pclass'].value_counts())

0    549
1    342
Name: Survived, dtype: int64
male      577
female    314
Name: Sex, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64
3    491
1    216
2    184
Name: Pclass, dtype: int64


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class DatasetSelector(BaseEstimator,TransformerMixin):
    
    def __init__(self, attr_names):
        self.attr_names = attr_names
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        return X[self.attr_names]

## Numeric Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('select_num',DatasetSelector(["Age", "SibSp", "Parch", "Fare"])),
    ('impute',SimpleImputer(strategy='mean'))
])
num_pipeline.fit_transform(train_dataset)[:10]

array([[22.        ,  1.        ,  0.        ,  7.25      ],
       [38.        ,  1.        ,  0.        , 71.2833    ],
       [26.        ,  0.        ,  0.        ,  7.925     ],
       [35.        ,  1.        ,  0.        , 53.1       ],
       [35.        ,  0.        ,  0.        ,  8.05      ],
       [29.69911765,  0.        ,  0.        ,  8.4583    ],
       [54.        ,  0.        ,  0.        , 51.8625    ],
       [ 2.        ,  3.        ,  1.        , 21.075     ],
       [27.        ,  0.        ,  2.        , 11.1333    ],
       [14.        ,  1.        ,  0.        , 30.0708    ]])

## Categorical Pipeline

### Cant use SimpleImputer for categorical data 

In [8]:
class MostFreqImputer(BaseEstimator, TransformerMixin):
    
    def fit(self,X,y=None):
        self.most_freq = pd.Series([X[count].value_counts().index[0] for count in X],index=X.columns)
        return self
    
    def transform(self,X,y=None):
        return X.fillna(self.most_freq)

In [9]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('select_cat',DatasetSelector(["Pclass", "Sex", "Embarked"])),
    ('impute',MostFreqImputer()),
    ('cat_encode',OneHotEncoder(sparse=False)),
])
cat_pipeline.fit_transform(train_dataset)

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [10]:
from sklearn.pipeline import FeatureUnion

# combine both the pipeline
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [11]:
X_train = full_pipeline.fit_transform(train_dataset)
X_train[:4]

array([[22.    ,  1.    ,  0.    ,  7.25  ,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ],
       [38.    ,  1.    ,  0.    , 71.2833,  1.    ,  0.    ,  0.    ,
         1.    ,  0.    ,  1.    ,  0.    ,  0.    ],
       [26.    ,  0.    ,  0.    ,  7.925 ,  0.    ,  0.    ,  1.    ,
         1.    ,  0.    ,  0.    ,  0.    ,  1.    ],
       [35.    ,  1.    ,  0.    , 53.1   ,  1.    ,  0.    ,  0.    ,
         1.    ,  0.    ,  0.    ,  0.    ,  1.    ]])

In [12]:
y_train = list(train_dataset['Survived'])
y_train[:10]

[0, 1, 1, 1, 0, 0, 0, 0, 1, 1]

In [13]:
from sklearn.model_selection import cross_val_score

## Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8115355805243445

## XgBoost

In [28]:
from xgboost import XGBClassifier

xg_clf = XGBClassifier(random_state=42)
xg_scores = cross_val_score(xg_clf, X_train, y_train, cv=10)
xg_scores.mean()

0.8170911360799

In [16]:
xg_clf.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Hyperparameter Tuning

In [23]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_grid

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [24]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.0min finished


{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

In [35]:
best_random = rf_random.best_estimator_
ran_scores = cross_val_score(best_random, X_train, y_train, cv=10)
ran_scores.mean()

0.8316978776529338

In [44]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'min_samples_leaf': [1, 3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'max_depth': [None],
    'max_features': ['sqrt'],
    'n_estimators': [100,200,300,400,500,600,700,800,900]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [45]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.1min finished


{'bootstrap': True,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 300}

In [46]:
best_grid = grid_search.best_estimator_
grid_scores = cross_val_score(best_grid, X_train, y_train, cv=10)
grid_scores.mean()

0.8294756554307117