# **Titanic - Machine Learning from Disaster**
This notebook uses scikit-learn **pipelines** to make the machine learning code more compact and efficient.

# 1) Create X (predictors) and y (target)

## Import Data Analytics libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Import Machine Learning Libraries

In [2]:
from sklearn.pipeline import Pipeline

# To perform operations on columns:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# ML algorithms:
from xgboost import XGBClassifier

# To evaluate performance model:
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_val_score
from sklearn.metrics import mean_absolute_error, confusion_matrix, accuracy_score

## Get file paths
Running this will list all files under the input directory

In [3]:
import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Read the data

In [4]:
# Both CSVs are indexed by PassengerId so the id stays attached to every row
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'

X_y = pd.read_csv(train_path, index_col='PassengerId')
X_test = pd.read_csv(test_path, index_col='PassengerId')

##  Separate target from predictors

In [5]:
# Rows without a Survived label cannot be used for training, so drop them first
X_y.dropna(subset=['Survived'], inplace=True)

# Target vector (y) and predictor frame (X) taken from the labelled data
y = X_y['Survived']
X = X_y.drop(columns=['Survived'])

# 2) Get summary info of X and y 

In [6]:
# Preview the first two rows of the predictors
X.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [7]:
# Column dtypes and non-null counts for the training predictors
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


In [8]:
# Same summary for the test predictors, to compare missing-value patterns with X
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [9]:
# Preview the first two target values
y.head(2)

PassengerId
1    0
2    1
Name: Survived, dtype: int64

In [10]:
# Summary statistics of the target; for a 0/1 target the mean is the share of 1s
y.describe()

count    891.000000
mean       0.383838
std        0.486592
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

# 3) Cleaning the data

## Remove the columns with more than half missing values

In [11]:
# Number of missing entries per training column; show only columns that have any
null_values = X.isna().sum()
null_values.loc[null_values.gt(0)]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [12]:
# Number of missing entries per test column; show only columns that have any
null_values_test = X_test.isna().sum()
null_values_test.loc[null_values_test.gt(0)]

Age       86
Fare       1
Cabin    327
dtype: int64

In [13]:
# Training columns in which more than half of the rows are missing
missing_per_column = X.isnull().sum()
half_of_rows = X.shape[0] / 2
null_columns = missing_per_column[missing_per_column > half_of_rows].index.tolist()
null_columns

['Cabin']

In [14]:
# Drop the mostly-empty columns (chosen on the training set) from BOTH frames
# so train and test keep the same columns.
# Reassigning instead of inplace=True, together with errors='ignore', makes this
# cell safe to re-run: a second run finds the columns already gone and does
# nothing instead of raising KeyError.
X = X.drop(columns=null_columns, errors='ignore')
X_test = X_test.drop(columns=null_columns, errors='ignore')

# 4) Feature Selection

## Select categorical columns with relatively low cardinality (convenient but arbitrary)

In [15]:
# "Cardinality" means the number of unique values in a column
categorical_cols = [cname for cname in X.columns 
                    if X[cname].nunique() < 10 and
                    X[cname].dtype == "object"]

## Select numerical columns

In [16]:
# Every int64 / float64 column is treated as a numerical feature
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

## Keep selected columns only

In [17]:
# Keep only the selected features: categorical columns first, then numerical ones
my_cols = categorical_cols + numerical_cols

# .copy() makes X and X_test independent frames. Without it, pandas marks the
# column subset as derived from another frame, and the later .loc assignments
# that impute Age emit SettingWithCopyWarning.
X = X[my_cols].copy()
X_test = X_test[my_cols].copy()

In [18]:
# Preview the predictors after feature selection
X.head(2)

Unnamed: 0_level_0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,male,S,3,22.0,1,0,7.25
2,female,C,1,38.0,1,0,71.2833


## Impute missing Age values with the mean age of each (Pclass, Sex) group

In [19]:
# Missing Age counts (train, test) before imputation
missing_age_train = X["Age"].isnull().sum()
missing_age_test = X_test["Age"].isnull().sum()
print(missing_age_train, missing_age_test)

177 86


In [20]:
# Mean training-set age for every (Pclass, Sex) combination
age_by_class_and_sex = X.groupby(['Pclass', 'Sex'])['Age']
avg_ages = age_by_class_and_sex.mean()
avg_ages

Pclass  Sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
        male      26.507589
Name: Age, dtype: float64

In [21]:
# Fill each missing Age with the rounded training-set mean age of the
# passenger's (Pclass, Sex) group. The same training means are applied to X_test.
# NOTE(review): avg_ages is computed on all of X before the grid search
# cross-validates on X, so validation folds contribute to these means —
# consider moving this imputation inside the pipeline.
for pclass in X.Pclass.unique():
    for sex in X.Sex.unique():
        group_age = round(avg_ages.loc[pclass, sex])
        print(group_age, end=', ')

        # Rows of this group whose Age is still missing
        train_gap = (X.Pclass == pclass) & (X.Sex == sex) & X.Age.isnull()
        test_gap = (X_test.Pclass == pclass) & (X_test.Sex == sex) & X_test.Age.isnull()

        X.loc[train_gap, ['Age']] = group_age
        X_test.loc[test_gap, ['Age']] = group_age

27, 22, 41, 35, 31, 29, 

In [22]:
# Missing Age counts (train, test) after imputation
missing_age_train = X["Age"].isnull().sum()
missing_age_test = X_test["Age"].isnull().sum()
print(missing_age_train, missing_age_test)

0 0


# 5) Model Creation

## Preprocessing Pipelines

### Preprocessing for numerical data

In [23]:
# Numerical columns: replace any remaining missing value with the column median
numerical_transformer = SimpleImputer(strategy='median')

### Preprocessing for categorical data

In [24]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps categories unseen during fit from
# raising an error at transform time.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(handle_unknown='ignore')

categorical_transformer = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('onehot', cat_encoder),
])

### Bundle preprocessing for numerical and categorical data

In [25]:
# Route each column group to its own transformer
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

## Model Pipeline

In [26]:
# Gradient-boosted tree classifier with default hyper-parameters
xgb = XGBClassifier()

# Chain preprocessing and the model so both are fitted and applied as one estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', xgb),
]
classifier = Pipeline(steps=pipeline_steps)

## Visualize the pipeline

In [27]:
from sklearn import set_config
# Switch scikit-learn's estimator display to 'diagram' mode, then show the pipeline
set_config(display='diagram')
classifier

# 6) Training and Testing Model

## Grid Search & Cross Validation

In [28]:
# List every parameter name of the pipeline; these are the keys a GridSearchCV
# param_grid can address (nested steps are reached with the `step__param` syntax)
classifier.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'model', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__add_indicator', 'preprocessor__num__copy', 'preprocessor__num__fill_value', 'preprocessor__num__missing_values', 'preprocessor__num__strategy', 'preprocessor__num__verbose', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__imputer', 'preprocessor__cat__onehot', 'preprocessor__cat__imputer__add_indicator', 'preprocessor__cat__imputer__copy', 'preprocessor__cat__imputer__fill_value', 'preprocessor__cat__imputer__missing_values', 'preprocessor__cat__imputer__strategy', 'preprocessor__cat__imputer__verbose', 'preprocessor__cat__onehot__categories', 'preprocessor__cat__onehot__drop', 'preproc

In [29]:
# Search space for the XGBoost step ("model__" targets the 'model' pipeline step):
# 3 learning rates x 3 tree counts x 3 depths = 27 candidates
xgb_search_space = {
    "model__learning_rate": [.03, .02, .01],
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [5, 6, 7],
}
param_grid = [xgb_search_space]

# 4-fold cross-validated exhaustive search; verbose=3 logs every fold's score
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=4, verbose=3)
grid_search.fit(X, y);

Fitting 4 folds for each of 27 candidates, totalling 108 fits
[CV 1/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=100;, score=0.794 total time=   0.5s
[CV 2/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=100;, score=0.843 total time=   0.5s
[CV 3/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=100;, score=0.839 total time=   0.5s
[CV 4/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=100;, score=0.847 total time=   0.5s
[CV 1/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=200;, score=0.789 total time=   0.8s
[CV 2/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=200;, score=0.843 total time=   0.9s
[CV 3/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=200;, score=0.825 total time=   0.9s
[CV 4/4] END model__learning_rate=0.03, model__max_depth=5, model__n_estimators=200;, score=0.851 total time=   0.9s
[C

In [30]:
# Report the winning hyper-parameters, their mean cross-validated score, and the
# score on the full training set (optimistic: the model was fitted on this data)
print(
    "Best params:",
    grid_search.best_params_,
    "Best score in grid search:",
    grid_search.best_score_,
    "Best XGBoost on whole trained data:",
    grid_search.score(X, y),
    sep="\n",
)

Best params:
{'model__learning_rate': 0.01, 'model__max_depth': 7, 'model__n_estimators': 300}
Best score in grid search:
0.8384034258473722
Best XGBoost on whole trained data:
0.8945005611672279


## Create Confusion Matrix (for the training data)

In [31]:
# Predictions on the same data the model was trained on
y_pred = grid_search.predict(X)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_true=y, y_pred=y_pred)
print(cm)

# Training accuracy (shown as the cell output)
accuracy_score(y_true=y, y_pred=y_pred)

[[522  27]
 [ 67 275]]


0.8945005611672279

# 7) Predicting using Model

## Generate test predictions

In [32]:
# Predict on the test set with the fitted grid search. predict() only applies the
# already-fitted pipeline (preprocessing + model); nothing is fitted in this cell.
preds_test = grid_search.predict(X_test)

## Save output to CSV file

In [33]:
# One row per test passenger: the PassengerId (the frame's index) and the prediction
submission_columns = {
    'PassengerId': X_test.index,
    'Survived': preds_test,
}
output = pd.DataFrame(submission_columns)

# index=False: the row index is not written, PassengerId is already a column
output.to_csv('submission.csv', index=False)

## Check the submission file before submitting your results

In [34]:
# Read the written file back and preview it as a sanity check
submission_path = "submission.csv"
submission_data = pd.read_csv(submission_path)
submission_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
