# Titanic Servival Prediction

In [13]:
# import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2

In [2]:
# Load the data

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# drop the collumn which we are not interested in
df.drop(columns=['deck', 'embark_town', 'alive'], axis=1, inplace=True)

In [4]:
# train test split
X = df.drop('survived', axis=1)
y = df['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   pclass      712 non-null    int64   
 1   sex         712 non-null    object  
 2   age         572 non-null    float64 
 3   sibsp       712 non-null    int64   
 4   parch       712 non-null    int64   
 5   fare        712 non-null    float64 
 6   embarked    710 non-null    object  
 7   class       712 non-null    category
 8   who         712 non-null    object  
 9   adult_male  712 non-null    bool    
 10  alone       712 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(3), object(3)
memory usage: 52.3+ KB


### Pipeline-> missing value - ohe - scalling - feature selection(if needed) - model selection


In [6]:
# step 1
# impute the missing value
trf1 = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), [2]),
        ('cat', SimpleImputer(strategy='most_frequent'), [6])
    ],
    remainder='passthrough'
)

In [7]:
# step 2
# encode the categorical data
trf2 = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), [0, 1,2,3,4,5,6,7,8,9,10]),
],remainder='passthrough')

In [8]:
# step 3
# Scalling 
trf3 = ColumnTransformer([
    ('scaler', MinMaxScaler(),[2,5])
], remainder='passthrough')


In [33]:
# trf4 = SelectKBest(score_func=chi2, k=12)

In [34]:
trf5 = DecisionTreeClassifier()

## Create the Pipeline

In [35]:
# Create the pipeline
pipe = Pipeline(
    [('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    # ('trf4', trf4),
    ('trf5', trf5)

])


In [36]:
# train the pipeline
pipe.fit(X_train, y_train)

In [38]:
# select the best selected features
# get the feature names after one hot encoding
# ohe_feature_names = pipe.named_steps['trf2'].named_transformers_['ohe'].get_feature_names_out(X_train.columns)
# get the support from SelectKBest
# selected_features = ohe_feature_names[pipe.named_steps['trf4'].get_support()]
# selected_features

In [39]:
# predict the model
y_pred = pipe.predict(X_test)

# print the result
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Mean Squared Error: 0.19553072625698323
R2 Score: 0.1936936936936937
Accuracy Score: 0.8044692737430168


In [None]:
# cross-validation 
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()



np.float64(0.7781148429035754)

In [41]:
# Exporting the model
import pickle
pickle.dump(pipe, open('titanic_model.pkl', 'wb'))