In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Titanic.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
x = df.drop('Survived',axis = 1)

In [6]:
y = df['Survived']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train , x_test , y_train , y_test = train_test_split(
    x,y,test_size=0.2,stratify=y
)

In [9]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 667 to 349
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          571 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        164 non-null    object 
 10  Embarked     711 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [10]:
num_attributes = x_train.select_dtypes(include=(np.number)).columns

In [11]:
cat_attributes = x_train.select_dtypes(exclude=(np.number)).columns

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , StandardScaler

In [13]:
num_pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy="mean")),
    ("scaler",StandardScaler())
])

In [14]:
cat_pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy='constant',fill_value="Unknown")),
    ("OHE",OneHotEncoder(handle_unknown="ignore"))
])

In [15]:
preprocessor = ColumnTransformer([
    ("num",num_pipeline,num_attributes),
    ("cat",cat_pipeline,cat_attributes)
])

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
model = Pipeline([
    ("preprocessor",preprocessor),
    ("model",LogisticRegression(max_iter=1000))
])

In [18]:
model.fit(x_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [19]:
from sklearn.metrics import classification_report

In [20]:
y_pred = model.predict(x_test)

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       110
           1       0.74      0.72      0.73        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [22]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


[[92 18]
 [19 50]]


In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rf_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        random_state=42
    ))
])

rf_model.fit(x_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
y_pred_rf = rf_model.predict(x_test)

print("Confusion Matrix (RF):")
print(confusion_matrix(y_test, y_pred_rf))

print("\nClassification Report (RF):")
print(classification_report(y_test, y_pred_rf))


Confusion Matrix (RF):
[[108   2]
 [ 43  26]]

Classification Report (RF):
              precision    recall  f1-score   support

           0       0.72      0.98      0.83       110
           1       0.93      0.38      0.54        69

    accuracy                           0.75       179
   macro avg       0.82      0.68      0.68       179
weighted avg       0.80      0.75      0.72       179



In [26]:
from sklearn.metrics import roc_auc_score

In [27]:
y_prob = rf_model.predict_proba(x_test)[:,1]
print("Roc auc: ",roc_auc_score(y_test,y_prob))

Roc auc:  0.8172595520421607


In [28]:
one_hot_encoder = rf_model.named_steps["preprocessor"]\
                          .named_transformers_["cat"]\
                          .named_steps["OHE"]


In [29]:
cat_names = one_hot_encoder.get_feature_names_out(cat_attributes)
features_name = np.concatenate([num_attributes,cat_names])

In [30]:
importances = rf_model.named_steps["model"].feature_importances_

In [31]:
for name , score in sorted(
    zip(features_name,importances),
    key = lambda x:x[1],
    reverse = True
):
    print(name, round(score ,3))

Sex_male 0.122
Sex_female 0.106
Fare 0.071
Pclass 0.054
Cabin_Unknown 0.053
Parch 0.033
Age 0.029
SibSp 0.024
PassengerId 0.014
Embarked_C 0.013
Embarked_S 0.011
Ticket_110152 0.008
Ticket_PC 17572 0.007
Ticket_113760 0.007
Cabin_E101 0.007
Cabin_B96 B98 0.006
Cabin_B20 0.006
Cabin_B49 0.006
Cabin_C52 0.005
Ticket_347082 0.005
Ticket_SC 1748 0.004
Ticket_29106 0.004
Ticket_2666 0.004
Cabin_B77 0.004
Ticket_19996 0.004
Ticket_PC 17611 0.003
Ticket_11751 0.003
Name_Smith, Miss. Marion Elsie 0.003
Name_Sjoblom, Miss. Anna Sofia 0.003
Ticket_17474 0.003
Name_Masselmani, Mrs. Fatima 0.003
Ticket_CA 2144 0.003
Ticket_PP 9549 0.003
Name_Kelly, Mrs. Florence "Fannie" 0.003
Ticket_24160 0.003
Cabin_F33 0.003
Cabin_D33 0.003
Ticket_16966 0.003
Name_Hassab, Mr. Hammad 0.003
Name_Carter, Mr. William Ernest 0.003
Ticket_250644 0.003
Name_Astor, Mrs. John Jacob (Madeleine Talmadge Force) 0.003
Cabin_B79 0.003
Cabin_D36 0.002
Ticket_C.A. 31921 0.002
Ticket_36928 0.002
Ticket_PC 17485 0.002
Ticket_119