In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score

## Loading Dataset

In [2]:
train = pd.read_csv('/kaggle/input/titanic/train.csv', index_col= 0)

In [3]:
X_test = pd.read_csv('/kaggle/input/titanic/test.csv', index_col= 0)
y_test = pd.read_csv('/kaggle/input/titanic/gender_submission.csv', index_col= 0)

In [4]:
train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Analysis

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
train.describe(exclude= 'object')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [8]:
train['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [9]:
train['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [10]:
train['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [11]:
train['SibSp'].value_counts()

SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

In [12]:
train['Parch'].value_counts()

Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64

## Preprocessing

In [13]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy= 'median')),
    ('scaler', StandardScaler())
])

In [14]:
cat_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('onehot', OneHotEncoder(sparse_output= False))
])

In [15]:
num_features = ['Age', 'SibSp', 'Parch', 'Fare']
cat_features = ['Pclass', 'Sex', 'Embarked']

preprocessing = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [16]:
X_train = preprocessing.fit_transform(train)

In [17]:
X_train

array([[-0.56573646,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.66386103,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276197, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [18]:
preprocessing.get_feature_names_out()

array(['num__Age', 'num__SibSp', 'num__Parch', 'num__Fare',
       'cat__Pclass_0.0', 'cat__Pclass_1.0', 'cat__Pclass_2.0',
       'cat__Sex_0.0', 'cat__Sex_1.0', 'cat__Embarked_0.0',
       'cat__Embarked_1.0', 'cat__Embarked_2.0'], dtype=object)

In [19]:
y_train = train['Survived']

## Testing Models

In [20]:
rnd_forest_clf = RandomForestClassifier(random_state= 42)

In [21]:
rnd_forest_clf.fit(X_train, y_train)

In [22]:
forest_scores = cross_val_score(rnd_forest_clf, X_train, y_train, cv= 10)

In [23]:
forest_scores

array([0.74444444, 0.79775281, 0.75280899, 0.80898876, 0.87640449,
       0.83146067, 0.83146067, 0.7752809 , 0.83146067, 0.84269663])

In [24]:
forest_scores.mean()

0.8092759051186016

In [25]:
svc = SVC()

In [26]:
svc.fit(X_train, y_train)

In [27]:
svc_scores = cross_val_score(svc, X_train, y_train, cv= 10)

In [28]:
svc_scores.mean()

0.8249438202247191

In [29]:
knn_clf = KNeighborsClassifier()

In [30]:
knn_clf.fit(X_train, y_train)

In [31]:
knn_scores = cross_val_score(knn_clf, X_train, y_train, cv= 10)

In [32]:
knn_scores.mean()

0.8069912609238452

It seems that SVM works better than RandomForest and KNN in this case.

## On test set

In [33]:
X_test = preprocessing.fit_transform(X_test)

In [34]:
cross_val_score(svc, X_test, y_test.values.ravel(), cv= 10)

array([1.        , 1.        , 1.        , 0.95238095, 0.97619048,
       1.        , 1.        , 1.        , 0.95121951, 1.        ])

In [35]:
y_pred = svc.predict(X_test)

In [36]:
confusion_matrix(y_test, y_pred)

array([[255,  11],
       [ 15, 137]])

In [37]:
f1_score(y_test, y_pred)

0.9133333333333332