In [34]:
import pandas as pd

# Preprocessing

## Load Data

In [35]:
df = pd.read_csv('datasets/train.csv', index_col='PassengerId')
df.drop("Name", axis=1, inplace=True)
df.drop("Ticket", axis=1, inplace=True)
df.pop("Cabin")
df.head(10)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S
6,0,3,male,,0,0,8.4583,Q
7,0,1,male,54.0,0,0,51.8625,S
8,0,3,male,2.0,3,1,21.075,S
9,1,3,female,27.0,0,2,11.1333,S
10,1,2,female,14.0,1,0,30.0708,C


In [36]:
classCol = df.pop("Survived")
classCol

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

## Obtain Categorical and Numerical Columns

In [37]:
categoricalCols = df.columns[df.dtypes == 'object']
categoricalCols

Index(['Sex', 'Embarked'], dtype='object')

In [38]:
categoricalCols = df.select_dtypes("object").columns
categoricalCols

Index(['Sex', 'Embarked'], dtype='object')

In [39]:
dfNum = df.drop(categoricalCols, axis = 1)
dfCat = df[categoricalCols]

## Process nulls in Categorical Variables

In [40]:
from sklearn.impute import SimpleImputer

impCat = SimpleImputer(strategy = 'most_frequent')
columns = dfCat.columns
index = dfCat.index
dfCat = pd.DataFrame(impCat.fit_transform(dfCat), columns = columns, index = index)

In [41]:
dfCat.isnull().any()

Sex         False
Embarked    False
dtype: bool

In [42]:
dfCat

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


## Convert Categorical into Numerical

In [43]:
from sklearn import preprocessing
from collections import defaultdict

d = defaultdict(preprocessing.LabelEncoder)

dfCatLe = dfCat.apply(lambda col: d[col.name].fit_transform(col))
dfCatLe.head(10)

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,0,0
3,0,2
4,0,2
5,1,2
6,1,1
7,1,2
8,1,2
9,0,2
10,0,0


In [44]:
inverse = dfCatLe.apply(lambda col: d[col.name].inverse_transform(col))
inverse

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


In [45]:
transformed = inverse.apply(lambda col: d[col.name].transform(col))
transformed

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,0,0
3,0,2
4,0,2
5,1,2
...,...,...
887,1,2
888,0,2
889,0,2
890,1,0


Disadvantage: LabelEncoder introduces a "false order" in categories. We can use OneHotEncoder instead, that fixes this problem.

In [46]:
dfCat

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


In [47]:
ohe = preprocessing.OneHotEncoder(sparse = False)
dfCatOhe = pd.DataFrame(ohe.fit_transform(dfCat),
                        columns = ohe.get_feature_names_out(dfCat.columns.tolist()),
                        index = dfCat.index)

In [48]:
dfCatOhe

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0
888,1.0,0.0,0.0,0.0,1.0
889,1.0,0.0,0.0,0.0,1.0
890,0.0,1.0,1.0,0.0,0.0


## Process nulls in Numerical Variables

In [49]:
impNum = SimpleImputer(strategy = "mean")
columns = dfNum.columns
index = dfNum.index
dfNum = pd.DataFrame(impNum.fit_transform(dfNum), columns = columns, index = index)
dfNum.isnull().any()

Pclass    False
Age       False
SibSp     False
Parch     False
Fare      False
dtype: bool

In [50]:
dfNum

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.0,22.000000,1.0,0.0,7.2500
2,1.0,38.000000,1.0,0.0,71.2833
3,3.0,26.000000,0.0,0.0,7.9250
4,1.0,35.000000,1.0,0.0,53.1000
5,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...
887,2.0,27.000000,0.0,0.0,13.0000
888,1.0,19.000000,0.0,0.0,30.0000
889,3.0,29.699118,1.0,2.0,23.4500
890,1.0,26.000000,0.0,0.0,30.0000


Now, we merge both processed dataframes (categorical and numerical)

In [51]:
dfPreprocessed = pd.merge(left = dfCatOhe,
                          right = dfNum,
                          on = "PassengerId")
dfPreprocessed

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,1.0,0.0,0.0,1.0,3.0,22.000000,1.0,0.0,7.2500
2,1.0,0.0,1.0,0.0,0.0,1.0,38.000000,1.0,0.0,71.2833
3,1.0,0.0,0.0,0.0,1.0,3.0,26.000000,0.0,0.0,7.9250
4,1.0,0.0,0.0,0.0,1.0,1.0,35.000000,1.0,0.0,53.1000
5,0.0,1.0,0.0,0.0,1.0,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0,2.0,27.000000,0.0,0.0,13.0000
888,1.0,0.0,0.0,0.0,1.0,1.0,19.000000,0.0,0.0,30.0000
889,1.0,0.0,0.0,0.0,1.0,3.0,29.699118,1.0,2.0,23.4500
890,0.0,1.0,1.0,0.0,0.0,1.0,26.000000,0.0,0.0,30.0000


# Train & Test Split

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dfPreprocessed, classCol, test_size = 0.3, random_state = 1)

print(X_train)

             Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  Pclass  \
PassengerId                                                                     
115                 1.0       0.0         1.0         0.0         0.0     3.0   
875                 1.0       0.0         1.0         0.0         0.0     2.0   
77                  0.0       1.0         0.0         0.0         1.0     3.0   
877                 0.0       1.0         0.0         0.0         1.0     3.0   
675                 0.0       1.0         0.0         0.0         1.0     2.0   
...                 ...       ...         ...         ...         ...     ...   
716                 0.0       1.0         0.0         0.0         1.0     3.0   
768                 1.0       0.0         0.0         1.0         0.0     3.0   
73                  0.0       1.0         0.0         0.0         1.0     2.0   
236                 1.0       0.0         0.0         0.0         1.0     3.0   
38                  0.0     

# Decision Trees

In [53]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(min_samples_split = 5, min_samples_leaf = 3, max_depth = 3, random_state = 1)
tree.fit(X_train, y_train)
tree

DecisionTreeClassifier(max_depth=3, min_samples_leaf=3, min_samples_split=5,
                       random_state=1)

In [54]:
y_pred = tree.predict(X_test)
y_pred

array([1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0], dtype=int64)

In [56]:
from sklearn.metrics import confusion_matrix

conf = pd.DataFrame(
        confusion_matrix(y_test, y_pred),
        columns = ["Predicted 0", "Predicted 1"],
        index = ["Actual 0", "Actual 1"]
)
conf

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,141,12
Actual 1,49,66


# Cross-Validation

In [57]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree, X_train, y_train, cv = 5)
scores

array([0.824     , 0.84      , 0.88      , 0.84677419, 0.83064516])

In [58]:
from sklearn.model_selection import cross_validate

metric = ["precision", "recall_macro", "recall_weighted"]
scores = cross_validate(tree, X_train, y_train, scoring = metric, cv = 5)
scores

{'fit_time': array([0.02070999, 0.01135755, 0.00799608, 0.01570344, 0.00962114]),
 'score_time': array([0.023525  , 0.01785922, 0.01878738, 0.01869321, 0.00840569]),
 'test_precision': array([0.87096774, 0.80952381, 0.94285714, 0.86111111, 0.85294118]),
 'test_recall_macro': array([0.775     , 0.81893231, 0.84603742, 0.81279887, 0.79057665]),
 'test_recall_weighted': array([0.824     , 0.84      , 0.88      , 0.84677419, 0.83064516])}

# Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100, random_state = 1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

conf = pd.DataFrame(
        confusion_matrix(y_test, y_pred_rf),
        columns = ["Predicted 0", "Predicted 1"],
        index = ["True 0", "True 1"]
)

conf

Unnamed: 0,Predicted 0,Predicted 1
True 0,136,17
True 1,43,72


# SVM

In [66]:
from sklearn.svm import SVC

svm = SVC(C=10.0)
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

conf = pd.DataFrame(
    confusion_matrix(y_test, y_pred_svm),
    columns = ["Predicted 0", "Predicted 1"],
    index = ["True 0", "True 1"]
)

conf

Unnamed: 0,Predicted 0,Predicted 1
True 0,138,15
True 1,69,46


In [72]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.74      0.93      0.83       153
           1       0.87      0.57      0.68       115

    accuracy                           0.78       268
   macro avg       0.80      0.75      0.76       268
weighted avg       0.79      0.78      0.77       268



# SVM Scaled

In [69]:
from sklearn.preprocessing import MinMaxScaler

nms = MinMaxScaler()
dfNumSc = pd.DataFrame(nms.fit_transform(dfNum), columns = dfNum.columns, index = dfNum.index)
dfPreprocessedSc = pd.merge(left = dfCatOhe, right = dfNumSc, on="PassengerId")

X_train_sc, X_test_sc, y_train_sc, y_test_sc = train_test_split(dfPreprocessedSc, classCol, test_size = 0.3, random_state = 1)

svm.fit(X_train_sc, y_train_sc)
y_pred_svm = svm.predict(X_test_sc)

conf = pd.DataFrame(
        confusion_matrix(y_test_sc, y_pred_svm),
        columns = ["Predicted 0", "Predicted 1"],
        index = ["True 0", "True 1"]
)

conf

Unnamed: 0,Predicted 0,Predicted 1
True 0,143,10
True 1,50,65


In [71]:
print(classification_report(y_test_sc, y_pred_svm))

              precision    recall  f1-score   support

           0       0.74      0.93      0.83       153
           1       0.87      0.57      0.68       115

    accuracy                           0.78       268
   macro avg       0.80      0.75      0.76       268
weighted avg       0.79      0.78      0.77       268



# Naive-Bayes

In [74]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred_nb_gaussian = nb.predict(X_test)

conf = pd.DataFrame(
    confusion_matrix(y_test, y_pred_nb_gaussian),
    columns = ["Predicted 0", "Predicted 1"],
    index = ["True 0", "True 1"]
)

conf

Unnamed: 0,Predicted 0,Predicted 1
True 0,126,27
True 1,36,79
