## Importing Required Modules

In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score, confusion_matrix, classification_report

warnings.filterwarnings('ignore')

## Loading Dataset From the Source

In [2]:
# Base location of the workshop's Titanic CSV files.
DATA_URL = "https://raw.githubusercontent.com/Guhan-B/Carte-Blanch-Machine-Learning-Workshop/main"

titanic_train = pd.read_csv(f"{DATA_URL}/titanic_train.csv")

titanic_test = pd.read_csv(f"{DATA_URL}/titanic_test.csv")
titanic_test_results = pd.read_csv(f"{DATA_URL}/titanic_test_results.csv")

# Attach the ground-truth labels by joining on PassengerId instead of comparing
# the two frames row-by-row: a keyed merge does not depend on both files being
# in the same row order, and it returns a brand-new frame, so no column is
# assigned onto a filtered slice (which raises SettingWithCopyWarning).
# validate="one_to_one" fails loudly if either file repeats a PassengerId.
titanic_test = titanic_test.merge(
    titanic_test_results[["PassengerId", "Survived"]],
    on="PassengerId",
    how="inner",
    validate="one_to_one",
)

## Exploratory Data Analysis

In [3]:
# Preview the first rows of the raw training data (DataFrame.head defaults to 5 rows).
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the test data, which now carries the attached "Survived" labels.
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [5]:
# Report how many rows each split contains.
print(f"Size of the training data: {len(titanic_train)}")
print(f"Size of the testing data: {len(titanic_test)}")

Size of the training data: 891
Size of the testing data: 418


In [6]:
# Count missing values per column in each raw split to see which columns need cleaning.
for split_name, frame in (("training", titanic_train), ("testing", titanic_test)):
    print(f"\nCheck for NULL values in each column in {split_name} data \n")
    print(frame.isna().sum())


Check for NULL values in each column in training data 

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Check for NULL values in each column in testing data 

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Survived         0
dtype: int64


In [7]:
# Columns kept for modelling (plus the "Survived" label); the same list is used
# for both splits so they cannot drift apart.
selected_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Survived"]

# .copy() gives each filtered frame its own data, so the cleaning steps that
# follow modify these frames explicitly rather than a selection of the raw
# frames (which pandas flags with SettingWithCopyWarning).
filtered_titanic_train = titanic_train[selected_columns].copy()
filtered_titanic_test = titanic_test[selected_columns].copy()

In [8]:
# Preview the training data after restricting it to the selected columns.
filtered_titanic_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,22.0,1,0,7.25,0
1,1,female,38.0,1,0,71.2833,1
2,3,female,26.0,0,0,7.925,1
3,1,female,35.0,1,0,53.1,1
4,3,male,35.0,0,0,8.05,0


In [9]:
# Preview the test data after restricting it to the selected columns.
filtered_titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,34.5,0,0,7.8292,0
1,3,female,47.0,1,0,7.0,1
2,2,male,62.0,0,0,9.6875,0
3,3,male,27.0,0,0,8.6625,0
4,3,female,22.0,1,1,12.2875,1


In [10]:
# Impute missing ages with the mean age of the TRAINING data. The same
# training-derived value is reused for the test split so that no statistic
# computed from test rows enters preprocessing.
mean_age = filtered_titanic_train["Age"].mean()

# Assign the filled column back to the frame. `frame["Age"].fillna(..., inplace=True)`
# is chained assignment: it operates on a temporary Series, is deprecated in
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
filtered_titanic_train = filtered_titanic_train.assign(
    Age=filtered_titanic_train["Age"].fillna(mean_age)
)
filtered_titanic_test = filtered_titanic_test.assign(
    Age=filtered_titanic_test["Age"].fillna(mean_age)
)

In [11]:
# One-hot encode the categorical "Sex" column in both splits; pd.get_dummies
# replaces it with one indicator column per category (Sex_female / Sex_male).
filtered_titanic_train, filtered_titanic_test = [
    pd.get_dummies(frame, columns=["Sex"])
    for frame in (filtered_titanic_train, filtered_titanic_test)
]

In [12]:
# Re-check missing values per column after imputation and encoding.
frames_to_check = {
    "training": filtered_titanic_train,
    "testing": filtered_titanic_test,
}
for split_name, frame in frames_to_check.items():
    print("\nCheck for NULL values in each column in {} data \n".format(split_name))
    print(frame.isna().sum())


Check for NULL values in each column in training data 

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Survived      0
Sex_female    0
Sex_male      0
dtype: int64

Check for NULL values in each column in testing data 

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          1
Survived      0
Sex_female    0
Sex_male      0
dtype: int64


In [14]:
# Drop any test row that still has a missing value (the null check above
# reports one missing "Fare"); the classifiers are given only complete rows.
filtered_titanic_test = filtered_titanic_test.dropna(axis="index", how="any")

In [15]:
# Columns fed to the models; Sex_female / Sex_male are the indicator columns
# produced by pd.get_dummies on "Sex".
features = ["Pclass", "Sex_female", "Sex_male", "Age", "SibSp", "Parch", "Fare"]
# Column holding the label the models predict.
target = "Survived"

In [16]:
# Separate each cleaned split into its feature matrix (X) and label vector (Y).
titanic_train_X, titanic_train_Y = (
    filtered_titanic_train[features],
    filtered_titanic_train[target],
)
titanic_test_X, titanic_test_Y = (
    filtered_titanic_test[features],
    filtered_titanic_test[target],
)

## Training Models

#### Decision Tree Model

In [17]:
# Fit an entropy-criterion decision tree on the training split (fit() returns
# the fitted estimator), then report its accuracy on the held-out test split.
decision_tree_model = DecisionTreeClassifier(criterion="entropy", random_state=42).fit(
    titanic_train_X, titanic_train_Y
)
tree_test_accuracy = decision_tree_model.score(titanic_test_X, titanic_test_Y)
print(f"Accuracy: {tree_test_accuracy * 100:.4f}%")

Accuracy: 79.6163%


In [19]:
# Same test-set accuracy as printed by the fitting cell, shown unrounded as the cell's output value.
decision_tree_model.score(titanic_test_X, titanic_test_Y) * 100

79.61630695443645

In [None]:
# `evaluate_model` is only defined later, in the "Evaluation Metrics" section,
# so calling it here raises NameError on a fresh Restart & Run All. Use the
# already-imported scikit-learn report instead; the full evaluate_model summary
# for this model is produced in that later section.
print(classification_report(titanic_test_Y, decision_tree_model.predict(titanic_test_X)))

#### Naive Bayes Model

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split (fit() returns
# the fitted estimator), then report its accuracy on the held-out test split.
naive_bayes_model = GaussianNB().fit(titanic_train_X, titanic_train_Y)
bayes_test_accuracy = naive_bayes_model.score(titanic_test_X, titanic_test_Y)
print(f"Accuracy: {bayes_test_accuracy * 100:.4f}%")

## Evaluation Metrics

In [None]:
def evaluate_model(model, X, y):
    """Print a metric summary for ``model``'s predictions on ``X`` against ``y``.

    Prints accuracy, precision, recall, F0.5 and F1 as percentages (four
    decimal places), followed by scikit-learn's classification report.
    Precision, recall and the F-scores are computed with scikit-learn's
    default settings.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Feature data passed to ``model.predict``.
        y: True labels matching the rows of ``X``.

    Returns:
        None. All results are written to standard output.
    """
    predicted = model.predict(X)

    # Every score is computed up front, in the order it is later printed.
    scores = [
        ("Accuracy", accuracy_score(y, predicted)),
        ("Precision", precision_score(y, predicted)),
        ("Recall", recall_score(y, predicted)),
        ("F 0.5 Score", fbeta_score(y, predicted, beta=0.5)),
        ("F 1 Score", fbeta_score(y, predicted, beta=1)),
    ]
    for label, score in scores:
        print(f"{label}: {score * 100:.4f}%")

    print("\nClassification Report\n")

    print(classification_report(y, predicted))

In [None]:
def save_confusion_matrix(model, X, y):
    """Display an annotated 2x2 confusion-matrix heatmap for a binary classifier.

    Each cell shows its outcome name (true/false negative/positive), the raw
    count and its share of all samples. Despite the function's name, the
    figure is only shown with ``plt.show()``; nothing is written to disk.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Feature data passed to ``model.predict``.
        y: True labels matching the rows of ``X``; expected to be encoded as
            0 (negative, shown as "No") and 1 (positive, shown as "Yes").

    Returns:
        None.
    """
    y_prediction = model.predict(X)
    categories = ["No", "Yes"]
    group_names = ["True Negative", "False Positive", "False Negative", "True Positive"]

    # Pin the label order so the matrix is always 2x2 and lines up with
    # group_names, even if one class never occurs in y or in the predictions.
    matrix = confusion_matrix(y, y_prediction, labels=[0, 1])

    cell_values = matrix.flatten()
    group_counts = ["{0:0.0f}".format(value) for value in cell_values]
    group_percentages = ["{0:.2%}".format(value) for value in cell_values / np.sum(matrix)]
    labels = [
        f"{v1}\n{v2}\n{v3}"
        for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
    ]
    labels = np.asarray(labels).reshape(2, 2)

    fig, ax = plt.subplots()
    sns.heatmap(
        matrix,
        annot=labels,
        fmt="",
        cmap="Blues",
        xticklabels=categories,
        yticklabels=categories,
        ax=ax,
    )
    ax.set_title("Confusion Matrix")
    # scikit-learn's confusion matrix has true labels on the rows and
    # predictions on the columns; the heatmap draws rows along the y-axis and
    # columns along the x-axis.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Real")
    plt.show()

#### Decision Tree

In [None]:
# Decision tree metrics on the training split (the data the model was fitted on).
evaluate_model(decision_tree_model, titanic_train_X, titanic_train_Y)

In [None]:
# Decision tree metrics on the held-out test split.
evaluate_model(decision_tree_model, titanic_test_X, titanic_test_Y)

In [None]:
# Decision tree confusion matrix on the training split.
save_confusion_matrix(decision_tree_model, titanic_train_X, titanic_train_Y)

In [None]:
# Decision tree confusion matrix on the held-out test split.
save_confusion_matrix(decision_tree_model, titanic_test_X, titanic_test_Y)

#### Naive Bayes

In [None]:
# Naive Bayes metrics on the training split (the data the model was fitted on).
evaluate_model(naive_bayes_model, titanic_train_X, titanic_train_Y)

In [None]:
# Naive Bayes metrics on the held-out test split.
evaluate_model(naive_bayes_model, titanic_test_X, titanic_test_Y)

In [None]:
# Naive Bayes confusion matrix on the training split.
save_confusion_matrix(naive_bayes_model, titanic_train_X, titanic_train_Y)

In [None]:
# Naive Bayes confusion matrix on the held-out test split.
save_confusion_matrix(naive_bayes_model, titanic_test_X, titanic_test_Y)

## Additional Models