# This project will predict survivors of the Titanic.

## Importing the libraries needed for the program.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn as sk
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

## Importing the CSV Titanic dataset.

In [2]:
# Location of the Titanic CSV. Set the TITANIC_CSV environment variable to run
# this notebook on another machine; when it is unset, the original local path is used.
TITANIC_CSV = os.environ.get(
    "TITANIC_CSV", "C:/Users/gavin/Jupyter/Datasets/Titanic_Data.csv"
)
data = pd.read_csv(TITANIC_CSV)

## Begin cleaning the dataset by dropping unnecessary columns for predicting survivors.

In [3]:
# Discard the columns that are not used as features or target further down.
unused_columns = [
    "home.dest", "body", "boat", "embarked",
    "fare", "ticket", "name", "cabin",
]
data = data.drop(columns=unused_columns)

## Cleaning the data set continues by removing any rows with a missing age value.

In [4]:
# Only 'age' is checked for missing values; rows without it are removed.
data = data.dropna(axis="index", subset=["age"])

## This prints the shape of our dataset after cleaning it: 1046 rows by 6 columns.

In [5]:
# (rows, columns) remaining after the column drop and the age filter.
data.shape

(1046, 6)

## 10 rows are printed to give a visual of the current state of our dataset.

In [6]:
# Preview the first 10 rows of the cleaned frame.
data.head(10)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch
0,1,1,female,29.0,0,0
1,1,1,male,0.9167,1,2
2,1,0,female,2.0,1,2
3,1,0,male,30.0,1,2
4,1,0,female,25.0,1,2
5,1,1,male,48.0,0,0
6,1,1,female,63.0,1,0
7,1,0,male,39.0,0,0
8,1,1,female,53.0,2,0
9,1,0,male,71.0,0,0


## Here are the statistics for each column with values.

In [7]:
# Summary statistics. Only numeric columns appear in the output, so the
# still text-valued 'sex' column is absent at this point.
data.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch
count,1046.0,1046.0,1046.0,1046.0,1046.0
mean,2.207457,0.408222,29.881135,0.502868,0.42065
std,0.841497,0.49174,14.4135,0.912167,0.83975
min,1.0,0.0,0.1667,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0
50%,2.0,0.0,28.0,0.0,0.0
75%,3.0,1.0,39.0,1.0,1.0
max,3.0,1.0,80.0,8.0,6.0


## Importing label encoder from sklearn library, & naming it. This allows us to convert data to workable values.

In [8]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance that is fitted on the 'sex' column in the following cell.
labelencoder = LabelEncoder()

## Here we encode the sex column to convert it to int values.

In [9]:
# Address the column by name instead of position 2, so the encoding cannot
# silently hit the wrong column if the column set or order ever changes.
# Assigning the whole column replaces the text labels with the integer codes.
data["sex"] = labelencoder.fit_transform(data["sex"])

## We can visually see the sex column converted to 0, 1 values.

In [10]:
# Sanity check: after encoding, the column should hold only the two integer codes.
# NOTE(review): which label maps to 0 and which to 1 is not shown here — confirm via labelencoder.classes_.
print(data['sex'].unique())

[0 1]


## We split the data into X & Y datasets to prepare for training & testing.

In [11]:
# Select features and target by column name, so the choice is explicit and
# does not depend on the order of the columns in the frame.
FEATURE_COLUMNS = ["pclass", "sex", "age", "sibsp", "parch"]
TARGET_COLUMN = "survived"

X = data[FEATURE_COLUMNS].values
Y = data[TARGET_COLUMN].values

## Import train & test from sklearn library & split the data again into 80% training, 20% testing.

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## Now we implement our Decision Tree algorithm.

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree: entropy split criterion, all other settings left at their
# defaults, seeded so repeated runs build the same tree.
tree = DecisionTreeClassifier(random_state=0, criterion="entropy")
tree.fit(X_train, Y_train)

## Here we can check our algorithm's accuracy on the training data.

In [14]:
 print('[5]Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train)*100,'%')

[5]Decision Tree Classifier Training Accuracy: 91.14832535885168 %


## Our training accuracy is 91%. Let's validate this by tuning hyperparameters with different combinations to find the best result.

In [15]:
# Candidate hyperparameter values; the search evaluates every combination.
param_grid = dict(
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
    criterion=['entropy', 'gini'],
)

# 5-fold cross-validated search scored by accuracy, run on the training split only.
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, Y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_*100,'%')

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best Score: 78.58782435129741 %


## The best cross-validation score is 78.59%, well below the 91% training accuracy. This suggests the first model is overfitting: it memorizes the training data rather than learning patterns that generalize. Let's train a new model using the best parameters found by the grid search.

In [16]:
# Build the final model directly from the grid-search result instead of
# hand-copied values, so it stays in sync if the grid or the data changes.
# It is seeded like the baseline tree so the reported test metrics are repeatable.
tree_optimal = DecisionTreeClassifier(random_state=0, **grid_search.best_params_)
tree_optimal.fit(X_train, Y_train)

In [17]:
# Score the tuned tree on the held-out test split with each metric in turn.
Y_pred = tree_optimal.predict(X_test)

test_metrics = (
    ("Test Accuracy:", accuracy_score),
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1-score:", f1_score),
)
for label, metric in test_metrics:
    print(label, metric(Y_test, Y_pred)*100,'%')

Test Accuracy: 80.47619047619048 %
Test Precision: 71.42857142857143 %
Test Recall: 77.92207792207793 %
Test F1-score: 74.53416149068323 %


## Test Accuracy: 80.476: The Decision Tree Classifier correctly classifies 80.48% of the test instances. It looks like the model is doing a decent job at predicting the survivors.

## Test Precision: 71.428: Out of all the instances predicted as survivors, approximately 71.43% were actual survivors.

## Test Recall: 77.922: Out of all the actual survivors in the test set, 77.92% were correctly predicted as survivors by our model. It may miss some survivors when detecting them.

## Test F1-score: 74.534: This is the harmonic mean of precision and recall. An F1-score of 74.53% indicates that our model is doing a good job of balancing precision and recall, but still has some room for improvement.

## Overall, the model is performing reasonably well, but there's still much to improve on. 
## We could try: 
## Feature engineering: Extracting more informative features from the dataset to improve the model's performance.
## Hyperparameter tuning: Trying out different hyperparameters or using more advanced techniques like Bayesian optimization.
## Ensemble methods: Combining multiple models to create more accurate predictions.
## * Coming Soon *