# Introduction

## Importing libraries

In [102]:
# Import libraries

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import sklearn.model_selection as ms 
from IPython.display import Image
from IPython.core.display import HTML 
import re 
import seaborn as sns
import plotly.express as px 
from plotly.subplots import make_subplots

## Loading data

In [103]:
# Mount Google Drive into the Colab filesystem at /content/drive/;
# the dataset is read from this mount point in the following cell.
# NOTE: google.colab is provided by the Colab runtime, so this cell is Colab-specific.

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [104]:
# Read the Titanic passenger records from the mounted Drive folder.
titanic_csv_path = '/content/drive/MyDrive/upgrade-hub_Data_Analytics/Modulo1/220620_Titanic/titanic.csv'
titanic = pd.read_csv(titanic_csv_path)

## Dataset information


In [105]:
# Dataset dimensions as (number of rows, number of columns)
titanic.shape

(891, 12)

In [106]:
# Column names (the data type of each column is displayed in the next cell)
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [107]:
# Data type of each column
display(titanic.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [108]:
# Summarise missing values: per-column count and share of all rows,
# restricted to the columns that actually contain gaps.
missing_data = titanic.isna().sum().loc[lambda counts: counts > 0]
percentage_missing = missing_data.div(titanic.shape[0]).mul(100)
titanic_missing = pd.DataFrame(
    {"Count (n)": missing_data, "Percentage (%)": percentage_missing}
)
print(titanic_missing.sort_values("Count (n)", ascending=False))

          Count (n)  Percentage (%)
Cabin           687       77.104377
Age             177       19.865320
Embarked          2        0.224467


In [109]:
# Cabin is missing for most passengers (see the summary above),
# so the column is dropped rather than imputed.
titanic = titanic.drop(columns="Cabin")

In [110]:
# Impute missing ages with the mean of the observed ages.
# The stated rationale is that Age is not correlated with the other variables
# (NOTE(review): this notebook does not verify that claim).
# NOTE(review): the mean is taken over every row, i.e. before the train/test
# split further down, so test-set ages contribute to the imputed value.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())

In [111]:
# Re-run the missing-value summary after dropping Cabin and imputing Age:
# per-column count and percentage of rows, only for columns that still have gaps.
null_counts = titanic.isna().sum()
missing_data = null_counts[null_counts.gt(0)]
percentage_missing = missing_data.div(len(titanic)).mul(100)
titanic_missing = pd.concat(
    [missing_data.rename("Count (n)"), percentage_missing.rename("Percentage (%)")],
    axis=1,
)
print(titanic_missing.sort_values("Count (n)", ascending=False))

          Count (n)  Percentage (%)
Embarked          2        0.224467


In [112]:
# Name is not used as a feature in this analysis, so remove the column.
titanic = titanic.drop(columns="Name")

In [113]:
# Ticket is likewise left out of the feature set.
titanic = titanic.drop(columns="Ticket")

In [114]:
# Preview the one-hot encoding of the port of embarkation (one indicator column per port).
# This cell only displays the result; titanic itself is not modified here.
# NOTE: with get_dummies' default settings, rows whose Embarked is missing
# get no indicator set in any of the columns.
pd.get_dummies(titanic["Embarked"])

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [115]:
# Replace Embarked with its one-hot indicator columns.
# Rows where Embarked is missing end up with no indicator set
# (get_dummies does not add a NaN column by default).
embarked_dummies = pd.get_dummies(titanic["Embarked"])
titanic = titanic.join(embarked_dummies).drop(columns="Embarked")


In [116]:
# Encode Sex numerically: 1 for female, 0 for male.
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
# The element-wise assignments above leave the column with dtype object
# (Python ints stored in an object column). Cast it so Sex is a genuine
# integer feature; the cast raises if a missing value or an unexpected
# non-numeric label is still present, instead of passing it on silently.
titanic["Sex"] = titanic["Sex"].astype("int64")

# Model building

## Importing libraries

In [125]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

## Dataset preparation

In [118]:
# Split the prepared data into training and test sets (80% / 20%).
# PassengerId and Survived are excluded from the features; Survived is the prediction target.
# random_state is fixed so the split is reproducible.
features = titanic.drop(columns=["PassengerId", "Survived"])
target = titanic["Survived"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=53,
)

## Model classifiers

In [128]:
# Extra-trees ensemble with 100 estimators; the seed is fixed for reproducible results.
clf = ExtraTreesClassifier(
    random_state=13,
    n_estimators=100,
)

## Model fitting

In [129]:
# Train the classifier on the training split.
clf.fit(X_train, y_train)

ExtraTreesClassifier(random_state=13)

# Model predictions

In [130]:
# Predict survival (0 or 1) for every passenger in the test split.
clf.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1])

## Results

In [133]:
# Confusion matrix on the held-out test split.
# Rows are the actual classes and columns the predicted classes (label order 0, 1).
confusion_matrix(y_true=y_test, y_pred=clf.predict(X_test))

array([[90, 23],
       [17, 49]])

In [134]:
# Share of test-split passengers whose survival was predicted correctly.
accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))

0.776536312849162