# Setup notebbok

In [34]:
# Import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

# Process data

### Variable Notes
#### pclass: A proxy for socio-economic status (SES)
- 1st = Upper
- 2nd = Middle
- 3rd = Lower

#### age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

#### sibsp: The dataset defines family relations in this way...
- Sibling = brother, sister, stepbrother, stepsister
- Spouse = husband, wife (mistresses and fiancés were ignored)

#### parch: The dataset defines family relations in this way...
- Parent = mother, father
- Child = daughter, son, stepdaughter, stepson
- Some children travelled only with a nanny, therefore parch=0 for them.

In [2]:
# Load raw dataframes
train_raw_df = pd.read_csv('../Data/train.csv')
test_raw_df = pd.read_csv('../Data/test.csv')
total_raw_df = pd.read_excel('../Data/Complete_dataset.xls')

### Exploratory Data Analysis

In [3]:
# See how many records and variables
num_pass, num_var = total_raw_df.shape[0], total_raw_df.shape[1]
print(f'Number of passengers = {num_pass}\nNumber of variables = {num_var}')

Number of passengers = 1309
Number of variables = 14


In [39]:
total_raw_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
# Have a global look to dataset
total_raw_df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [6]:
# Look for missing values
for el in total_raw_df.columns:
    num_null = total_raw_df[el].isnull().sum()
    print(f'Column {el} has {num_null} nulls')

Column pclass has 0 nulls
Column survived has 0 nulls
Column name has 0 nulls
Column sex has 0 nulls
Column age has 263 nulls
Column sibsp has 0 nulls
Column parch has 0 nulls
Column ticket has 0 nulls
Column fare has 1 nulls
Column cabin has 1014 nulls
Column embarked has 2 nulls
Column boat has 823 nulls
Column body has 1188 nulls
Column home.dest has 564 nulls


### Visualize some data

#### Passenger composition

In [33]:
# By sex
fig = px.pie(
    total_raw_df, 
    names=total_raw_df['sex'].map({'male': 'Male', 'female': 'Female'}),
    title='Passenger composition - by sex',
    # hole=0.5,
    color_discrete_sequence = ['#3376FF', '#FF94ED'],
)
fig.update_layout(
    font=dict(size=18)
)
fig.show()

In [32]:
# By class
fig = px.pie(
    total_raw_df, 
    names=total_raw_df['pclass'].map({1: '1st class', 2:'2nd class', 3: '3rd class'}), 
    title='Passenger composition - by ticket class',
    # hole=0.5,
    # color_discrete_sequence = ['#3376FF', '#FF94ED'],
)
fig.update_layout(
    font=dict(size=18),
)
fig.show()

##### Age ranges:
- < 1 = newborn
- 1-16 = kid
- 17-25 = young adult
- 26-40 = adult
- 41-60 = middle aged man
- 61-70 = old adult
- +71 = elderly

In [40]:
# Convert number to age range
bins = [1, 16, 25, 40, 60, 70, np.inf]
categories = ['Newbor', 'Kid', 'Young adult', 'Adult', 'Middle aged', 'Old adult', 'Elderly']
total_raw_df['age'] = pd.cut(total_raw_df['age'], bins, categories)

In [41]:
print(total_raw_df.ty)

      pclass  survived                                             name  \
0          1         1                    Allen, Miss. Elisabeth Walton   
1          1         1                   Allison, Master. Hudson Trevor   
2          1         0                     Allison, Miss. Helen Loraine   
3          1         0             Allison, Mr. Hudson Joshua Creighton   
4          1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
...      ...       ...                                              ...   
1304       3         0                             Zabour, Miss. Hileni   
1305       3         0                            Zabour, Miss. Thamine   
1306       3         0                        Zakarian, Mr. Mapriededer   
1307       3         0                              Zakarian, Mr. Ortin   
1308       3         0                               Zimmerman, Mr. Leo   

         sex           age  sibsp  parch  ticket      fare    cabin embarked  \
0     female  (25.0

In [None]:
# By age
fig = px.pie(
    total_raw_df, 
    names=total_raw_df['pclass'].map({1: '1st class', 2:'2nd class', 3: '3rd class'}), 
    title='Passenger composition - by ticket class',
    # hole=0.5,
    # color_discrete_sequence = ['#3376FF', '#FF94ED'],
)
fig.update_layout(
    font=dict(size=18)
)
fig.show()

#### Passenger survival

In [19]:
# Passenger survival
fig = px.pie(
    total_raw_df, 
    names=total_raw_df['survived'].map({0: 'Dead', 1:'Survived'}), 
    title='Passenger survival',
    # hole=0.5,
    color_discrete_sequence = ['#e30909', '#1ddb4c'],
)
fig.update_layout(
    font=dict(size=18)
)
fig.show()

Idea: creare delle categorie divis per sesso, classe, età, e vedere come cambia la mortalità