In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Titanic Dataset

Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck. 

### Train set
For the training set, we provide the outcome (also known as the “ground truth”) for each passenger. Your model will be based on “features” like passengers’ gender and class. 

### Test dataset
The test set should be used to see how well your model performs on unseen data. For the test set, we do not provide the ground truth for each passenger. It is your job to predict these outcomes. For each passenger in the test set, use the model you trained to predict whether or not they survived the sinking of the Titanic.

### Example submission
We also include gender_submission.csv, a set of predictions that assume all and only female passengers survive, as an example of what a submission file should look like.

In [33]:
# Load the training split, then show its dimensions followed by the first rows.
train = pd.read_csv('Archivos/train.csv')
display(train.shape, train.head())

(891, 12)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
# Age: number of rows with no recorded age and the percentage of the data they represent
ageNan = train['Age'].isna().sum()
ageNanprop = ageNan / len(train) * 100
print('Total registers =', len(train))
print('Total missing Age values =', ageNan)
print('Ratio = %.2f' % ageNanprop)

Total registers = 891
Total missing Age values = 177
Ratio = 19.87


In [35]:
# Cabin: number of rows with no recorded cabin and the percentage of the data they represent
cabinNan = train.Cabin.isna().sum()
cabinNanprop = (cabinNan / train.shape[0]) * 100
print('Total registers =', train.shape[0])
# The label names the Cabin column; it previously read "Age" (copied from the Age cell).
print('Total missing Cabin values =', cabinNan)
print('Ratio = %.2f' % cabinNanprop)

Total registers = 891
Total missing Age values = 687
Ratio = 77.10


In [36]:
# Per-column count of missing values in the training frame
nanvals = train.isnull().sum(axis=0)
nanvals

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [37]:
# Rows in which the Embarked value is missing
train[train['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [38]:
# Drop every column that contains a NaN: the number of rows stays the same,
# only the set of columns shrinks.
deletedtrain = train.dropna(axis='columns')
display(deletedtrain.shape)
deletedtrain.head()

(891, 9)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05


## MICE

Multivariate imputation by chained equations (MICE), sometimes called “fully conditional specification” or “sequential regression multiple imputation”, has emerged in the statistical literature as one principled method of addressing missing data. Creating multiple imputations, as opposed to single imputations, accounts for the statistical uncertainty in the imputations. In addition, the chained equations approach is very flexible and can handle variables of varying types (e.g., continuous or binary) as well as complexities such as bounds or survey skip patterns.

https://s3.amazonaws.com/assets.datacamp.com/production/course_17404/slides/chapter4.pdf

https://www.youtube.com/watch?v=zX-pacwVyvU

https://stackoverflow.com/questions/53322182/imputation-on-the-test-set-with-fancyimpute

https://medium.com/ibm-data-science-experience/missing-data-conundrum-exploration-and-imputation-techniques-9f40abe0fd87

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3074241/

### scikit-learn's IterativeImputer

The IterativeImputer class models each feature with missing values as a function of other features, and uses that estimate for imputation. It does so in an iterated round-robin fashion: at each step, a feature column is designated as output y and the other feature columns are treated as inputs X. A regressor is fit on (X, y) for known y. Then, the regressor is used to predict the missing values of y. This is done for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds. The results of the final imputation round are returned.

In [39]:
# IterativeImputer as exposed by fancyimpute, aliased to MICE for this notebook.
from fancyimpute import IterativeImputer as MICE
# Imputer instance created with default settings only.
MICE_imputer = MICE()

In [40]:
# Before imputation: count the missing ages and preview the frame
display(train['Age'].isna().sum())
train.head()

177

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
# Encode the categorical Sex column as numbers (male -> 0, female -> 1)
# so that it can be included in the imputation model.
for label, code in (('male', 0), ('female', 1)):
    train.loc[train['Sex'] == label, 'Sex'] = code

In [42]:
# Independent copy restricted to the columns handed to the imputer
train_MICE = train.loc[:, ['Age', 'Sex', 'Fare', 'Pclass', 'Survived']].copy()

In [43]:
# Fit the imputer on the selected columns and write the transformed values
# back over every cell of train_MICE by positional assignment.
# NOTE(review): the resulting column dtypes depend on how the installed pandas
# handles full-frame .iloc assignment — confirm if dtypes matter downstream.
train_MICE.iloc[:, :] = MICE_imputer.fit_transform(train_MICE)

In [44]:
# After imputation: the count of missing ages should now be zero
display(train_MICE['Age'].isna().sum())
train_MICE.head()

0

Unnamed: 0,Age,Sex,Fare,Pclass,Survived
0,22.0,0.0,7.25,3.0,0.0
1,38.0,1.0,71.2833,1.0,1.0
2,26.0,1.0,7.925,3.0,1.0
3,35.0,1.0,53.1,1.0,1.0
4,35.0,0.0,8.05,3.0,0.0


In [14]:
# Overall Age mean, then median: original column vs imputed column
display(' vs '.join([str(train['Age'].mean()), str(train_MICE['Age'].mean())]))
display(' vs '.join([str(train['Age'].median()), str(train_MICE['Age'].median())]))

'29.69911764705882 vs 29.29539143586743'

'28.0 vs 26.7247499072374'

In [15]:
# Male vs female: compare Age mean and median between the original data
# (c1/c2) and the imputed data (d1/d2). Sex is encoded 0 = male, 1 = female.
c1 = train.loc[train.Sex == 0]
c2 = train.loc[train.Sex == 1]
d1 = train_MICE.loc[train_MICE.Sex == 0]
d2 = train_MICE.loc[train_MICE.Sex == 1]
display('male mean ' + str(c1.Age.mean()) + ' vs ' + str(d1.Age.mean()))
# Trailing space added to the label so the value no longer runs into the word "median".
display('male median ' + str(c1.Age.median()) + ' vs ' + str(d1.Age.median()))
display('female mean ' + str(c2.Age.mean()) + ' vs ' + str(d2.Age.mean()))
display('female median ' + str(c2.Age.median()) + ' vs ' + str(d2.Age.median()))

'male mean 30.72664459161148 vs 30.398018301363'

'male median29.0 vs 27.0'

'female mean 27.915708812260537 vs 27.2692267817561'

'female median 27.0 vs 25.34825581397134'

## MICE 2

Common answer: 

The fancyimpute MICE imputer is now deprecated and sklearn’s IterativeImputer should be used instead:

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer

In [45]:
# Side-effect import: IterativeImputer is experimental in scikit-learn, so this
# enabling import must run before it can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
# scikit-learn's IterativeImputer, aliased to MICE2 to keep it apart from the fancyimpute alias.
from sklearn.impute import IterativeImputer as MICE2

In [17]:
# Imputer instance created with default settings only.
MICE_imputer2 = MICE2()

In [18]:
# Before the second imputation: count the missing ages and preview the frame
display(train['Age'].isna().sum())
train.head()

177

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [19]:
# Build a new frame (an independent copy) with the selected features only,
# leaving out columns such as Name and Cabin.
train_MICE2 = train.loc[:, ['Age', 'Sex', 'Fare', 'Pclass', 'Survived']].copy()
# Overwrite every cell with the imputer's fit-and-transform output.
train_MICE2.iloc[:, :] = MICE_imputer2.fit_transform(train_MICE2)

In [20]:
# After imputation: the count of missing ages should now be zero
display(train_MICE2['Age'].isna().sum())
train_MICE2.head()

0

Unnamed: 0,Age,Sex,Fare,Pclass,Survived
0,22.0,0.0,7.25,3.0,0.0
1,38.0,1.0,71.2833,1.0,1.0
2,26.0,1.0,7.925,3.0,1.0
3,35.0,1.0,53.1,1.0,1.0
4,35.0,0.0,8.05,3.0,0.0


In [21]:
# Overall Age mean, then median: original column vs the second imputed column
display(str(train.Age.mean()) + ' vs ' + str(train_MICE2.Age.mean()))
display(str(train.Age.median()) + ' vs ' + str(train_MICE2.Age.median()))
# Male vs female: the same comparison split by Sex (encoded 0 = male, 1 = female);
# c1/c2 come from the original data, d1/d2 from the imputed data.
c1 = train.loc[train.Sex == 0]
c2 = train.loc[train.Sex == 1]
d1 = train_MICE2.loc[train_MICE2.Sex == 0]
d2 = train_MICE2.loc[train_MICE2.Sex == 1]
display('male mean ' + str(c1.Age.mean()) + ' vs ' + str(d1.Age.mean()))
# Trailing space added to the label so the value no longer runs into the word "median".
display('male median ' + str(c1.Age.median()) + ' vs ' + str(d1.Age.median()))
display('female mean ' + str(c2.Age.mean()) + ' vs ' + str(d2.Age.mean()))
display('female median ' + str(c2.Age.median()) + ' vs ' + str(d2.Age.median()))

'29.69911764705882 vs 29.29539143586743'

'28.0 vs 26.7247499072374'

'male mean 30.72664459161148 vs 30.398018301363'

'male median29.0 vs 27.0'

'female mean 27.915708812260537 vs 27.2692267817561'

'female median 27.0 vs 25.34825581397134'