# Titanic - missing data

#### In this Jupyter Notebook file, we will work with a modified version of the Titanic dataset to apply different missing data techniques

In [42]:
# Step 1: import necessary libraries

import pandas as pd
import numpy as np
import missingno as msno
from sklearn.impute import SimpleImputer
from fancyimpute import KNN
from fancyimpute import IterativeImputer

In [43]:
# Step 2: read csv

df = pd.read_csv(
    'C:/Users/Leo/Desktop/Data Science Journey/Python/Data Sets/Pandas/Titanic - missing data/titanic_missing_data.csv',
    sep=';', na_values='#N/A')
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked
0,165,0.0,3.0,"Panula, Master. Eino Viljami",male,1.0,S


In [44]:
df.dtypes

PassengerId      int64
Survived       float64
Pclass         float64
Name            object
Sex             object
Age            float64
Embarked        object
dtype: object

In [45]:
# Step 3: Lets determine which columns are affected by missing data

df.isnull().sum()
#df.isnull().mean()*100

PassengerId      0
Survived         5
Pclass           4
Name             0
Sex              0
Age            178
Embarked         2
dtype: int64

In [46]:
# Step 4: Lets visualize missing data to determine the type of missingness

# msno.bar(df)
# msno.matrix(df) # --> missing data appears to be MCAR

In [47]:
# msno.heatmap(df) # --> no correlation between missing values in the columns

In [48]:
# msno.dendrogram(df)

In [49]:
# Step 5: Lets impute missing data

In [50]:
# 5.1. high values in Age  
# df['Age'].describe() # --> incorrect maximun values 

# df.sort_values('Age', ascending=False) # --> age = 999 & 998 & 997 
#df['Age'][df['Age'] == 999] = np.nan
#df['Age'][df['Age'] == 998] = np.nan
#df['Age'][df['Age'] == 997] = np.nan
df.sort_values('Age', ascending=False).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked
886,860,0.0,3.0,"Razi, Mr. Raihed",male,999.0,C
879,827,0.0,3.0,"Lam, Mr. Len",male,998.0,S
883,840,1.0,1.0,"Marechal, Mr. Pierre",male,997.0,C
688,631,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,S
687,852,0.0,3.0,"Svensson, Mr. Johan",male,74.0,S


In [51]:
# 5.2. Age = 0
#df[df['Age'] == 0] # --> have age = 0
#df['Age'][df['Age'] == 0] = np.nan
df[df['Age'] == 0] # --> no more age = 0

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked
101,551,1.0,1.0,"Thayer, Mr. John Borland Jr",male,0.0,C
132,45,1.0,3.0,"Devaney, Miss. Margaret Delia",female,0.0,Q
217,555,1.0,3.0,"Ohman, Miss. Velin",female,0.0,S
272,344,0.0,2.0,"Sedgwick, Mr. Charles Frederick Waddington",male,0.0,S


In [52]:
# 5.3. Impute Age using mean

#df[np.isnan(df['Age'])] #Lets take a look at rows with NaN in Age
#age_mean = df['Age'].mean()
#df_age_mean = df.fillna({'Age': age_mean})
#df_age_mean[df_age_mean['Age'].isnull()] # --> have no missing data in Age in DataFrame df_age_mean

In [53]:
# Step 6: Lets delete rows with missing data in columns "Survived" or "Pclass"

#df.isnull().sum() --> 5 NaN in Survived and 4 in Pclass
df.dropna(subset=['Survived', 'Pclass'], how='any', inplace=True)
df.isnull().sum() # --> No NaN in Survived or Pclass 

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            178
Embarked         2
dtype: int64