# Complex Handling of Missing Values

Handle all missing values so that you are left with a dataset that contains no missing values.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

Some of these records are missing the target value because they are meant for prediction, where the actual value is unknown, so we will remove those for now.

In [None]:
# Load the Titanic data from the CSV under the relative data/ directory.
df = pd.read_csv('data/titanic.csv')

In [None]:
# df_na.isnull().sum() / len(df_na) * 100

# Percentage of missing values in each column (null count / row count * 100).
df.isnull().sum() / len(df) * 100

**Deck**

With 77% of its values missing, there is not much we can learn from the `deck` column, so we drop it.

In [None]:
# df_na = df_na.drop(['Cabin'], axis=1)
# df_na.columns.values

# Remove the mostly-empty `deck` column, then list the columns that remain.
df = df.drop(columns=['deck'])
df.columns.values

In [None]:
# Passengers younger than this many years are labelled as children.
cutoff = 18

ages = df['age']
# conditions[i] selects choices[i]: first the adult test, then the child test.
conditions = [ages >= cutoff, ages < cutoff]
choices = ['False', 'True']

# A missing age fails both comparisons, so np.select falls back to the
# placeholder string 'NaN' (an ordinary string, not a real missing value).
df['child'] = np.select(conditions, choices, default='NaN')

In [None]:
# Plot the boolean missing-value mask (rows = records, columns = features);
# the colour bar and the row tick labels are switched off.
sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis')

**Embarked and Embark Town**

Due to the limited number of missing values in these two columns, we can just drop the rows. 

In [None]:
# Keep only the rows whose `embarked` value is present.
df = df[df.embarked.notnull()]

**Age** 

Now, `age` is the only variable showing missing values.

However, we computed `child` from `age`, so `child` must contain a filler value that marks the unknown cases but is not recognized as missing. We will fix that first so the missing values are represented accurately.


1. Let's see what value is representing null in `child`

In [None]:
# Tally the labels stored in `child` to reveal which one stands in for "unknown".
df.child.value_counts()

2. Let's change that value to be a true NaN

In [None]:
# Swap the placeholder string 'NaN' for a real missing value so that isnull()
# can detect it. The replacement is applied to the whole frame, not just `child`.
df = df.replace('NaN', np.nan)
# Count of missing values per column after the replacement.
df.isnull().sum()

That looks better!

**Imputing age through estimation**

1. Estimate `child`

    Identify whether each passenger is more likely to be an adult or a child, based on the findings from our hypotheses:

    - if 'Mrs' then adult
    - if parch > 2, then the passenger is an adult with children (as opposed to a child with > 2 parents.)
    - if name title is 'Miss' and sibsp > 0, then the passenger is a child given that 'Miss' for an adult would indicate no spouse.
    - if sibsp > 1 then the passenger is a child, given that the likelihood of an adult traveling with a spouse and a sibling, only siblings, or having 2 spouses is low.
    - if sibsp <= 1 and Parch == 0 then the passenger is an adult, traveling alone or with spouse.

In [None]:
# Preview the first rows of the frame.
df.head()

2. Estimate `age` by taking the median age of the `child` class.

In [None]:
# Extract the honorific from each name: the run of letters immediately before
# a period (e.g. "Mr" in "Smith, Mr. John"). Names without a match give NaN.
# The result is stored on df_na because the following cells read df_na.Title.
# NOTE(review): df_na is not defined in the cells shown above - confirm where
# it is created before this cell runs.
# The pattern is a raw string so that `\.` is not an invalid escape sequence;
# expand=False returns a Series rather than a one-column DataFrame.
df_na['Title'] = df_na['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Compare title with age class: cross-tabulate Title against IsChild and
# shade the resulting table with the 'YlOrRd' colour map.
pd.crosstab(df_na.Title, df_na.IsChild).style.background_gradient(cmap='YlOrRd')

# Number of rows carrying each IsChild label.
df_na.IsChild.value_counts()

##### Takeaways:

1. Miss: explore more with Sibsp
2. if 'Master' => child
3. if all others => adult

1. Explore Miss with SibSp

In [None]:
# Explore Miss with SibSp: restrict to rows titled 'Miss', then report the
# count, mean and median Age for every (SibSp, IsChild) combination.
df_na[df_na.Title=='Miss'].groupby(['SibSp','IsChild'])['Age'].agg(['count', 'mean', 'median'])
# pd.crosstab(df_na.SibSp, df_na.IsChild).style.background_gradient(cmap='YlOrRd')

#### Conclusion

1. if `SibSp > 0 & Title=='Miss'`, then we will estimate a child age
2. If `Title == 'Master'`, then we will estimate a child age
3. Otherwise we estimate an adult age

In [None]:
# We will need this for pre-processing
def estIsChild(df, ix=0):
    """Return the child/adult label for the row at position ``ix``.

    A label that is already known is returned unchanged. Otherwise the label
    is estimated: a 'Master', or a 'Miss' with at least one sibling/spouse
    aboard, is labelled a child; everyone else is labelled an adult.

    Args:
        df: DataFrame with ``IsChild``, ``Title`` and ``SibSp`` columns.
        ix: Zero-based row position (not an index label).

    Returns:
        The existing ``IsChild`` value, or the string 'True' / 'False'.
    """
    # Use positional access so the function also works after rows have been
    # dropped and the index is no longer 0..n-1.
    known = df['IsChild'].iloc[ix]
    # Both a real missing value and the 'NaN' placeholder string (written by
    # the np.select-based age-class step) mean "unknown".
    if not pd.isnull(known) and known != 'NaN':
        return known

    title = df['Title'].iloc[ix]
    if title == 'Master' or (title == 'Miss' and df['SibSp'].iloc[ix] > 0):
        return 'True'
    return 'False'

def fillIsChild(df):
    """Fill every unknown ``IsChild`` label in ``df`` with an estimate.

    Applies the same rule as ``estIsChild`` to all rows at once: a 'Master',
    or a 'Miss' with ``SibSp > 0``, is estimated to be a child ('True');
    everyone else an adult ('False'). Labels that are already known are kept.

    Args:
        df: DataFrame with ``IsChild``, ``Title`` and ``SibSp`` columns.

    Returns:
        The same DataFrame, with its ``IsChild`` column updated in place.
    """
    labels = df['IsChild']
    # Both a real missing value and the 'NaN' placeholder string (written by
    # the np.select-based age-class step) mean "unknown".
    unknown = labels.isnull() | (labels == 'NaN')

    title = df['Title']
    looks_like_child = ((title == 'Miss') & (df['SibSp'] > 0)) | (title == 'Master')
    estimate = looks_like_child.map({True: 'True', False: 'False'})

    # Operate on the frame that was passed in (not a global), and keep the
    # frame's own index so rows stay aligned even when it is not 0..n-1.
    df['IsChild'] = labels.mask(unknown, estimate)
    return df

In [None]:
# Fill the unknown IsChild labels using the Title / SibSp rule.
df_na = fillIsChild(df_na)

In [None]:
# Count the missing values per column after filling IsChild.
df_na.isnull().sum()

#### 2. Estimate 'Age'

In [None]:
def impute_age(df):
    """Fill missing ``Age`` values with the median age of the row's ``IsChild`` group.

    Args:
        df: DataFrame with ``IsChild`` and ``Age`` columns.

    Returns:
        The same DataFrame, with its ``Age`` column replaced in place.
    """
    def _fill_with_group_median(ages):
        # A group with no known ages has a NaN median, so its gaps stay unfilled.
        return ages.fillna(ages.median())

    ages_by_class = df.groupby('IsChild').Age
    df.Age = ages_by_class.transform(_fill_with_group_median)
    return df

# Apply the per-group median age imputation to the working frame.
df_na = impute_age(df_na)

In [None]:
# Count the missing values per column after imputing Age.
df_na.isnull().sum()

### Embarked Missing Values

We will impute the new values in pre-processing. 

For now, we will explore without those values, as the 2 missing values are not significant enough to affect our exploration and discoveries.

## Needed in Final Model

In [None]:
# We will need this for pre-processing

def extractTitle(df, col, newcol = 'Title'):
    """Extract the honorific from a name column into a new column.

    The title is the first run of ASCII letters immediately followed by a
    period (e.g. "Mr" in "Smith, Mr. John"). Rows with no match get NaN.

    Args:
        df: DataFrame holding the name column.
        col: Name of the column containing the full names.
        newcol: Name of the column to create or overwrite with the title.

    Returns:
        The same DataFrame, with ``newcol`` added in place.
    """
    # Raw string: `\.` in a plain literal is an invalid escape sequence.
    # expand=False yields a Series rather than a one-column DataFrame.
    df[newcol] = df[col].str.extract(r'([A-Za-z]+)\.', expand=False)
    return df

def getAgeClass(df, col, newcol = 'IsChild', cutoff = 18):
    """Label each row as child or adult from an age column.

    Ages below ``cutoff`` are labelled 'True' (child); ages at or above it
    are labelled 'False' (adult). A missing age satisfies neither test and
    is labelled with the literal string 'NaN', which is an ordinary string
    rather than a real missing value.

    Args:
        df: DataFrame holding the age column.
        col: Name of the age column.
        newcol: Name of the label column to create or overwrite.
        cutoff: Age at which a passenger counts as an adult.

    Returns:
        The same DataFrame, with ``newcol`` added in place.
    """
    ages = df[col]
    is_adult = ages >= cutoff
    is_child = ages < cutoff
    df[newcol] = np.select([is_adult, is_child], ['False', 'True'], default='NaN')
    return df


def estIsChild(df, ix=0):
    """Return the child/adult label for the row at position ``ix``.

    A label that is already known is returned unchanged. Otherwise the label
    is estimated: a 'Master', or a 'Miss' with at least one sibling/spouse
    aboard, is labelled a child; everyone else is labelled an adult.

    Args:
        df: DataFrame with ``IsChild``, ``Title`` and ``SibSp`` columns.
        ix: Zero-based row position (not an index label).

    Returns:
        The existing ``IsChild`` value, or the string 'True' / 'False'.
    """
    # Use positional access so the function also works after rows have been
    # dropped and the index is no longer 0..n-1.
    known = df['IsChild'].iloc[ix]
    # Both a real missing value and the 'NaN' placeholder string (written by
    # getAgeClass for unknown ages) mean "unknown".
    if not pd.isnull(known) and known != 'NaN':
        return known

    title = df['Title'].iloc[ix]
    if title == 'Master' or (title == 'Miss' and df['SibSp'].iloc[ix] > 0):
        return 'True'
    return 'False'

def fillIsChild(df):
    """Fill every unknown ``IsChild`` label in ``df`` with an estimate.

    Applies the same rule as ``estIsChild`` to all rows at once: a 'Master',
    or a 'Miss' with ``SibSp > 0``, is estimated to be a child ('True');
    everyone else an adult ('False'). Labels that are already known are kept.

    Args:
        df: DataFrame with ``IsChild``, ``Title`` and ``SibSp`` columns.

    Returns:
        The same DataFrame, with its ``IsChild`` column updated in place.
    """
    labels = df['IsChild']
    # Both a real missing value and the 'NaN' placeholder string (written by
    # getAgeClass for unknown ages) mean "unknown".
    unknown = labels.isnull() | (labels == 'NaN')

    title = df['Title']
    looks_like_child = ((title == 'Miss') & (df['SibSp'] > 0)) | (title == 'Master')
    estimate = looks_like_child.map({True: 'True', False: 'False'})

    # Operate on the frame that was passed in (not a global), and keep the
    # frame's own index so rows stay aligned even when it is not 0..n-1.
    df['IsChild'] = labels.mask(unknown, estimate)
    return df

def impute_age(df):
    """Fill missing ``Age`` values with the median age of the row's ``IsChild`` group.

    Args:
        df: DataFrame with ``IsChild`` and ``Age`` columns.

    Returns:
        The same DataFrame, with its ``Age`` column replaced in place.
    """
    def _fill_with_group_median(ages):
        # A group with no known ages has a NaN median, so its gaps stay unfilled.
        return ages.fillna(ages.median())

    ages_by_class = df.groupby('IsChild').Age
    df.Age = ages_by_class.transform(_fill_with_group_median)
    return df
