## DataCamp-style approach to imputing missing values

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset named "titanic" through seaborn and preview its first five rows
df = sns.load_dataset("titanic")
print(df.head(5))

In [None]:
# Count missing entries per column, then list columns from most to fewest missing
missing_per_column = df.isna().sum()
print("missing values", missing_per_column.sort_values(ascending=False))

### Visualize missing values

In [None]:
import missingno as msno
import matplotlib.pyplot as plt

# Per-column count of missing entries
print(df.isnull().sum())

# Draw missingno's matrix plot of the DataFrame to visualize missingness
msno.matrix(df)
plt.show()

## Check whether a column's missing values are related to other columns

`Let's say 177 values of age are missing. We then need to check whether fare is unusually high or low where age is missing, or whether there is some other relationship.`

`In simple terms, we need to check whether there is any relationship between the column with missing values and the other columns.`

In [None]:
# Split the rows by whether 'age' is missing or present
age_is_missing = df['age'].isna()
missing_age = df[age_is_missing]
age = df[~age_is_missing]

# Compare summary statistics of 'age' and 'fare' between the two groups
print(missing_age[['age', 'fare']].describe())
print(age[['age', 'fare']].describe())

# Original author's conclusion: no useful relationship found, so the missing
# age values are treated as random and not dependent on other columns.
# NOTE(review): only 'fare' is compared here — confirm against other columns.

#### Drop rows with missing values in columns where 5% or fewer of the values are missing

In [None]:
# Row-count cutoff: 5% of the number of rows, truncated to a whole number
threshold = int(0.05 * len(df))

# Columns that contain at least one missing value
has_missing = df.isna().sum() > 0
cols_with_missing_values = df.columns[has_missing]
print("cols_with_missing_values: ", cols_with_missing_values)

# Among those columns, select the ones whose missing count is at most the cutoff
df_cols_with_missing_values = df[cols_with_missing_values]
within_cutoff = df_cols_with_missing_values.isna().sum() <= threshold
cols_to_drop = cols_with_missing_values[within_cutoff]
print("cols_to_drop: ", cols_to_drop)

# Remove, in place, every row that has a missing value in any selected column
df.dropna(subset=cols_to_drop, inplace=True)

# Re-check the per-column missing counts.
# Original author's note: rows of "embarked" and "embark town" are dropped.
print(df.isna().sum())

## Box Plot

In [None]:
# One figure with two side-by-side panels (1 row, 2 columns)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
by_who_ax, overall_ax = axes

# Left panel: box plot of age for each 'who' category
sns.boxplot(x="who", y="age", data=df, ax=by_who_ax)
by_who_ax.set_title("Age by Who")

# Right panel: a single box plot of the whole 'age' column
sns.boxplot(y="age", data=df, ax=overall_ax)
overall_ax.set_title("Age Overall")

plt.tight_layout()
plt.show()

In [None]:
# Median age within each 'who' category
who_age = df["age"].groupby(df["who"]).median()
print(who_age)  # original author's note: median is 30 for man

# Median of the whole 'age' column, for comparison.
# Original author's note: if we use this, man will get 28 for missing values
# instead of 30.
print(df["age"].median())

In [None]:
# Convert the per-'who' median ages (who_age) into a plain dictionary keyed by
# 'who' category, so it can be passed to Series.map.
# NOTE(review): despite its name, prices_dict holds median ages, not prices;
# the name is left unchanged because the imputation cell reads it.
prices_dict = who_age.to_dict()
print(prices_dict)

## Fill missing age values with the median age of each `who` category

In [None]:
# Look up each row's group median from its 'who' value, then use it only
# where 'age' is missing (existing ages are left untouched by fillna)
median_age_for_row = df["who"].map(prices_dict)
df["age"] = df["age"].fillna(median_age_for_row)

In [None]:
# Per-column count of missing entries after imputing 'age'
print(df.isnull().sum())

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes)
df.info()