# Dealing with Missing Values
using scikitlearn

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the dataset
df=sns.load_dataset('titanic')
df.head()

In [None]:
null_tot=df.isnull().sum().sort_values(ascending=False)

In [None]:
df.info()

In [None]:
null_perc=round((null_tot)/len(df)*100,2).sort_values(ascending=False)

#print the result
pd.concat([null_tot,null_perc],axis=1,keys=['Total','Perc'])


The result shows the total number and percentage of missing values in each column of the 'df' DataFrame.
Here is the interpretation:

- 'deck' column has the highest number of missing values with 688 missing values, which accounts for 77.22% of the total data.(which seems this data can be neglected)
- 'age' column has 177 missing values, which accounts for 19.87% of the total data.
- 'embarked' and 'embark_town' columns have 2 missing values each, which accounts for 0.22% of the total data.
- 'survived', 'pclass', 'sex', 'sibsp', and other columns have no missing values.

It is important to handle missing values appropriately before performing any analysis or modeling on the dataset.

# 01 Filling with Pandas

In [28]:
# drop the column
df.drop('deck', axis=1, inplace=True)

In [None]:
# visualize the data
plt.figure(figsize=(8,5))
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [26]:
# fill age column with mean
df['age'].fillna(df['age'].mean(),inplace=True)



In [30]:
# filling embarked column with mode
df['embarked'].fillna(df['embarked'].mode()[0],inplace=True)

In [32]:
# filling embarked column with mode
df['embarked'].fillna(df['embarked'].mode()[0],inplace=True)

In [31]:
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64

# 02 Filling with sklearn simpleimputer

In [36]:
from sklearn.impute import SimpleImputer

In [33]:
data=sns.load_dataset('titanic')

In [34]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [52]:
#impute age column using simpleimputer
imputer=SimpleImputer(strategy='mean')
data['age']=imputer.fit_transform(df[['age']])



In [53]:
data.isnull().sum().sort_values(ascending=False)

deck           688
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

# 03 Multivariate Imputer sklearn

In [54]:
data=sns.load_dataset('titanic')

In [55]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [58]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#impute age column using iterativeimputer
imputer=IterativeImputer()
data['age']=imputer.fit_transform(df[['age']])


In [57]:
data.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

# 04 ffill & bfill

In [59]:
# fill age column using forward fill
data['age'] = data['age'].ffill()

In [60]:
# fill age column using backward fill
data['age'] = data['age'].bfill()

# 05 Using KNN

In [61]:
from sklearn.impute import KNNImputer


In [62]:
# impute age column using knnimputer
imputer=KNNImputer(n_neighbors=5)
data['age']=imputer.fit_transform(df[['age']])

Based on the analysis, we can draw the following conclusions:
- The survival rate of passengers is...
- The age column has missing values, which were imputed using...
- The embarked column has...