## Tackling Missing Data: Imputing With a Model

In [3]:
import pandas as pd
# Load the previously pickled `wdi` dataframe from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
wdi = pd.read_pickle('wdi.pkl')
# Column labels of every numeric column; later cells use this to select what gets imputed.
numeric_cols = wdi.select_dtypes('number').columns

In [4]:
# Preview the first five rows before imputing; missing entries show up as NaN.
wdi.head(5)

Unnamed: 0,country_name,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,country_category,is_region
0,Afghanistan,97.7,1.213235,9.0,11.777236,519.884773,2058.383832,65.656,62.701,64.13,55.595993,36296400.0,,14.09,10.416,11.184,2017,DEVELOPING,0
1,Albania,100.0,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148,76.601,78.333,104.870693,2873457.0,,12.563,14.59,13.75,2017,DEVELOPING,0
2,Algeria,100.0,9.130677,10.0,6.380329,4111.29411,11737.409353,77.735,75.307,76.499,17.377715,41389198.0,,21.114,10.021,11.996,2017,DEVELOPING,0
3,Angola,41.962894,19.07925,6.0,2.791503,4095.812942,7310.901738,63.252,57.677,60.379,23.916538,29816748.0,,7.467,6.769,7.119,2017,DEVELOPING,0
4,Arab World,90.283638,27.36062,9.0,5.561266,6108.58822,14562.367966,73.493366,69.92125,71.622526,36.669804,411898967.0,,20.361396,8.097088,10.641313,2017,,1


### Using the IterativeImputer

In [5]:
from sklearn.experimental import enable_iterative_imputer

In [6]:
from sklearn.impute import IterativeImputer

#### IterativeImputer follows the same 3 steps as SimpleImputer
###### 1) Create the IterativeImputer object
###### 2) Call fit_transform on the object with the dataframe and assign the result to a variable
###### 3) Update the dataframe with the variable

In [12]:
# IterativeImputer accepts many parameters (Shift+Tab on IterativeImputer() lists them);
# this notebook only sets min_value and max_value.
# .min() / .max() on the numeric columns return one value per column, i.e. the observed
# range of each respective column.
# Passing those as bounds keeps every imputed value inside the range already seen for
# its column. The actual minimums and maximums are displayed in the cells below.

iter_imp = IterativeImputer(
    min_value=wdi[numeric_cols].min(),
    max_value=wdi[numeric_cols].max(),
)

In [8]:
# Smallest observed value in each numeric column (the lower bounds given to the imputer).
wdi[numeric_cols].min(axis=0)

access_to_electricity_pct              9.300000
atms_per_100000                        1.213235
compulsory_education_years             5.000000
health_expenditure_pct_of_gdp          2.138482
gdp_per_capita_usd                   271.752044
gdp_per_capita_ppp                   773.571858
life_expectancy_female                54.354000
life_expectancy_male                  49.837000
life_expectancy                       52.240000
population_density                     2.004286
population                        101998.000000
alcohol_consumption_per_capita         0.003000
unemployment_rate_female               0.369000
unemployment_rate_male                 0.052000
unemployment_rate                      0.110000
year                                2017.000000
is_region                              0.000000
dtype: float64

In [11]:
# Largest observed value in each numeric column (the upper bounds given to the imputer).
wdi[numeric_cols].max(axis=0)

access_to_electricity_pct         1.000000e+02
atms_per_100000                   2.725928e+02
compulsory_education_years        1.600000e+01
health_expenditure_pct_of_gdp     1.700361e+01
gdp_per_capita_usd                1.166543e+05
gdp_per_capita_ppp                1.167865e+05
life_expectancy_female            8.770000e+01
life_expectancy_male              8.230000e+01
life_expectancy                   8.493415e+01
population_density                7.952998e+03
population                        7.591945e+09
alcohol_consumption_per_capita    1.509000e+01
unemployment_rate_female          4.276900e+01
unemployment_rate_male            2.523800e+01
unemployment_rate                 2.707100e+01
year                              2.018000e+03
is_region                         1.000000e+00
dtype: float64

In [13]:
# Step 2: fit the imputer on the numeric columns and fill their missing entries in one call.
# The filled values are kept in a separate variable; `wdi` itself is not modified here.
imputed_num = iter_imp.fit_transform(X=wdi[numeric_cols])

In [14]:
# Step 3: overwrite the numeric columns of `wdi` with the imputed values.
# Non-numeric columns are left as they were.
# NOTE: columns that were displayed as integers before (year, is_region) appear as floats
# (2017.0, 0.0) in the head() output after this assignment.
wdi[numeric_cols] = imputed_num

In [15]:
# Sanity check: every numeric column should now report a full non-null count.
wdi.loc[:, numeric_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   access_to_electricity_pct       434 non-null    float64
 1   atms_per_100000                 434 non-null    float64
 2   compulsory_education_years      434 non-null    float64
 3   health_expenditure_pct_of_gdp   434 non-null    float64
 4   gdp_per_capita_usd              434 non-null    float64
 5   gdp_per_capita_ppp              434 non-null    float64
 6   life_expectancy_female          434 non-null    float64
 7   life_expectancy_male            434 non-null    float64
 8   life_expectancy                 434 non-null    float64
 9   population_density              434 non-null    float64
 10  population                      434 non-null    float64
 11  alcohol_consumption_per_capita  434 non-null    float64
 12  unemployment_rate_female        434 

In [16]:
# Look at the alcohol_consumption_per_capita column: its first five rows were NaN before
# imputation and now hold values.
# These are not one fixed fill value that we assigned; each row received a different number.
# That is because IterativeImputer estimates each missing entry with a regression model
# that uses the other columns of the same row, and keeps the estimate within the
# min/max bounds we passed in.
wdi[numeric_cols].head(5)

Unnamed: 0,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,is_region
0,97.7,1.213235,9.0,11.777236,519.884773,2058.383832,65.656,62.701,64.13,55.595993,36296400.0,4.113571,14.09,10.416,11.184,2017.0,0.0
1,100.0,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148,76.601,78.333,104.870693,2873457.0,6.463272,12.563,14.59,13.75,2017.0,0.0
2,100.0,9.130677,10.0,6.380329,4111.29411,11737.409353,77.735,75.307,76.499,17.377715,41389198.0,1.526053,21.114,10.021,11.996,2017.0,0.0
3,41.962894,19.07925,6.0,2.791503,4095.812942,7310.901738,63.252,57.677,60.379,23.916538,29816748.0,5.760329,7.467,6.769,7.119,2017.0,0.0
4,90.283638,27.36062,9.0,5.561266,6108.58822,14562.367966,73.493366,69.92125,71.622526,36.669804,411898967.0,1.830477,20.361396,8.097088,10.641313,2017.0,1.0


### Pickle this imputed data

In [17]:
# Save the imputed dataframe under a new file name so the original 'wdi.pkl' is left untouched.
wdi.to_pickle(path='wdi_imp.pkl')