## Tackling Missing Data: Imputing Missing Data

In [2]:
import pandas as pd
# Load the dataset from a local pickle file into the `wdi` DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load 'wdi.pkl' from a trusted source.
wdi = pd.read_pickle('wdi.pkl')

In [3]:
# Inspect dtypes and non-null counts; any count below 434 marks a column with missing values.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 406 non-null    float64
 3   compulsory_education_years      412 non-null    float64
 4   health_expenditure_pct_of_gdp   427 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              432 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  215 

#### Get all numeric columns

In [4]:
# Keep only the columns whose dtype is numeric (ints and floats).
wdi_numeric = wdi.select_dtypes(include=['number'])

In [5]:
# Preview the first five rows of the numeric-only frame.
wdi_numeric.head()

Unnamed: 0,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,is_region
0,97.7,1.213235,9.0,11.777236,519.884773,2058.383832,65.656,62.701,64.13,55.595993,36296400.0,,14.09,10.416,11.184,2017,0
1,100.0,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148,76.601,78.333,104.870693,2873457.0,,12.563,14.59,13.75,2017,0
2,100.0,9.130677,10.0,6.380329,4111.29411,11737.409353,77.735,75.307,76.499,17.377715,41389198.0,,21.114,10.021,11.996,2017,0
3,41.962894,19.07925,6.0,2.791503,4095.812942,7310.901738,63.252,57.677,60.379,23.916538,29816748.0,,7.467,6.769,7.119,2017,0
4,90.283638,27.36062,9.0,5.561266,6108.58822,14562.367966,73.493366,69.92125,71.622526,36.669804,411898967.0,,20.361396,8.097088,10.641313,2017,1


In [6]:
# List the names of the numeric columns.
wdi_numeric.columns

Index(['access_to_electricity_pct', 'atms_per_100000',
       'compulsory_education_years', 'health_expenditure_pct_of_gdp',
       'gdp_per_capita_usd', 'gdp_per_capita_ppp', 'life_expectancy_female',
       'life_expectancy_male', 'life_expectancy', 'population_density',
       'population', 'alcohol_consumption_per_capita',
       'unemployment_rate_female', 'unemployment_rate_male',
       'unemployment_rate', 'year', 'is_region'],
      dtype='object')

In [14]:
# Keep the numeric column names so they can be used to index `wdi` directly.
numeric_cols = wdi_numeric.columns

#### Get all non numeric columns

In [12]:
# Names of every column that is NOT numeric (the categorical / text columns).
cat_cols = wdi.select_dtypes(exclude=['number']).columns

#### We now have both the numeric and the non-numeric column names, and we can work on each group separately

In [13]:
# Show the non-numeric column names.
cat_cols

Index(['country_name', 'country_category'], dtype='object')

In [15]:
# Show the numeric column names.
numeric_cols

Index(['access_to_electricity_pct', 'atms_per_100000',
       'compulsory_education_years', 'health_expenditure_pct_of_gdp',
       'gdp_per_capita_usd', 'gdp_per_capita_ppp', 'life_expectancy_female',
       'life_expectancy_male', 'life_expectancy', 'population_density',
       'population', 'alcohol_consumption_per_capita',
       'unemployment_rate_female', 'unemployment_rate_male',
       'unemployment_rate', 'year', 'is_region'],
      dtype='object')

In [16]:
# Baseline view of the numeric columns before imputing; note the NaN entries
# (e.g. in alcohol_consumption_per_capita).
wdi[numeric_cols]

Unnamed: 0,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,is_region
0,97.700000,1.213235,9.0,11.777236,519.884773,2058.383832,65.656000,62.701000,64.130000,55.595993,3.629640e+07,,14.090000,10.416000,11.184000,2017,0
1,100.000000,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148000,76.601000,78.333000,104.870693,2.873457e+06,,12.563000,14.590000,13.750000,2017,0
2,100.000000,9.130677,10.0,6.380329,4111.294110,11737.409353,77.735000,75.307000,76.499000,17.377715,4.138920e+07,,21.114000,10.021000,11.996000,2017,0
3,41.962894,19.079250,6.0,2.791503,4095.812942,7310.901738,63.252000,57.677000,60.379000,23.916538,2.981675e+07,,7.467000,6.769000,7.119000,2017,0
4,90.283638,27.360620,9.0,5.561266,6108.588220,14562.367966,73.493366,69.921250,71.622526,36.669804,4.118990e+08,,20.361396,8.097088,10.641313,2017,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,100.000000,25.322499,10.0,5.917897,2566.596950,7771.189420,79.442000,71.207000,75.317000,308.125246,9.554040e+07,8.66000,1.831000,2.142000,1.993000,2018,0
430,100.000000,24.735190,10.0,,3562.330943,6472.121785,75.595000,72.263000,73.895000,758.984551,4.569087e+06,,41.849998,22.393999,26.256001,2018,0
431,89.588551,40.949620,10.0,9.848779,11385.675227,17025.827360,74.871815,70.398743,72.563282,59.617978,7.591945e+09,6.18124,5.580954,5.271615,5.392213,2018,1
432,39.812622,11.554664,7.0,4.934843,1516.390661,3607.304442,66.447000,60.533000,63.510000,23.341479,1.735182e+07,6.54000,12.448000,10.614000,11.500000,2018,0


## fillna()

In [18]:
# Preview only: fillna returns a new DataFrame in which NaN values (e.g. in
# alcohol_consumption_per_capita) are replaced by -999. `wdi` itself is not changed
# until the result is assigned back. Compare with the table above to see the difference.
wdi[numeric_cols].fillna(-999)

Unnamed: 0,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,is_region
0,97.700000,1.213235,9.0,11.777236,519.884773,2058.383832,65.656000,62.701000,64.130000,55.595993,3.629640e+07,-999.00000,14.090000,10.416000,11.184000,2017,0
1,100.000000,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148000,76.601000,78.333000,104.870693,2.873457e+06,-999.00000,12.563000,14.590000,13.750000,2017,0
2,100.000000,9.130677,10.0,6.380329,4111.294110,11737.409353,77.735000,75.307000,76.499000,17.377715,4.138920e+07,-999.00000,21.114000,10.021000,11.996000,2017,0
3,41.962894,19.079250,6.0,2.791503,4095.812942,7310.901738,63.252000,57.677000,60.379000,23.916538,2.981675e+07,-999.00000,7.467000,6.769000,7.119000,2017,0
4,90.283638,27.360620,9.0,5.561266,6108.588220,14562.367966,73.493366,69.921250,71.622526,36.669804,4.118990e+08,-999.00000,20.361396,8.097088,10.641313,2017,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,100.000000,25.322499,10.0,5.917897,2566.596950,7771.189420,79.442000,71.207000,75.317000,308.125246,9.554040e+07,8.66000,1.831000,2.142000,1.993000,2018,0
430,100.000000,24.735190,10.0,-999.000000,3562.330943,6472.121785,75.595000,72.263000,73.895000,758.984551,4.569087e+06,-999.00000,41.849998,22.393999,26.256001,2018,0
431,89.588551,40.949620,10.0,9.848779,11385.675227,17025.827360,74.871815,70.398743,72.563282,59.617978,7.591945e+09,6.18124,5.580954,5.271615,5.392213,2018,1
432,39.812622,11.554664,7.0,4.934843,1516.390661,3607.304442,66.447000,60.533000,63.510000,23.341479,1.735182e+07,6.54000,12.448000,10.614000,11.500000,2018,0


In [20]:
# Persist the change: write the filled numeric columns back into `wdi`,
# using -999 as the placeholder for every missing number.
wdi[numeric_cols] = wdi[numeric_cols].fillna(value=-999)

In [21]:
# Preview only: replace missing values in the non-numeric columns with the string 'missing'.
wdi[cat_cols].fillna('missing')

Unnamed: 0,country_name,country_category
0,Afghanistan,DEVELOPING
1,Albania,DEVELOPING
2,Algeria,DEVELOPING
3,Angola,DEVELOPING
4,Arab World,missing
...,...,...
429,Vietnam,DEVELOPING
430,West Bank and Gaza,DEVELOPING
431,World,missing
432,Zambia,DEVELOPING


In [22]:
# Persist the change: write the filled non-numeric columns back into `wdi`,
# using the string 'missing' as the placeholder.
wdi[cat_cols] = wdi[cat_cols].fillna(value='missing')

In [24]:
# Re-check the non-null counts: no column should contain NaN values any more.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 434 non-null    float64
 3   compulsory_education_years      434 non-null    float64
 4   health_expenditure_pct_of_gdp   434 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              434 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  434 

In [25]:
# Tally the values: the -999 placeholder now stands in for the formerly missing entries.
# dropna=False would also list NaN if any were left.
wdi['alcohol_consumption_per_capita'].value_counts(dropna=False)

alcohol_consumption_per_capita
-999.000000    219
 0.690000        3
 9.230000        2
 12.030000       2
 0.682988        2
              ... 
 1.110000        1
 5.380000        1
 6.890000        1
 2.730000        1
 4.670000        1
Name: count, Length: 205, dtype: int64

In [26]:
# Same check for the categorical column: 'missing' appears as its own category.
wdi['country_category'].value_counts(dropna=False)

country_category
DEVELOPING    224
missing       152
DEVELOPED      58
Name: count, dtype: int64

## Simple imputer in sklearn library

In [29]:
# Reload the untouched dataset so the missing values are present again
# for the SimpleImputer demonstration.
wdi = pd.read_pickle('wdi.pkl')
# Recompute the two column groups from the freshly loaded frame.
numeric_cols = wdi.select_dtypes(include=['number']).columns
cat_cols = wdi.select_dtypes(exclude=['number']).columns

In [30]:
from sklearn.impute import SimpleImputer

In [31]:
# Construct an imputer. strategy='constant' means: wherever a value is missing,
# replace it with fill_value (-999 here).
# The object is only displayed in this cell; it is bound to a name in the next one.
SimpleImputer(strategy='constant',fill_value=-999)

In [32]:
# Imputer for the numeric columns: fills missing values with the constant -999.
simple_imp_num = SimpleImputer(strategy='constant',fill_value=-999)

#### fit the new imputer to the data, only numeric columns

In [33]:
# fit_transform returns a new NumPy array in which missing values are replaced by -999;
# `wdi` itself is not modified by this call.
simple_imp_num.fit_transform(wdi[numeric_cols])

array([[9.77000000e+01, 1.21323506e+00, 9.00000000e+00, ...,
        1.11840000e+01, 2.01700000e+03, 0.00000000e+00],
       [1.00000000e+02, 3.17140757e+01, 9.00000000e+00, ...,
        1.37500000e+01, 2.01700000e+03, 0.00000000e+00],
       [1.00000000e+02, 9.13067681e+00, 1.00000000e+01, ...,
        1.19960003e+01, 2.01700000e+03, 0.00000000e+00],
       ...,
       [8.95885512e+01, 4.09496204e+01, 1.00000000e+01, ...,
        5.39221306e+00, 2.01800000e+03, 1.00000000e+00],
       [3.98126221e+01, 1.15546644e+01, 7.00000000e+00, ...,
        1.15000000e+01, 2.01800000e+03, 0.00000000e+00],
       [4.10415840e+01, 6.62520801e+00, 7.00000000e+00, ...,
        5.08699989e+00, 2.01800000e+03, 0.00000000e+00]])

In [34]:
# Confirm the return type: a plain ndarray, not a DataFrame (column names are not carried over).
type(simple_imp_num.fit_transform(wdi[numeric_cols]))

numpy.ndarray

In [35]:
# `wdi` has not changed yet: the imputed array must be assigned back to update the DataFrame.
wdi[numeric_cols]

Unnamed: 0,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,is_region
0,97.700000,1.213235,9.0,11.777236,519.884773,2058.383832,65.656000,62.701000,64.130000,55.595993,3.629640e+07,,14.090000,10.416000,11.184000,2017,0
1,100.000000,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148000,76.601000,78.333000,104.870693,2.873457e+06,,12.563000,14.590000,13.750000,2017,0
2,100.000000,9.130677,10.0,6.380329,4111.294110,11737.409353,77.735000,75.307000,76.499000,17.377715,4.138920e+07,,21.114000,10.021000,11.996000,2017,0
3,41.962894,19.079250,6.0,2.791503,4095.812942,7310.901738,63.252000,57.677000,60.379000,23.916538,2.981675e+07,,7.467000,6.769000,7.119000,2017,0
4,90.283638,27.360620,9.0,5.561266,6108.588220,14562.367966,73.493366,69.921250,71.622526,36.669804,4.118990e+08,,20.361396,8.097088,10.641313,2017,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,100.000000,25.322499,10.0,5.917897,2566.596950,7771.189420,79.442000,71.207000,75.317000,308.125246,9.554040e+07,8.66000,1.831000,2.142000,1.993000,2018,0
430,100.000000,24.735190,10.0,,3562.330943,6472.121785,75.595000,72.263000,73.895000,758.984551,4.569087e+06,,41.849998,22.393999,26.256001,2018,0
431,89.588551,40.949620,10.0,9.848779,11385.675227,17025.827360,74.871815,70.398743,72.563282,59.617978,7.591945e+09,6.18124,5.580954,5.271615,5.392213,2018,1
432,39.812622,11.554664,7.0,4.934843,1516.390661,3607.304442,66.447000,60.533000,63.510000,23.341479,1.735182e+07,6.54000,12.448000,10.614000,11.500000,2018,0


In [36]:
# Keep the imputed values (an ndarray) so they can be written back into `wdi`.
imputed_num = simple_imp_num.fit_transform(wdi[numeric_cols])

In [37]:
# Note that the imputed array has the same shape as wdi[numeric_cols] (checked in the next cell),
# so it can be assigned back column-for-column.
imputed_num.shape

(434, 17)

In [38]:
# Shape of the numeric slice of `wdi`, for comparison with imputed_num.shape.
wdi[numeric_cols].shape

(434, 17)

In [39]:
# Assign the newly imputed data to `wdi` to replace the missing values.
# NOTE: the imputed array is all-float, so the integer columns (year, is_region)
# become floats after this assignment (2017 -> 2017.0, 0 -> 0.0 in the output below).
wdi[numeric_cols] = imputed_num

In [40]:
# Missing values are now replaced with -999; see alcohol_consumption_per_capita.
wdi[numeric_cols]

Unnamed: 0,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,is_region
0,97.700000,1.213235,9.0,11.777236,519.884773,2058.383832,65.656000,62.701000,64.130000,55.595993,3.629640e+07,-999.00000,14.090000,10.416000,11.184000,2017.0,0.0
1,100.000000,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148000,76.601000,78.333000,104.870693,2.873457e+06,-999.00000,12.563000,14.590000,13.750000,2017.0,0.0
2,100.000000,9.130677,10.0,6.380329,4111.294110,11737.409353,77.735000,75.307000,76.499000,17.377715,4.138920e+07,-999.00000,21.114000,10.021000,11.996000,2017.0,0.0
3,41.962894,19.079250,6.0,2.791503,4095.812942,7310.901738,63.252000,57.677000,60.379000,23.916538,2.981675e+07,-999.00000,7.467000,6.769000,7.119000,2017.0,0.0
4,90.283638,27.360620,9.0,5.561266,6108.588220,14562.367966,73.493366,69.921250,71.622526,36.669804,4.118990e+08,-999.00000,20.361396,8.097088,10.641313,2017.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,100.000000,25.322499,10.0,5.917897,2566.596950,7771.189420,79.442000,71.207000,75.317000,308.125246,9.554040e+07,8.66000,1.831000,2.142000,1.993000,2018.0,0.0
430,100.000000,24.735190,10.0,-999.000000,3562.330943,6472.121785,75.595000,72.263000,73.895000,758.984551,4.569087e+06,-999.00000,41.849998,22.393999,26.256001,2018.0,0.0
431,89.588551,40.949620,10.0,9.848779,11385.675227,17025.827360,74.871815,70.398743,72.563282,59.617978,7.591945e+09,6.18124,5.580954,5.271615,5.392213,2018.0,1.0
432,39.812622,11.554664,7.0,4.934843,1516.390661,3607.304442,66.447000,60.533000,63.510000,23.341479,1.735182e+07,6.54000,12.448000,10.614000,11.500000,2018.0,0.0


In [41]:
# Re-check the non-null counts after imputing the numeric columns.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 434 non-null    float64
 3   compulsory_education_years      434 non-null    float64
 4   health_expenditure_pct_of_gdp   434 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              434 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  434 

### Now use SimpleImputer for the category data
#### Step 1) set up SimpleImputer class object with strategy
#### Step 2) fit the new imputer to the data using fit_transform
#### Step 3) assign the data to variable and then update original dataframe

In [42]:
# Step 1: imputer for the non-numeric columns; missing entries become the string 'MISSING'.
# NOTE(review): the fillna() section above used lowercase 'missing' -- pick one spelling
# if both approaches are meant to produce the same category label.
simple_imp_cat = SimpleImputer(strategy='constant',fill_value='MISSING')

In [43]:
# Step 2: fit the imputer and transform the non-numeric columns.
# Despite its name, `imputer_cat` holds the transformed data returned by fit_transform,
# not the imputer object itself.
imputer_cat = simple_imp_cat.fit_transform(wdi[cat_cols])

In [44]:
# Step 3: write the imputed values back into the original DataFrame.
wdi[cat_cols] = imputer_cat

In [45]:
# Rows that had no country_category now show 'MISSING'.
wdi[cat_cols]

Unnamed: 0,country_name,country_category
0,Afghanistan,DEVELOPING
1,Albania,DEVELOPING
2,Algeria,DEVELOPING
3,Angola,DEVELOPING
4,Arab World,MISSING
...,...,...
429,Vietnam,DEVELOPING
430,West Bank and Gaza,DEVELOPING
431,World,MISSING
432,Zambia,DEVELOPING


In [47]:
# All missing values are now filled: every column should report 434 non-null entries.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 434 non-null    float64
 3   compulsory_education_years      434 non-null    float64
 4   health_expenditure_pct_of_gdp   434 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              434 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  434 