## Tackling Missing Data: Imputing With Statistics and Missing Indicators

In [7]:
import pandas as pd
# Load the wdi DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load wdi.pkl from a trusted source.
wdi = pd.read_pickle('wdi.pkl')
# Split the column labels by dtype so numeric and non-numeric columns can be imputed separately.
numeric_cols = wdi.select_dtypes(include ='number').columns
cat_cols = wdi.select_dtypes(exclude = 'number').columns

### fillna() Methods

In [13]:
# Non-null counts below the 434 total entries reveal which columns contain missing values.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 406 non-null    float64
 3   compulsory_education_years      412 non-null    float64
 4   health_expenditure_pct_of_gdp   427 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              432 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  215 

### Numeric Columns Using fillna() with mean() or median()

In [10]:
# Per-column means: one candidate fill value for the missing numeric data.
wdi[numeric_cols].mean()

access_to_electricity_pct         8.421552e+01
atms_per_100000                   4.812857e+01
compulsory_education_years        9.701456e+00
health_expenditure_pct_of_gdp     6.433350e+00
gdp_per_capita_usd                1.359989e+04
gdp_per_capita_ppp                1.992523e+04
life_expectancy_female            7.456947e+01
life_expectancy_male              6.986652e+01
life_expectancy                   7.217627e+01
population_density                2.099738e+02
population                        3.690135e+08
alcohol_consumption_per_capita    6.016516e+00
unemployment_rate_female          8.404762e+00
unemployment_rate_male            6.465617e+00
unemployment_rate                 7.047682e+00
year                              2.017500e+03
is_region                         2.119816e-01
dtype: float64

In [12]:
# Per-column medians: an alternative fill value. Compare with the means above; for skewed
# columns such as population the two differ by more than an order of magnitude.
wdi[numeric_cols].median()

access_to_electricity_pct         9.891294e+01
atms_per_100000                   3.941107e+01
compulsory_education_years        1.000000e+01
health_expenditure_pct_of_gdp     6.006967e+00
gdp_per_capita_usd                5.783711e+03
gdp_per_capita_ppp                1.310845e+04
life_expectancy_female            7.608850e+01
life_expectancy_male              7.081350e+01
life_expectancy                   7.358550e+01
population_density                6.961068e+01
population                        1.604842e+07
alcohol_consumption_per_capita    5.837431e+00
unemployment_rate_female          5.836087e+00
unemployment_rate_male            5.351708e+00
unemployment_rate                 5.606500e+00
year                              2.017500e+03
is_region                         0.000000e+00
dtype: float64

In [14]:
# Compute the mean of every numeric column once, then use that Series as the
# per-column fill value for the missing entries.
column_means = wdi[numeric_cols].mean()
wdi[numeric_cols] = wdi[numeric_cols].fillna(column_means)

In [15]:
# 6.016516 is the column mean; it now appears 219 times because every NaN in this column
# was replaced with it.
wdi['alcohol_consumption_per_capita'].value_counts()

alcohol_consumption_per_capita
6.016516     219
0.690000       3
9.230000       2
12.030000      2
0.682988       2
            ... 
1.110000       1
5.380000       1
6.890000       1
2.730000       1
4.670000       1
Name: count, Length: 205, dtype: int64

### Categorical Columns Finding Most Frequent Value And Assigning To Missing Values Using fillna()

In [16]:
# Preview the non-numeric columns; country_category is empty (missing) for rows such as
# 'Arab World' and 'World'.
wdi[cat_cols]

Unnamed: 0,country_name,country_category
0,Afghanistan,DEVELOPING
1,Albania,DEVELOPING
2,Algeria,DEVELOPING
3,Angola,DEVELOPING
4,Arab World,
...,...,...
429,Vietnam,DEVELOPING
430,West Bank and Gaza,DEVELOPING
431,World,
432,Zambia,DEVELOPING


In [17]:
# For non-numeric columns, describe() includes a 'top' row: the most frequent value in each column.
wdi[cat_cols].describe()

Unnamed: 0,country_name,country_category
count,434,282
unique,217,2
top,Afghanistan,DEVELOPING
freq,2,224


In [22]:
# Select just the 'top' row by label with .loc to get the most frequent value of each column.
wdi[cat_cols].describe().loc['top']

country_name        Afghanistan
country_category     DEVELOPING
Name: top, dtype: object

In [23]:
# .iloc works too, but it is positional: 'top' is the third row of this describe() output,
# hence index 2.
wdi[cat_cols].describe().iloc[2]

country_name        Afghanistan
country_category     DEVELOPING
Name: top, dtype: object

In [25]:
# Keep the most frequent value of each categorical column (a Series indexed by column name).
most_freq = wdi[cat_cols].describe().loc['top']

In [27]:
# Passing a Series to fillna fills each column's NaNs with the entry whose label matches that column.
wdi[cat_cols] = wdi[cat_cols].fillna(most_freq)

In [29]:
# dropna=False would list NaN as its own row; none appears, so every missing category was filled
# (DEVELOPING grew from 224 to 376).
wdi['country_category'].value_counts(dropna = False)

country_category
DEVELOPING    376
DEVELOPED      58
Name: count, dtype: int64

### Alright now check out the dataframe using .info() to see if every column has the same number of records and no missing data

In [31]:
# Great! Every column listed now shows 434 non-null values.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 434 non-null    float64
 3   compulsory_education_years      434 non-null    float64
 4   health_expenditure_pct_of_gdp   434 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              434 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  434 

## SimpleImputer Method

In [35]:
# Reload the pickle so the missing values are back, and import SimpleImputer.
from sklearn.impute import SimpleImputer
wdi = pd.read_pickle('wdi.pkl')
# Recompute the dtype-based column groups from the freshly loaded frame.
numeric_cols = wdi.select_dtypes(include ='number').columns
cat_cols = wdi.select_dtypes(exclude = 'number').columns

### Numeric SimpleImputer

#### Remember the 3 steps for SimpleImputer
###### 1) Set up SimpleImputer class object
###### 2) fit_transform the object using the dataframe and assign to a variable.
###### 3) Assign the new variable to your dataframe to update the data.

In [36]:
# Step 1: create an imputer that replaces missing values with each column's mean.
# NOTE(review): the variable name ends in "_min" although the strategy is 'mean'; the name is
# left unchanged here because a later cell references it.
simple_imp_min = SimpleImputer(strategy= 'mean')

In [37]:
# Step 2: fit on the numeric columns (learn each column's mean) and transform them
# (fill the NaNs) in a single call.
imputer_num = simple_imp_min.fit_transform(wdi[numeric_cols])

In [38]:
# Shape of the imputed result; it should match the numeric slice it was built from.
imputer_num.shape

(434, 17)

In [39]:
# Shape of the original numeric columns, for comparison with imputer_num.shape.
wdi[numeric_cols].shape

(434, 17)

In [40]:
# Step 3: write the imputed values back into the numeric columns of the DataFrame.
wdi[numeric_cols] = imputer_num

In [41]:
# Check: 6.016516 (the column mean) appears 219 times, so SimpleImputer replaced the NaN
# values with the mean, matching the fillna() result.
wdi['alcohol_consumption_per_capita'].value_counts()

alcohol_consumption_per_capita
6.016516     219
0.690000       3
9.230000       2
12.030000      2
0.682988       2
            ... 
1.110000       1
5.380000       1
6.890000       1
2.730000       1
4.670000       1
Name: count, Length: 205, dtype: int64

## Categorical SimpleImputer

#### Don't forget the steps for SimpleImputer
###### 1) Set SimpleImputer class object
###### 2) fit_transform the dataframe to the SimpleImputer and then assign to variable
###### 3) Update the Dataframe with the variable created in step 2

In [42]:
# Step 1: create an imputer that replaces missing values with each column's most frequent value.
most_freq_imp = SimpleImputer(strategy='most_frequent')

In [43]:
# Step 2: fit on the categorical columns and fill their missing values in a single call.
imputer_cat = most_freq_imp.fit_transform(wdi[cat_cols])

In [44]:
# Step 3: write the imputed values back into the categorical columns of the DataFrame.
wdi[cat_cols] = imputer_cat

In [46]:
# Check: value_counts() drops NaN by default, which would hide any value the imputer missed.
# dropna=False keeps NaN in the tally, so a leftover missing value would show up as its own row.
wdi['country_category'].value_counts(dropna=False)

country_category
DEVELOPING    376
DEVELOPED      58
Name: count, dtype: int64

### Alright! Let's check the dataframe to see if there is any missing data

In [47]:
# Every column listed should now show 434 non-null values.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 434 non-null    float64
 3   compulsory_education_years      434 non-null    float64
 4   health_expenditure_pct_of_gdp   434 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              434 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  434 

## How To Mark Missing Data Using SimpleImputer

In [49]:
# Reload the pickle so the DataFrame contains its original missing values again.
wdi = pd.read_pickle('wdi.pkl')
# Recompute the dtype-based column groups from the freshly loaded frame.
numeric_cols = wdi.select_dtypes(include ='number').columns
cat_cols = wdi.select_dtypes(exclude = 'number').columns

In [51]:
# add_indicator=True makes the imputer append an indicator column that flags which values were
# missing before imputation (see the SimpleImputer docstring, e.g. via Shift+Tab in Jupyter).
simple_imp_ind = SimpleImputer(strategy='mean',add_indicator = True)

In [52]:
# Look at the single column that will be imputed; the double brackets keep it a DataFrame.
wdi[['compulsory_education_years']]

Unnamed: 0,compulsory_education_years
0,9.0
1,9.0
2,10.0
3,6.0
4,9.0
...,...
429,10.0
430,10.0
431,10.0
432,7.0


In [53]:
# Impute a single column to make the effect of the indicator easy to see.
# Double brackets select a one-column DataFrame; single brackets would give a Series,
# and fit_transform needs a DataFrame (2-D input).
imputed_ind = simple_imp_ind.fit_transform(wdi[['compulsory_education_years']])

In [57]:
# Slice the first 20 rows (the result is a NumPy array, so .head() is not available).
# Column 0 holds the values and column 1 the indicator: 1 marks a row whose value was missing
# and has been filled, 0 a row left unchanged. Row 18, for example, now holds the mean 9.70145631.
imputed_ind[:20]

array([[ 9.        ,  0.        ],
       [ 9.        ,  0.        ],
       [10.        ,  0.        ],
       [ 6.        ,  0.        ],
       [ 9.        ,  0.        ],
       [14.        ,  0.        ],
       [12.        ,  0.        ],
       [10.        ,  0.        ],
       [13.        ,  0.        ],
       [10.        ,  0.        ],
       [12.        ,  0.        ],
       [ 9.        ,  0.        ],
       [ 5.        ,  0.        ],
       [11.        ,  0.        ],
       [ 9.        ,  0.        ],
       [12.        ,  0.        ],
       [ 8.        ,  0.        ],
       [ 6.        ,  0.        ],
       [ 9.70145631,  1.        ],
       [14.        ,  0.        ]])

In [58]:
# Two columns: the imputed values plus the missing-value indicator.
imputed_ind.shape

(434, 2)

In [61]:
# imputed_ind has two columns, so assign it to two DataFrame columns: the imputed values
# overwrite 'compulsory_education_years', and the indicator goes into the new
# 'compulsory_education_years_missing' column, which records which rows were imputed.
wdi[['compulsory_education_years','compulsory_education_years_missing']] = imputed_ind

In [63]:
# The DataFrame now keeps a record of which rows were imputed: a 1.0 in the
# 'compulsory_education_years_missing' column (row 18, for example) marks a value that was
# filled in. This is what SimpleImputer's add_indicator option provides.
wdi[['compulsory_education_years','compulsory_education_years_missing']].head(20)

Unnamed: 0,compulsory_education_years,compulsory_education_years_missing
0,9.0,0.0
1,9.0,0.0
2,10.0,0.0
3,6.0,0.0
4,9.0,0.0
5,14.0,0.0
6,12.0,0.0
7,10.0,0.0
8,13.0,0.0
9,10.0,0.0
