## Identifying Missing Data

In [1]:
import pandas as pd

In [2]:
# Load the saved wdi DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
wdi = pd.read_pickle("wdi.pkl")

### How to spot missing data

In [4]:
# info() lists a non-null count for every column next to the total number of entries.
# Any column whose non-null count is below the total (e.g. atms_per_100000) has missing values.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    434 non-null    object 
 1   access_to_electricity_pct       434 non-null    float64
 2   atms_per_100000                 406 non-null    float64
 3   compulsory_education_years      412 non-null    float64
 4   health_expenditure_pct_of_gdp   427 non-null    float64
 5   gdp_per_capita_usd              434 non-null    float64
 6   gdp_per_capita_ppp              434 non-null    float64
 7   life_expectancy_female          434 non-null    float64
 8   life_expectancy_male            434 non-null    float64
 9   life_expectancy                 434 non-null    float64
 10  population_density              432 non-null    float64
 11  population                      434 non-null    float64
 12  alcohol_consumption_per_capita  215 

In [13]:
# Inspecting this column and comparing it with the original CSV shows that blank cells in the file were read in as NaN.
wdi[['alcohol_consumption_per_capita']]

Unnamed: 0,alcohol_consumption_per_capita
0,
1,
2,
3,
4,
...,...
429,8.66000
430,
431,6.18124
432,6.54000


In [12]:
# Alternative way to load the data: read_csv's na_values parameter lets us name extra values
# in the file (here -999 and 'missing') that should be read in as NaN.
# We will not be using this DataFrame; it is only included because it is an interesting option.

replace_nan = pd.read_csv('world_development_indicators.csv', na_values=[-999,'missing'])

In [15]:
# describe() returns summary statistics with one column per variable; .T transposes the result
# so that each variable becomes a row, because the untransposed frame was too wide to read comfortably.
wdi.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
access_to_electricity_pct,434.0,84.21552,24.38962,9.3,70.8425,98.91294,100.0,100.0
atms_per_100000,406.0,48.12857,43.07826,1.213235,14.30535,39.41107,65.05932,272.5928
compulsory_education_years,412.0,9.701456,2.206372,5.0,9.0,10.0,11.0,16.0
health_expenditure_pct_of_gdp,427.0,6.43335,2.622327,2.138482,4.4933,6.006967,7.960797,17.00361
gdp_per_capita_usd,434.0,13599.89,18647.56,271.752044,1971.836,5783.711,15561.55,116654.3
gdp_per_capita_ppp,434.0,19925.23,19970.7,773.571858,4989.814,13108.45,28646.34,116786.5
life_expectancy_female,434.0,74.56947,7.676443,54.354,69.05075,76.0885,80.1445,87.7
life_expectancy_male,434.0,69.86652,7.144252,49.837,65.29775,70.8135,74.6595,82.3
life_expectancy,434.0,72.17627,7.334379,52.24,66.98875,73.5855,77.1625,84.93415
population_density,432.0,209.9738,747.2865,2.004286,35.04564,69.61068,133.9212,7952.998


In [16]:
# isna() returns a DataFrame of the same shape with True wherever a value is missing.
wdi.isna()

Unnamed: 0,country_name,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,country_category,is_region
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
430,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False
431,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
432,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [17]:
# notna() is the inverse of isna(): True wherever a value is present, False where it is missing.
wdi.notna()

Unnamed: 0,country_name,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,country_category,is_region
0,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
430,True,True,True,True,False,True,True,True,True,True,True,True,False,True,True,True,True,True,True
431,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
432,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [18]:
# Summing the boolean mask down each column counts its missing values (each True counts as 1).
wdi.isna().sum()

country_name                        0
access_to_electricity_pct           0
atms_per_100000                    28
compulsory_education_years         22
health_expenditure_pct_of_gdp       7
gdp_per_capita_usd                  0
gdp_per_capita_ppp                  0
life_expectancy_female              0
life_expectancy_male                0
life_expectancy                     0
population_density                  2
population                          0
alcohol_consumption_per_capita    219
unemployment_rate_female            0
unemployment_rate_male              0
unemployment_rate                   0
year                                0
country_category                  152
is_region                           0
dtype: int64

In [22]:
# value_counts() leaves NaN out by default; passing dropna=False adds a count of the missing values as well.
wdi['alcohol_consumption_per_capita'].value_counts(dropna=False)

alcohol_consumption_per_capita
NaN          219
0.690000       3
9.230000       2
12.030000      2
0.682988       2
            ... 
1.110000       1
5.380000       1
6.890000       1
2.730000       1
4.670000       1
Name: count, Length: 205, dtype: int64

In [24]:
# Summing with axis=1 adds across the columns, giving the number of missing values in each row.
# For example, row 430 has 2 missing values.
wdi.isna().sum(axis=1)

0      1
1      1
2      1
3      1
4      2
      ..
429    0
430    2
431    1
432    0
433    0
Length: 434, dtype: int64

In [25]:
# Keep the per-row count of missing values so it can be reused for filtering.
num_missing_by_row = wdi.isna().sum(axis="columns")

In [30]:
# Count the rows that have at least one missing value; here 319 rows of the DataFrame do.
num_missing_by_row.gt(0).sum()

319

In [32]:
# Select every row of wdi that has at least one missing value, using the per-row counts stored in num_missing_by_row.
wdi.loc[num_missing_by_row > 0]

Unnamed: 0,country_name,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,country_category,is_region
0,Afghanistan,97.700000,1.213235,9.0,11.777236,519.884773,2058.383832,65.656000,62.701000,64.130000,55.595993,3.629640e+07,,14.090000,10.416000,11.184000,2017,DEVELOPING,0
1,Albania,100.000000,31.714076,9.0,5.010597,4531.020806,13037.010016,80.148000,76.601000,78.333000,104.870693,2.873457e+06,,12.563000,14.590000,13.750000,2017,DEVELOPING,0
2,Algeria,100.000000,9.130677,10.0,6.380329,4111.294110,11737.409353,77.735000,75.307000,76.499000,17.377715,4.138920e+07,,21.114000,10.021000,11.996000,2017,DEVELOPING,0
3,Angola,41.962894,19.079250,6.0,2.791503,4095.812942,7310.901738,63.252000,57.677000,60.379000,23.916538,2.981675e+07,,7.467000,6.769000,7.119000,2017,DEVELOPING,0
4,Arab World,90.283638,27.360620,9.0,5.561266,6108.588220,14562.367966,73.493366,69.921250,71.622526,36.669804,4.118990e+08,,20.361396,8.097088,10.641313,2017,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Upper middle income,99.423212,54.376093,10.0,5.739787,8906.243876,16701.291315,78.040738,72.790446,75.328985,49.316919,2.837704e+09,6.170874,5.809633,5.812897,5.811807,2018,,1
426,Uruguay,100.000000,120.067132,14.0,9.202700,17277.970111,22116.786596,81.359000,73.958000,77.770000,19.708028,3.449299e+06,6.920000,10.089000,6.873000,8.336000,2018,,0
428,Vanuatu,61.864769,48.050330,,3.373726,3125.257989,3204.679541,71.990000,68.847000,70.323000,24.009844,2.926800e+05,2.250000,4.898000,3.958000,4.368000,2018,DEVELOPING,0
430,West Bank and Gaza,100.000000,24.735190,10.0,,3562.330943,6472.121785,75.595000,72.263000,73.895000,758.984551,4.569087e+06,,41.849998,22.393999,26.256001,2018,DEVELOPING,0
