## Removing Unnecessary Columns or Rows

In [3]:
import pandas as pd
# Load the World Development Indicators CSV (path is relative to the working directory).
wdi = pd.read_csv('world_development_indicators.csv')

## Check duplicate columns

In [4]:
# List every column with its non-null count and dtype to spot candidate duplicate columns.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446 entries, 0 to 445
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    446 non-null    object 
 1   access_to_electricity_pct       446 non-null    float64
 2   atms_per_100000                 417 non-null    float64
 3   compulsory_education_years      424 non-null    float64
 4   health_expenditure_pct_of_gdp   439 non-null    float64
 5   gdp_per_capita_usd              446 non-null    float64
 6   gdp_per_capita_ppp              446 non-null    float64
 7   life_expectancy_female          446 non-null    float64
 8   life_expectancy_male            446 non-null    float64
 9   life_expectancy                 446 non-null    float64
 10  population_density              444 non-null    float64
 11  population                      446 non-null    float64
 12  alcohol_consumption_per_capita  220 

In [5]:
# View the suspect columns side by side: two name columns, the code column, and `planet`.
wdi.loc[:, ['country_name', 'Country Name', 'Country Code', 'planet']]

Unnamed: 0,country_name,Country Name,Country Code,planet
0,Afghanistan,Afghanistan,AFG,Earth
1,Albania,Albania,ALB,Earth
2,Algeria,Algeria,DZA,Earth
3,Angola,Angola,AGO,Earth
4,Arab World,Arab World,ARB,Earth
...,...,...,...,...
441,Ukraine,Ukraine,UKR,Earth
442,Philippines,Philippines,PHL,Earth
443,Indonesia,Indonesia,IDN,Earth
444,United Arab Emirates,United Arab Emirates,ARE,Earth


In [6]:
# Distinct values per column: a column with a single distinct value carries no information,
# and columns with matching counts are candidates for being duplicates of each other.
wdi.nunique()

country_name                      217
access_to_electricity_pct         240
atms_per_100000                   359
compulsory_education_years         16
health_expenditure_pct_of_gdp     421
gdp_per_capita_usd                430
gdp_per_capita_ppp                430
life_expectancy_female            390
life_expectancy_male              402
life_expectancy                   425
population_density                428
population                        430
alcohol_consumption_per_capita    204
unemployment_rate_female          422
unemployment_rate_male            430
unemployment_rate                 428
year                                2
Country Name                      217
Country Code                      217
country_category                    2
is_region                           2
planet                              1
dtype: int64

In [7]:
# Tally how many rows carry each distinct `planet` value.
wdi['planet'].value_counts()

planet
Earth    446
Name: count, dtype: int64

In [8]:
# normalize=True reports each value's share of the rows (proportions) instead of raw counts.
wdi['planet'].value_counts(normalize=True)

planet
Earth    1.0
Name: proportion, dtype: float64

## Compare columns to see if they are duplicates

In [9]:
# Method 1: keep only the rows where the two name columns disagree.
# An empty result shows they contain exactly the same information.
wdi.loc[wdi['country_name'].ne(wdi['Country Name'])]

Unnamed: 0,country_name,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,...,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,Country Name,Country Code,country_category,is_region,planet


In [10]:
# Method 2: Series.compare lists only the positions where the two columns differ,
# so an empty result means they match everywhere.
wdi['country_name'].compare(other=wdi['Country Name'])

Unnamed: 0,self,other


In [11]:
# Names and codes hold different values, so compare() reports differences here;
# this check alone cannot tell whether the code column is redundant.
wdi['country_name'].compare(other=wdi['Country Code'])

Unnamed: 0,self,other
0,Afghanistan,AFG
1,Albania,ALB
2,Algeria,DZA
3,Angola,AGO
4,Arab World,ARB
...,...,...
441,Ukraine,UKR
442,Philippines,PHL
443,Indonesia,IDN
444,United Arab Emirates,ARE


In [15]:
# Count rows per (name, code) pair. If the number of distinct pairs equals the number
# of distinct names, each name maps to a single code and the code column is redundant.
wdi.value_counts(subset=['country_name', 'Country Code'])

country_name   Country Code
Euro area      EMU             3
Nepal          NPL             3
Timor-Leste    TLS             3
Belarus        BLR             3
Ukraine        UKR             3
                              ..
Guinea         GIN             2
Guinea-Bissau  GNB             2
Guyana         GUY             2
Haiti          HTI             2
Zimbabwe       ZWE             2
Name: count, Length: 217, dtype: int64

## After examining the data, we determined that the columns `Country Name`, `Country Code`, and `planet` can be dropped as duplicate or unnecessary

In [17]:
# Drop the columns found to be redundant: `Country Name` matches `country_name`
# row for row, `Country Code` maps one-to-one onto `country_name`, and `planet`
# holds a single value.
# errors='ignore' keeps this cell safe to re-run: without it, running the cell a
# second time raises KeyError because the columns are already gone.
wdi = wdi.drop(columns=['Country Name', 'Country Code', 'planet'], errors='ignore')

In [18]:
# Re-check the column listing to confirm the three columns are gone.
wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446 entries, 0 to 445
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country_name                    446 non-null    object 
 1   access_to_electricity_pct       446 non-null    float64
 2   atms_per_100000                 417 non-null    float64
 3   compulsory_education_years      424 non-null    float64
 4   health_expenditure_pct_of_gdp   439 non-null    float64
 5   gdp_per_capita_usd              446 non-null    float64
 6   gdp_per_capita_ppp              446 non-null    float64
 7   life_expectancy_female          446 non-null    float64
 8   life_expectancy_male            446 non-null    float64
 9   life_expectancy                 446 non-null    float64
 10  population_density              444 non-null    float64
 11  population                      446 non-null    float64
 12  alcohol_consumption_per_capita  220 

## Duplicate Rows

In [19]:
# Boolean flag per row: True when the row repeats an earlier row in every column.
wdi.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
441     True
442     True
443     True
444     True
445     True
Length: 446, dtype: bool

In [20]:
# Show only the rows flagged as repeats of an earlier row.
wdi.loc[wdi.duplicated()]

Unnamed: 0,country_name,access_to_electricity_pct,atms_per_100000,compulsory_education_years,health_expenditure_pct_of_gdp,gdp_per_capita_usd,gdp_per_capita_ppp,life_expectancy_female,life_expectancy_male,life_expectancy,population_density,population,alcohol_consumption_per_capita,unemployment_rate_female,unemployment_rate_male,unemployment_rate,year,country_category,is_region
434,Belarus,100.0,55.667175,9.0,5.759748,5761.74712,18280.198919,79.2,69.3,74.129268,46.792242,9498264.0,,4.039,7.228,5.65,2017,,0
435,South Asia (IDA & IBRD),89.824336,9.996436,9.5,3.459347,1866.159019,5769.175349,70.505723,67.94939,69.171312,375.730176,1792836000.0,,5.448446,5.031694,5.125435,2017,,1
436,Timor-Leste,80.003647,11.9938,9.0,4.336554,1286.405751,3145.480591,71.092,67.053,69.007,83.608675,1243261.0,,6.017,3.352,4.566,2017,DEVELOPING,0
437,Tajikistan,99.3,,9.0,7.23948,826.621531,3313.519458,73.177,68.727,70.879,65.572714,9100837.0,3.28,10.083,11.759,11.133,2018,DEVELOPING,0
438,Euro area,100.0,85.639518,10.0,10.158286,37140.669853,45788.687122,84.564658,79.433986,81.932215,127.351,341163600.0,,9.445836,8.730447,9.060117,2017,,1
439,IDA total,58.911049,7.22612,9.0,4.080011,1365.854651,3861.811514,65.931954,62.597892,64.2478,73.040032,1633482000.0,3.684085,5.188501,4.321692,4.671868,2018,,1
440,Nepal,92.413193,10.970099,10.0,5.488255,911.444267,3099.306082,71.586,68.71,70.169,192.724967,27627120.0,,1.233,1.604,1.396,2017,DEVELOPING,0
441,Ukraine,100.0,97.386986,11.0,7.723917,3096.817402,12629.141661,76.72,66.69,71.582683,77.029671,44622520.0,8.32,7.424,10.029,8.799,2018,DEVELOPING,0
442,Philippines,93.0,28.135059,11.0,4.44606,3123.234229,8120.842309,75.268,66.971,70.952,352.729195,105173300.0,,2.702,2.459,2.552,2017,DEVELOPING,0
443,Indonesia,98.51,54.376093,9.0,2.87053,3893.846425,11648.542673,73.748,69.375,71.509,147.75219,267663400.0,0.57,4.251,4.679,4.511,2018,DEVELOPING,0


In [22]:
# Keep the first occurrence of each row, discard later repeats, and renumber
# the index from 0 so it has no gaps.
wdi = wdi.loc[~wdi.duplicated()].reset_index(drop=True)

In [23]:
# Originally 446 rows; removing exact duplicate rows leaves 434 (12 dropped).
wdi.shape

(434, 19)

In [24]:
# Save the cleaned frame (redundant columns and duplicate rows removed) as a pickle file.
wdi.to_pickle('wdi.pkl')