## Clean Gapminder

1. Rename columns: use the method from the example above to properly name any columns that seem mislabeled in the population dataset. The population dataset was given in the EDA lesson warmer

2. Missing data: first check and see which and how much data is missing in the population dataset

3. Remove missing data: drop all observations with missing data

4. Filter for relevant data: filter the dataset that it begins with the year 1950

5. Make data persistant: save the dataset as a .csv file in your data folder as they will be used for the week’s project

6. Repeat for the the life_expectancy, and fertility_rate datasets which are available below

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

### Cleaning population.csv

In [3]:
# read the file population.csv
df_pop = pd.read_csv('../../data/population.csv')
df_pop

Unnamed: 0,Total population,year,population
0,Abkhazia,1800,
1,Afghanistan,1800,3280000.0
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,410445.0
4,Algeria,1800,2503218.0
...,...,...,...
22270,Northern Marianas,2015,
22271,South Georgia and the South Sandwich Islands,2015,
22272,US Minor Outlying Islands,2015,
22273,Virgin Islands,2015,


In [4]:
#rename columns
df_pop.columns

Index(['Total population', 'year', 'population'], dtype='object')

In [5]:
df_pop.rename(columns={'Total population':'country'},inplace=True)

In [6]:
df_pop

Unnamed: 0,country,year,population
0,Abkhazia,1800,
1,Afghanistan,1800,3280000.0
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,410445.0
4,Algeria,1800,2503218.0
...,...,...,...
22270,Northern Marianas,2015,
22271,South Georgia and the South Sandwich Islands,2015,
22272,US Minor Outlying Islands,2015,
22273,Virgin Islands,2015,


In [7]:
# check missing data
df_pop.isnull().sum()

country          0
year             0
population    2099
dtype: int64

In [8]:
# remove missing data
df_pop.dropna(inplace=True)

In [9]:
df_pop

Unnamed: 0,country,year,population
1,Afghanistan,1800,3280000.0
3,Albania,1800,410445.0
4,Algeria,1800,2503218.0
5,American Samoa,1800,8170.0
6,Andorra,1800,2654.0
...,...,...,...
22256,Zambia,2015,16211767.0
22257,Zimbabwe,2015,15602751.0
22259,South Sudan,2015,12339812.0
22260,Curaçao,2015,157203.0


In [10]:
#Filter for relevant data: filter the dataset that it begins with the year 1950
df_pop_g1950 = df_pop[ df_pop['year'] > 1950 ]
df_pop_g1950 


Unnamed: 0,country,year,population
4401,Afghanistan,1951,7839426.0
4402,Akrotiri and Dhekelia,1951,10737.0
4403,Albania,1951,1287499.0
4404,Algeria,1951,9039913.0
4405,American Samoa,1951,19295.0
...,...,...,...
22256,Zambia,2015,16211767.0
22257,Zimbabwe,2015,15602751.0
22259,South Sudan,2015,12339812.0
22260,Curaçao,2015,157203.0


In [11]:
# save the file as csv
df_pop_g1950.to_csv('../../data/population_cleaned.csv', index=False )

### Cleaning life_expectancy.csv

In [12]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install --upgrade xlwt

Note: you may need to restart the kernel to use updated packages.


In [14]:
df_life_exp = pd.read_excel('../../data/life_expectancy.xls')
df_life_exp

Unnamed: 0,Life expectancy,year,life expectancy
0,Abkhazia,1800,
1,Afghanistan,1800,28.21
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,35.40
4,Algeria,1800,28.82
...,...,...,...
56415,Yugoslavia,2016,
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69
56418,Åland,2016,


In [15]:
#change column names
df_life_exp.columns

Index(['Life expectancy', 'year', 'life expectancy'], dtype='object')

In [16]:
df_life_exp.rename(columns={'Life expectancy':'country'},inplace=True)
df_life_exp

Unnamed: 0,country,year,life expectancy
0,Abkhazia,1800,
1,Afghanistan,1800,28.21
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,35.40
4,Algeria,1800,28.82
...,...,...,...
56415,Yugoslavia,2016,
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69
56418,Åland,2016,


In [17]:
# check missing data
df_life_exp.isnull().sum()

country                0
year                   0
life expectancy    12563
dtype: int64

In [18]:
# remove missing data
df_life_exp.dropna(inplace=True)

In [19]:
df_life_exp

Unnamed: 0,country,year,life expectancy
1,Afghanistan,1800,28.21
3,Albania,1800,35.40
4,Algeria,1800,28.82
7,Angola,1800,26.98
9,Antigua and Barbuda,1800,33.54
...,...,...,...
56411,Virgin Islands (U.S.),2016,80.82
56414,Yemen,2016,64.92
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69


In [20]:
#Filter for relevant data: filter the dataset that it begins with the year 1950
df_life_exp_g1950 = df_life_exp[ df_life_exp['year'] > 1950 ] 
df_life_exp_g1950

Unnamed: 0,country,year,life expectancy
39261,Afghanistan,1951,27.13
39263,Albania,1951,54.72
39264,Algeria,1951,43.03
39267,Angola,1951,31.05
39269,Antigua and Barbuda,1951,58.26
...,...,...,...
56411,Virgin Islands (U.S.),2016,80.82
56414,Yemen,2016,64.92
56416,Zambia,2016,57.10
56417,Zimbabwe,2016,61.69


In [21]:
# save the file as csv #couldn't save it as xls
df_life_exp_g1950.to_csv('../../data/life_expectancy_cleaned.csv', index=False)

### Cleaning fertility_rate.csv

In [22]:
df_fert_rate = pd.read_csv('../../data/fertility_rate.csv')
df_fert_rate


Unnamed: 0,Total fertility rate,year,fertility
0,Abkhazia,1800,
1,Afghanistan,1800,7.00
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,4.60
4,Algeria,1800,6.99
...,...,...,...
56154,Yemen,2015,3.83
56155,Yugoslavia,2015,
56156,Zambia,2015,5.59
56157,Zimbabwe,2015,3.35


In [23]:
#change columns name
df_fert_rate.columns

Index(['Total fertility rate', 'year', 'fertility'], dtype='object')

In [24]:
df_fert_rate.rename(columns={'Total fertility rate':'country'},inplace=True)
df_fert_rate

Unnamed: 0,country,year,fertility
0,Abkhazia,1800,
1,Afghanistan,1800,7.00
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,4.60
4,Algeria,1800,6.99
...,...,...,...
56154,Yemen,2015,3.83
56155,Yugoslavia,2015,
56156,Zambia,2015,5.59
56157,Zimbabwe,2015,3.35


In [25]:
# check missing data
df_fert_rate.isnull().sum()

country          0
year             0
fertility    12747
dtype: int64

In [26]:
# remove missing data
df_fert_rate.dropna(inplace=True)

In [27]:
df_fert_rate

Unnamed: 0,country,year,fertility
1,Afghanistan,1800,7.00
3,Albania,1800,4.60
4,Algeria,1800,6.99
7,Angola,1800,6.93
9,Antigua and Barbuda,1800,5.00
...,...,...,...
56150,Vietnam,2015,1.70
56151,Virgin Islands (U.S.),2015,2.45
56154,Yemen,2015,3.83
56156,Zambia,2015,5.59


In [28]:
#Filter for relevant data: filter the dataset that it begins with the year 1950
df_fert_rate_g1950 = df_fert_rate[ df_fert_rate['year'] > 1950 ]
df_fert_rate_g1950

Unnamed: 0,country,year,fertility
39261,Afghanistan,1951,7.67
39263,Albania,1951,5.90
39264,Algeria,1951,7.65
39267,Angola,1951,6.94
39269,Antigua and Barbuda,1951,4.46
...,...,...,...
56150,Vietnam,2015,1.70
56151,Virgin Islands (U.S.),2015,2.45
56154,Yemen,2015,3.83
56156,Zambia,2015,5.59


In [29]:
# save the file as csv 
df_fert_rate_g1950.to_csv('../../data/fertility_rate_cleaned.csv', index=False)