In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
url = 'https://en.wikipedia.org/wiki/Education_Index'

In [30]:
# reading first table from the url
education = pd.read_html(url)[0]

In [31]:
education.sample(5)

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61
35,China,0.405,0.411,0.42,0.427,0.427,0.442,0.45,0.457,0.465,...,,,,,,,,,,
90,Korea (Republic of),0.676,0.686,0.694,0.707,0.723,0.737,0.751,0.772,0.768,...,,,,,,,,,,
94,Latvia,0.604,0.61,0.613,0.609,0.609,0.619,0.631,0.645,0.666,...,,,,,,,,,,
78,Indonesia,0.389,0.391,0.394,0.398,0.409,0.42,0.441,0.467,0.485,...,,,,,,,,,,
154,Slovenia,0.696,0.699,0.709,0.708,0.714,0.724,0.732,0.744,0.761,...,,,,,,,,,,


In [32]:
# Data Cleaning

In [33]:
# removing columns with almost all values as Null
education.dropna(axis=1, thresh=10, inplace=True)

In [34]:
# checking dtypes of different columns
education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 31 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  188 non-null    object 
 1   1990     143 non-null    float64
 2   1991     143 non-null    float64
 3   1992     143 non-null    float64
 4   1993     143 non-null    float64
 5   1994     143 non-null    float64
 6   1995     147 non-null    float64
 7   1996     147 non-null    float64
 8   1997     147 non-null    float64
 9   1998     147 non-null    float64
 10  1999     150 non-null    float64
 11  2000     173 non-null    float64
 12  2001     173 non-null    float64
 13  2002     174 non-null    float64
 14  2003     176 non-null    float64
 15  2004     178 non-null    float64
 16  2005     185 non-null    float64
 17  2006     185 non-null    float64
 18  2007     185 non-null    float64
 19  2008     185 non-null    float64
 20  2009     185 non-null    float64
 21  2010     187 non

In [35]:
# converting '2019' column dtype to float
education['2019'] = education['2019'].str.replace('-', '0')
education['2019'] = education['2019'].astype(float)
education.at[153,'2019'] = np.mean(education['2019'])

In [36]:
# handling Null values
education.fillna(method='bfill', axis=0, inplace=True)

In [37]:
education.sample(5)

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
106,Mali,0.081,0.085,0.091,0.098,0.106,0.114,0.123,0.136,0.146,...,0.277,0.283,0.284,0.279,0.285,0.274,0.278,0.282,0.286,0.286
37,Comoros,0.455,0.455,0.457,0.458,0.459,0.46,0.459,0.458,0.456,...,0.434,0.457,0.465,0.468,0.47,0.471,0.474,0.476,0.479,0.482
124,Niger,0.082,0.084,0.086,0.09,0.093,0.097,0.101,0.105,0.109,...,0.18,0.189,0.2,0.209,0.217,0.226,0.233,0.245,0.247,0.249
183,Venezuela,0.444,0.457,0.466,0.471,0.475,0.48,0.484,0.487,0.492,...,0.674,0.705,0.708,0.724,0.725,0.724,0.722,0.7,0.7,0.7
166,Tanzania (United Republic of),0.273,0.278,0.28,0.284,0.285,0.287,0.286,0.288,0.297,...,0.416,0.415,0.426,0.415,0.418,0.431,0.436,0.431,0.425,0.429


In [38]:
# checkink if all values are not null
education.notna().any()

Country    True
1990       True
1991       True
1992       True
1993       True
1994       True
1995       True
1996       True
1997       True
1998       True
1999       True
2000       True
2001       True
2002       True
2003       True
2004       True
2005       True
2006       True
2007       True
2008       True
2009       True
2010       True
2011       True
2012       True
2013       True
2014       True
2015       True
2016       True
2017       True
2018       True
2019       True
dtype: bool

In [39]:
# top 5 countries with highest education index
education.set_index('Country').loc[:,'2019'].nlargest(5)

Country
Germany           0.943
China             0.937
Norway            0.930
United Kingdom    0.928
Finland           0.927
Name: 2019, dtype: float64

In [40]:
# defining a function to find growth rate in education index for a country between two time periods
def growth_rate(start_year, end_year, country):
    index = education[education.Country == country].index[0]
    return (education.at[index, str(end_year)] - education.at[index, str(start_year)]) / education.at[index, str(end_year)] * 100

In [41]:
growth_rate(2010, 2019, 'India')

13.873873873873885

In [42]:
# top 5 countries with highest growth rate between 2010 and 2019
rate_series = pd.Series([growth_rate(2010, 2019, country) for country in education.Country],
                       index = education.Country)
rate_series.nlargest(5)

Country
Marshall Islands    52.192362
China               35.752401
Niger               27.710843
Burkina Faso        25.641026
Bhutan              21.774194
dtype: float64

In [43]:
education[education.Country == 'India']

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
77,India,0.311,0.317,0.324,0.331,0.338,0.344,0.351,0.358,0.365,...,0.478,0.491,0.505,0.514,0.53,0.54,0.544,0.558,0.553,0.555
