In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
url = 'https://en.wikipedia.org/wiki/Education_Index'

In [8]:
# reading first table from the url
education = pd.read_html(url)[0]

In [9]:
education.sample(5)

Unnamed: 0.1,Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
38,38,Congo,0.455,0.455,0.457,0.458,0.459,0.46,0.459,0.458,...,0.5,0.5,0.5,0.511,0.522,0.533,0.539,0.539,0.539,0.543
42,42,Cuba,0.624,0.626,0.627,0.633,0.619,0.621,0.636,0.64,...,0.824,0.813,0.773,0.76,0.76,0.767,0.768,0.776,0.783,0.79
146,146,Samoa,0.577,0.582,0.588,0.593,0.598,0.604,0.609,0.61,...,0.692,0.697,0.694,0.697,0.7,0.698,0.698,0.701,0.701,0.713
92,92,Kyrgyzstan,0.619,0.616,0.615,0.609,0.6,0.597,0.603,0.612,...,0.697,0.697,0.71,0.71,0.719,0.724,0.723,0.724,0.723,0.73
107,107,Malta,0.605,0.61,0.614,0.618,0.62,0.622,0.625,0.631,...,0.769,0.763,0.777,0.789,0.798,0.802,0.813,0.816,0.824,0.825


In [10]:
# Data Cleaning

In [11]:
# removing columns with almost all values as Null
education.dropna(axis=1, thresh=10, inplace=True)

In [12]:
# checking dtypes of different columns
education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  188 non-null    int64  
 1   Country     188 non-null    object 
 2   1990        188 non-null    float64
 3   1991        188 non-null    float64
 4   1992        188 non-null    float64
 5   1993        188 non-null    float64
 6   1994        188 non-null    float64
 7   1995        188 non-null    float64
 8   1996        188 non-null    float64
 9   1997        188 non-null    float64
 10  1998        188 non-null    float64
 11  1999        188 non-null    float64
 12  2000        188 non-null    float64
 13  2001        188 non-null    float64
 14  2002        188 non-null    float64
 15  2003        188 non-null    float64
 16  2004        188 non-null    float64
 17  2005        188 non-null    float64
 18  2006        188 non-null    float64
 19  2007        188 non-null    f

In [14]:
# converting '2019' column dtype to float
education['2019'] = education['2019'].str.replace('-', '0')
education['2019'] = education['2019'].astype(float)
education.at[153,'2019'] = np.mean(education['2019'])

AttributeError: Can only use .str accessor with string values!

In [15]:
# handling Null values
education.fillna(method='bfill', axis=0, inplace=True)

In [16]:
education.sample(5)

Unnamed: 0.1,Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
50,50,Ecuador,0.551,0.549,0.553,0.557,0.561,0.565,0.567,0.57,...,0.646,0.651,0.695,0.697,0.695,0.717,0.704,0.707,0.709,0.702
137,137,Poland,0.665,0.671,0.676,0.698,0.71,0.713,0.723,0.741,...,0.832,0.833,0.817,0.853,0.849,0.855,0.866,0.866,0.866,0.869
174,174,Turkmenistan,0.25,0.254,0.243,0.256,0.262,0.268,0.3,0.333,...,0.624,0.625,0.626,0.626,0.627,0.628,0.628,0.628,0.645,0.653
48,48,Dominica,0.485,0.493,0.501,0.509,0.517,0.525,0.531,0.537,...,0.639,0.639,0.637,0.635,0.634,0.632,0.63,0.629,0.63,0.632
53,53,Equatorial Guinea,0.675,0.676,0.683,0.678,0.69,0.707,0.726,0.747,...,0.438,0.438,0.439,0.439,0.439,0.44,0.44,0.441,0.441,0.467


In [17]:
# checkink if all values are not null
education.notna().any()

Unnamed: 0    True
Country       True
1990          True
1991          True
1992          True
1993          True
1994          True
1995          True
1996          True
1997          True
1998          True
1999          True
2000          True
2001          True
2002          True
2003          True
2004          True
2005          True
2006          True
2007          True
2008          True
2009          True
2010          True
2011          True
2012          True
2013          True
2014          True
2015          True
2016          True
2017          True
2018          True
2019          True
dtype: bool

In [27]:
# top 5 countries with highest education index
education.set_index('Country').loc[:,'2019'].nlargest(5)

Country
Germany           0.943
China             0.937
Norway            0.930
United Kingdom    0.928
Finland           0.927
Name: 2019, dtype: float64

In [None]:
# defining a function to find growth rate in education index for a country between two time periods
def growth_rate(start_year, end_year, country):
    index = education[education.Country == country].index[0]
    return (education.at[index, str(end_year)] - education.at[index, str(start_year)]) / education.at[index, str(end_year)] * 100

In [None]:
growth_rate(2010, 2019, 'India')

In [None]:
# top 5 countries with highest growth rate between 2010 and 2019
rate_series = pd.Series([growth_rate(2010, 2019, country) for country in education.Country],
                       index = education.Country)
rate_series.nlargest(5)

In [None]:
education[education.Country == 'India']