In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
# Loading gdp dataset to pd, showing the shape of the df, 6868 rows and 4 columns
gdp = pd.read_csv('../data/gdp_percapita.csv', engine = 'python', skipfooter = 2)
print(gdp.head(6))
gdp.shape

  Country or Area  Year        Value  Value Footnotes
0     Afghanistan  2019  2065.036398              NaN
1     Afghanistan  2018  2033.779002              NaN
2     Afghanistan  2017  2058.383832              NaN
3     Afghanistan  2016  2057.062164              NaN
4     Afghanistan  2015  2068.265904              NaN
5     Afghanistan  2014  2102.385234              NaN


(6868, 4)

In [4]:
# checking the first and last 5 of gdp.df
gdp

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2019,2065.036398,
1,Afghanistan,2018,2033.779002,
2,Afghanistan,2017,2058.383832,
3,Afghanistan,2016,2057.062164,
4,Afghanistan,2015,2068.265904,
...,...,...,...,...
6863,Zimbabwe,1994,3263.934978,
6864,Zimbabwe,1993,3033.504852,
6865,Zimbabwe,1992,3054.889178,
6866,Zimbabwe,1991,3426.598094,


In [5]:
# Loading internet dataset to pd, showing the shape of the df, 4495 rows and 4 columns
internet = pd.read_csv('../data/internet_use.csv', nrows = 4495)
print(internet.tail(6))
internet.shape

     Country or Area  Year     Value  Value Footnotes
4489        Zimbabwe  1998  0.081648              NaN
4490        Zimbabwe  1997  0.033080              NaN
4491        Zimbabwe  1996  0.016790              NaN
4492        Zimbabwe  1995  0.007684              NaN
4493        Zimbabwe  1994  0.001739              NaN
4494        Zimbabwe  1990  0.000000              NaN


(4495, 4)

In [6]:
# checking the internet.df, first and last 5
internet

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2014,6.390000,
1,Afghanistan,2013,5.900000,
2,Afghanistan,2012,5.454545,
3,Afghanistan,2011,5.000000,
4,Afghanistan,2010,4.000000,
...,...,...,...,...
4490,Zimbabwe,1997,0.033080,
4491,Zimbabwe,1996,0.016790,
4492,Zimbabwe,1995,0.007684,
4493,Zimbabwe,1994,0.001739,


In [7]:
# Datatypes for gdp.df
gdp.dtypes

Country or Area     object
Year                 int64
Value              float64
Value Footnotes    float64
dtype: object

In [8]:
# Datatypes  for internet.df
internet.dtypes

Country or Area     object
Year                 int64
Value              float64
Value Footnotes    float64
dtype: object

In [9]:
# Last 10 rows for gdp.df
gdp.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
6858,Zimbabwe,1999,3600.849291,
6859,Zimbabwe,1998,3653.920016,
6860,Zimbabwe,1997,3580.048793,
6861,Zimbabwe,1996,3520.430146,
6862,Zimbabwe,1995,3226.41393,
6863,Zimbabwe,1994,3263.934978,
6864,Zimbabwe,1993,3033.504852,
6865,Zimbabwe,1992,3054.889178,
6866,Zimbabwe,1991,3426.598094,
6867,Zimbabwe,1990,3324.348171,


In [10]:
# Last 10 rows for internet.df
internet.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
4485,Zimbabwe,2002,3.994356,
4486,Zimbabwe,2001,0.799846,
4487,Zimbabwe,2000,0.401434,
4488,Zimbabwe,1999,0.161676,
4489,Zimbabwe,1998,0.081648,
4490,Zimbabwe,1997,0.03308,
4491,Zimbabwe,1996,0.01679,
4492,Zimbabwe,1995,0.007684,
4493,Zimbabwe,1994,0.001739,
4494,Zimbabwe,1990,0.0,


In [11]:
# Dropping 'value footnotes' from gdp.df
gdp = gdp.drop('Value Footnotes', axis = 1)
gdp

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2019,2065.036398
1,Afghanistan,2018,2033.779002
2,Afghanistan,2017,2058.383832
3,Afghanistan,2016,2057.062164
4,Afghanistan,2015,2068.265904
...,...,...,...
6863,Zimbabwe,1994,3263.934978
6864,Zimbabwe,1993,3033.504852
6865,Zimbabwe,1992,3054.889178
6866,Zimbabwe,1991,3426.598094


In [12]:
# Dropping 'value footnotes' from internet.df
internet = internet.drop('Value Footnotes', axis = 1)
internet

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2014,6.390000
1,Afghanistan,2013,5.900000
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.000000
4,Afghanistan,2010,4.000000
...,...,...,...
4490,Zimbabwe,1997,0.033080
4491,Zimbabwe,1996,0.016790
4492,Zimbabwe,1995,0.007684
4493,Zimbabwe,1994,0.001739


In [13]:
# Renaming columns in gdp.df
gdp.columns = [ 'Country', 'Year', 'GDP_Per_Capita' ]
gdp

Unnamed: 0,Country,Year,GDP_Per_Capita
0,Afghanistan,2019,2065.036398
1,Afghanistan,2018,2033.779002
2,Afghanistan,2017,2058.383832
3,Afghanistan,2016,2057.062164
4,Afghanistan,2015,2068.265904
...,...,...,...
6863,Zimbabwe,1994,3263.934978
6864,Zimbabwe,1993,3033.504852
6865,Zimbabwe,1992,3054.889178
6866,Zimbabwe,1991,3426.598094


In [14]:
# Renaming columns in internet.df
internet.columns = ['Country', 'Year', 'Internet_Users_Pct']
internet

Unnamed: 0,Country,Year,Internet_Users_Pct
0,Afghanistan,2014,6.390000
1,Afghanistan,2013,5.900000
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.000000
4,Afghanistan,2010,4.000000
...,...,...,...
4490,Zimbabwe,1997,0.033080
4491,Zimbabwe,1996,0.016790
4492,Zimbabwe,1995,0.007684
4493,Zimbabwe,1994,0.001739


In [15]:
# merging
gdp_and_internet_use = gdp.merge(internet, how = 'outer', on = ['Country', 'Year'])
gdp_and_internet_use

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2019,2065.036398,
1,Afghanistan,2018,2033.779002,
2,Afghanistan,2017,2058.383832,
3,Afghanistan,2016,2057.062164,
4,Afghanistan,2015,2068.265904,
...,...,...,...,...
7900,Yemen,1999,,0.056629
7901,Yemen,1998,,0.023323
7902,Yemen,1997,,0.015025
7903,Yemen,1996,,0.000621


In [16]:
# Reset index after merge
gdp_and_internet_use.reset_index(drop = True)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2019,2065.036398,
1,Afghanistan,2018,2033.779002,
2,Afghanistan,2017,2058.383832,
3,Afghanistan,2016,2057.062164,
4,Afghanistan,2015,2068.265904,
...,...,...,...,...
7900,Yemen,1999,,0.056629
7901,Yemen,1998,,0.023323
7902,Yemen,1997,,0.015025
7903,Yemen,1996,,0.000621


In [17]:
gdp_and_internet_use.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7905 entries, 0 to 7904
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Country             7905 non-null   object 
 1   Year                7905 non-null   int64  
 2   GDP_Per_Capita      6868 non-null   float64
 3   Internet_Users_Pct  4495 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 308.8+ KB


In [18]:
gdp_and_internet_use.tail(5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
7900,Yemen,1999,,0.056629
7901,Yemen,1998,,0.023323
7902,Yemen,1997,,0.015025
7903,Yemen,1996,,0.000621
7904,Yemen,1990,,0.0


In [19]:
# Subset to keep '2004, 2009, 2014'
chosen_years = gdp_and_internet_use[gdp_and_internet_use['Year'].isin([2004, 2009, 2014])]
chosen_years

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
5,Afghanistan,2014,2102.385234,6.390000
10,Afghanistan,2009,1758.904043,3.550000
15,Afghanistan,2004,1200.278256,0.105809
23,Albania,2014,11623.866679,60.100000
28,Albania,2009,10346.864199,41.200000
...,...,...,...,...
7868,Virgin Islands (U.S.),2009,,27.396510
7873,Virgin Islands (U.S.),2004,,27.377009
7885,Yemen,2014,,22.550000
7890,Yemen,2009,,9.960000


In [20]:
chosen_years.head(10)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
5,Afghanistan,2014,2102.385234,6.39
10,Afghanistan,2009,1758.904043,3.55
15,Afghanistan,2004,1200.278256,0.105809
23,Albania,2014,11623.866679,60.1
28,Albania,2009,10346.864199,41.2
33,Albania,2004,7604.838002,2.420388
53,Algeria,2014,11512.705405,18.09
58,Algeria,2009,10782.361398,11.23
63,Algeria,2004,10058.435939,4.634475
83,Angola,2014,8239.828837,21.26


In [21]:
# Reset after subset
chosen_years.reset_index(drop = True)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2014,2102.385234,6.390000
1,Afghanistan,2009,1758.904043,3.550000
2,Afghanistan,2004,1200.278256,0.105809
3,Albania,2014,11623.866679,60.100000
4,Albania,2009,10346.864199,41.200000
...,...,...,...,...
825,Virgin Islands (U.S.),2009,,27.396510
826,Virgin Islands (U.S.),2004,,27.377009
827,Yemen,2014,,22.550000
828,Yemen,2009,,9.960000


In [22]:
#2004 df
data_2004 = chosen_years[chosen_years['Year'].isin([2004])]
print(data_2004)


                    Country  Year  GDP_Per_Capita  Internet_Users_Pct
15              Afghanistan  2004     1200.278256            0.105809
33                  Albania  2004     7604.838002            2.420388
63                  Algeria  2004    10058.435939            4.634475
93                   Angola  2004     5593.241537            0.464815
123     Antigua and Barbuda  2004    18745.681534           24.266544
...                     ...   ...             ...                 ...
7771     T.F.Y.R. Macedonia  2004             NaN           24.440000
7829              Venezuela  2004             NaN            8.404470
7853               Viet Nam  2004             NaN            7.642409
7873  Virgin Islands (U.S.)  2004             NaN           27.377009
7895                  Yemen  2004             NaN            0.881223

[276 rows x 4 columns]


In [23]:
#Checking the data type
type(data_2004)

pandas.core.frame.DataFrame

In [24]:
# Reset after making new df
data_2004.reset_index(drop = True)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2004,1200.278256,0.105809
1,Albania,2004,7604.838002,2.420388
2,Algeria,2004,10058.435939,4.634475
3,Angola,2004,5593.241537,0.464815
4,Antigua and Barbuda,2004,18745.681534,24.266544
...,...,...,...,...
271,T.F.Y.R. Macedonia,2004,,24.440000
272,Venezuela,2004,,8.404470
273,Viet Nam,2004,,7.642409
274,Virgin Islands (U.S.),2004,,27.377009


In [25]:
# 2009 df
data_2009 = chosen_years[chosen_years['Year'].isin([2009])]
print(data_2009)

                    Country  Year  GDP_Per_Capita  Internet_Users_Pct
10              Afghanistan  2009     1758.904043             3.55000
28                  Albania  2009    10346.864199            41.20000
58                  Algeria  2009    10782.361398            11.23000
88                   Angola  2009     7643.493749             6.00000
118     Antigua and Barbuda  2009    20046.839017            42.00000
...                     ...   ...             ...                 ...
7766     T.F.Y.R. Macedonia  2009             NaN            51.77000
7824              Venezuela  2009             NaN            32.70000
7848               Viet Nam  2009             NaN            26.55000
7868  Virgin Islands (U.S.)  2009             NaN            27.39651
7890                  Yemen  2009             NaN             9.96000

[277 rows x 4 columns]


In [26]:
# checking data type
type(data_2009)

pandas.core.frame.DataFrame

In [27]:
data_2009.reset_index(drop = True)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2009,1758.904043,3.55000
1,Albania,2009,10346.864199,41.20000
2,Algeria,2009,10782.361398,11.23000
3,Angola,2009,7643.493749,6.00000
4,Antigua and Barbuda,2009,20046.839017,42.00000
...,...,...,...,...
272,T.F.Y.R. Macedonia,2009,,51.77000
273,Venezuela,2009,,32.70000
274,Viet Nam,2009,,26.55000
275,Virgin Islands (U.S.),2009,,27.39651


In [28]:
data_2014 = chosen_years[chosen_years['Year'].isin([2014])]
print(data_2014.head())


                 Country  Year  GDP_Per_Capita  Internet_Users_Pct
5            Afghanistan  2014     2102.385234                6.39
23               Albania  2014    11623.866679               60.10
53               Algeria  2014    11512.705405               18.09
83                Angola  2014     8239.828837               21.26
113  Antigua and Barbuda  2014    18104.241470               64.00


In [29]:
type(data_2014)

pandas.core.frame.DataFrame

In [30]:
data_2014.reset_index(drop = True)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2014,2102.385234,6.39
1,Albania,2014,11623.866679,60.10
2,Algeria,2014,11512.705405,18.09
3,Angola,2014,8239.828837,21.26
4,Antigua and Barbuda,2014,18104.241470,64.00
...,...,...,...,...
272,T.F.Y.R. Macedonia,2014,,68.06
273,Venezuela,2014,,57.00
274,Viet Nam,2014,,48.31
275,Virgin Islands (U.S.),2014,,50.07
