In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline

**First Six Rows in the GDP Dataset**

In [3]:
# Load only the first 6868 data rows of the GDP-per-capita CSV.
# NOTE(review): the nrows cap presumably stops the read before trailing
# non-data lines at the end of the file — confirm against the raw CSV.
gdp_df = pd.read_csv('../Data/gdp_percapita.csv', nrows=6868)

# Preview the first six rows.
gdp_df.head(n=6)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2019,2065.036398,
1,Afghanistan,2018,2033.779002,
2,Afghanistan,2017,2058.383832,
3,Afghanistan,2016,2057.062164,
4,Afghanistan,2015,2068.265904,
5,Afghanistan,2014,2102.385234,


**Shape of the GDP Dataset**

In [4]:
# (number of rows, number of columns) of the loaded GDP frame.
gdp_df.shape

(6868, 4)

The GDP dataset has 6,868 rows and 4 columns.

**First Six Rows in the Internet Use Dataset**

In [5]:
# Load only the first 4495 data rows of the internet-use CSV.
# NOTE(review): the nrows cap presumably stops the read before trailing
# non-data lines at the end of the file — confirm against the raw CSV.
internet_df = pd.read_csv('../Data/internet_use.csv', nrows=4495)

# Preview the first six rows.
internet_df.head(n=6)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2014,6.39,
1,Afghanistan,2013,5.9,
2,Afghanistan,2012,5.454545,
3,Afghanistan,2011,5.0,
4,Afghanistan,2010,4.0,
5,Afghanistan,2009,3.55,


**Shape of the Internet Use Dataset**

In [6]:
# (number of rows, number of columns) of the loaded internet-use frame.
internet_df.shape

(4495, 4)

The internet use dataset has 4,495 rows and 4 columns.

**Datatypes**

&emsp;&emsp;GDP

In [7]:
# Data type pandas inferred for each column of the GDP frame.
gdp_df.dtypes

Country or Area     object
Year                 int64
Value              float64
Value Footnotes    float64
dtype: object

&emsp;&emsp;Internet Use

In [8]:
# Data type pandas inferred for each column of the internet-use frame.
internet_df.dtypes

Country or Area     object
Year                 int64
Value              float64
Value Footnotes    float64
dtype: object

**Last 10 Rows of Each Dataframe**

&emsp;&emsp;GDP

In [9]:
# Show the last 10 rows of the GDP frame to check how the loaded data ends.
gdp_df.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
6858,Zimbabwe,1999,3600.849291,
6859,Zimbabwe,1998,3653.920016,
6860,Zimbabwe,1997,3580.048793,
6861,Zimbabwe,1996,3520.430146,
6862,Zimbabwe,1995,3226.41393,
6863,Zimbabwe,1994,3263.934978,
6864,Zimbabwe,1993,3033.504852,
6865,Zimbabwe,1992,3054.889178,
6866,Zimbabwe,1991,3426.598094,
6867,Zimbabwe,1990,3324.348171,


&emsp;&emsp;Internet Use

In [10]:
# Show the last 10 rows of the internet-use frame to check how the loaded data ends.
internet_df.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
4485,Zimbabwe,2002,3.994356,
4486,Zimbabwe,2001,0.799846,
4487,Zimbabwe,2000,0.401434,
4488,Zimbabwe,1999,0.161676,
4489,Zimbabwe,1998,0.081648,
4490,Zimbabwe,1997,0.03308,
4491,Zimbabwe,1996,0.01679,
4492,Zimbabwe,1995,0.007684,
4493,Zimbabwe,1994,0.001739,
4494,Zimbabwe,1990,0.0,


**Remove the `Value Footnotes` Column from Each DataFrame**

&emsp;&emsp;GDP

In [11]:
# Build a new frame without the 'Value Footnotes' column; gdp_df itself is
# left unchanged because drop() returns a copy by default.
gdp_no_footnotes = gdp_df.drop('Value Footnotes', axis='columns')
gdp_no_footnotes

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2019,2065.036398
1,Afghanistan,2018,2033.779002
2,Afghanistan,2017,2058.383832
3,Afghanistan,2016,2057.062164
4,Afghanistan,2015,2068.265904
...,...,...,...
6863,Zimbabwe,1994,3263.934978
6864,Zimbabwe,1993,3033.504852
6865,Zimbabwe,1992,3054.889178
6866,Zimbabwe,1991,3426.598094


In [12]:
# Build a new frame without the 'Value Footnotes' column; internet_df itself
# is left unchanged because drop() returns a copy by default.
internet_no_footnotes = internet_df.drop('Value Footnotes', axis='columns')
internet_no_footnotes

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2014,6.390000
1,Afghanistan,2013,5.900000
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.000000
4,Afghanistan,2010,4.000000
...,...,...,...
4490,Zimbabwe,1997,0.033080
4491,Zimbabwe,1996,0.016790
4492,Zimbabwe,1995,0.007684
4493,Zimbabwe,1994,0.001739


**Change the Column Names**

&emsp;&emsp;GDP

In [13]:
# Give the GDP frame shorter, descriptive column names. 'Country' and 'Year'
# are the key columns used when the two frames are merged.
gdp_no_footnotes = gdp_no_footnotes.rename(
    {
        'Country or Area': 'Country',
        'Value': 'GDP_Per_Capita',
    },
    axis='columns',
)
gdp_no_footnotes

Unnamed: 0,Country,Year,GDP_Per_Capita
0,Afghanistan,2019,2065.036398
1,Afghanistan,2018,2033.779002
2,Afghanistan,2017,2058.383832
3,Afghanistan,2016,2057.062164
4,Afghanistan,2015,2068.265904
...,...,...,...
6863,Zimbabwe,1994,3263.934978
6864,Zimbabwe,1993,3033.504852
6865,Zimbabwe,1992,3054.889178
6866,Zimbabwe,1991,3426.598094


&emsp;&emsp;Internet Use

In [14]:
# Give the internet-use frame shorter, descriptive column names. 'Country'
# and 'Year' are the key columns used when the two frames are merged.
internet_no_footnotes = internet_no_footnotes.rename(
    {
        'Country or Area': 'Country',
        'Value': 'Internet_Users_Pct',
    },
    axis='columns',
)
internet_no_footnotes

Unnamed: 0,Country,Year,Internet_Users_Pct
0,Afghanistan,2014,6.390000
1,Afghanistan,2013,5.900000
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.000000
4,Afghanistan,2010,4.000000
...,...,...,...
4490,Zimbabwe,1997,0.033080
4491,Zimbabwe,1996,0.016790
4492,Zimbabwe,1995,0.007684
4493,Zimbabwe,1994,0.001739


**Merge the Dataframes**

In [15]:
# Outer join on (Country, Year): every Country/Year pair present in either
# frame is kept, and the measure missing from the other frame is left as NaN.
gdp_and_internet_use = gdp_no_footnotes.merge(
    internet_no_footnotes,
    how='outer',
    on=['Country', 'Year'],
)
gdp_and_internet_use

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2019,2065.036398,
1,Afghanistan,2018,2033.779002,
2,Afghanistan,2017,2058.383832,
3,Afghanistan,2016,2057.062164,
4,Afghanistan,2015,2068.265904,
...,...,...,...,...
7900,Yemen,1999,,0.056629
7901,Yemen,1998,,0.023323
7902,Yemen,1997,,0.015025
7903,Yemen,1996,,0.000621
