#  UN Data Analysis Project

Analysis on UN Data comparing gdp per capita and internet usage.

### Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Formatting

In [2]:
%matplotlib inline

### Load Data

In [3]:
gdp_df = pd.read_csv('../data/gdp_percapita.csv', nrows=6729)

In [4]:
internet_df = pd.read_csv('../data/internet_use.csv', nrows=4495)

### Answering Questions

##### 7. Look at the shape of each dataframe - how many rows, how many columns.

In [5]:
gdp_df.shape

# 6731 rows, 4 columns

(6729, 4)

In [6]:
internet_df.shape

# 4497 rows, 4 columns

(4495, 4)

##### 8. Take a look at the data types for the columns in each table.

In [7]:
type(gdp_df)

pandas.core.frame.DataFrame

In [8]:
type(internet_df)

pandas.core.frame.DataFrame

##### 9. Take a look at the last 10 rows of each dataset in turn.

In [9]:
print(gdp_df.tail(10))

     Country or Area  Year        Value  Value Footnotes
6719        Zimbabwe  1999  3054.064189              NaN
6720        Zimbabwe  1998  3099.076182              NaN
6721        Zimbabwe  1997  3036.422224              NaN
6722        Zimbabwe  1996  2985.856605              NaN
6723        Zimbabwe  1995  2736.486436              NaN
6724        Zimbabwe  1994  2768.309953              NaN
6725        Zimbabwe  1993  2572.870395              NaN
6726        Zimbabwe  1992  2591.007534              NaN
6727        Zimbabwe  1991  2906.272849              NaN
6728        Zimbabwe  1990  2819.549467              NaN


In [10]:
print(internet_df.tail(10))

     Country or Area  Year     Value  Value Footnotes
4485        Zimbabwe  2002  3.994356              NaN
4486        Zimbabwe  2001  0.799846              NaN
4487        Zimbabwe  2000  0.401434              NaN
4488        Zimbabwe  1999  0.161676              NaN
4489        Zimbabwe  1998  0.081648              NaN
4490        Zimbabwe  1997  0.033080              NaN
4491        Zimbabwe  1996  0.016790              NaN
4492        Zimbabwe  1995  0.007684              NaN
4493        Zimbabwe  1994  0.001739              NaN
4494        Zimbabwe  1990  0.000000              NaN


##### 10. Drop the 'value footnotes' data (column) from both datasets. Check that this worked as expected.

In [11]:
del gdp_df['Value Footnotes']

In [12]:
gdp_df.columns

Index(['Country or Area', 'Year', 'Value'], dtype='object')

In [13]:
del internet_df['Value Footnotes']

In [14]:
internet_df.columns

Index(['Country or Area', 'Year', 'Value'], dtype='object')

##### 11. Change the columns for the GDP Per Capita data frame to ‘Country’, ‘Year’, and ‘GDP_Per_Capita’.

In [15]:
gdp_df.columns = ['Country', 'Year', 'GDP_Per_Capita']

In [16]:
gdp_df.columns

Index(['Country', 'Year', 'GDP_Per_Capita'], dtype='object')

##### 12. Change the columns for the Internet Users data frame to ‘Country’, ‘Year’, and ‘Internet_Users_Pct’.

In [17]:
internet_df.columns = ['Country', 'Year', 'Internet_Users_Pct']

In [18]:
internet_df.columns

Index(['Country', 'Year', 'Internet_Users_Pct'], dtype='object')

##### 13. Merge the two DataFrames to one. Merge all rows from each of the two DataFrames. Call the new DataFrame gdp_and_internet_use.

In [19]:
print(gdp_df.dtypes)
print(internet_df.dtypes)

Country            object
Year                int64
GDP_Per_Capita    float64
dtype: object
Country                object
Year                    int64
Internet_Users_Pct    float64
dtype: object


In [20]:
gdp_and_internet_use = pd.merge(gdp_df, internet_df,
                               on = ['Country', 'Year'], how = 'outer')

##### 14. Look at the first five rows of your new data frame to confirm it merged correctly.

In [21]:
gdp_and_internet_use.head()

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2018,1734.723214,
1,Afghanistan,2017,1758.465636,
2,Afghanistan,2016,1757.02349,
3,Afghanistan,2015,1766.593077,
4,Afghanistan,2014,1795.735834,6.39


##### 15. Look at the last five rows to make sure the data is clean and as expected.

In [22]:
gdp_and_internet_use.tail()

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
7700,Virgin Islands (U.S.),1997,,6.948369
7701,Virgin Islands (U.S.),1996,,4.647186
7702,Virgin Islands (U.S.),1995,,2.801958
7703,Virgin Islands (U.S.),1994,,0.940645
7704,Virgin Islands (U.S.),1990,,0.0


In [26]:
gdp_and_internet_use.shape

(7705, 4)

##### 16. Subset the combined data frame to keep only the data for 2004, 2009, and 2014. Check that this happened correctly.

In [23]:
gdp_and_internet_use_subset = gdp_and_internet_use[gdp_and_internet_use["Year"].isin([2004,2009,2014])]

In [24]:
print(gdp_and_internet_use_subset)

                    Country  Year  GDP_Per_Capita  Internet_Users_Pct
4               Afghanistan  2014     1795.735834            6.390000
9               Afghanistan  2009     1502.354073            3.550000
14              Afghanistan  2004     1025.208245            0.105809
21                  Albania  2014    10700.993216           60.100000
26                  Albania  2009     9525.377979           41.200000
...                     ...   ...             ...                 ...
7668               Viet Nam  2009             NaN           26.550000
7673               Viet Nam  2004             NaN            7.642409
7683  Virgin Islands (U.S.)  2014             NaN           50.070000
7688  Virgin Islands (U.S.)  2009             NaN           27.396510
7693  Virgin Islands (U.S.)  2004             NaN           27.377009

[835 rows x 4 columns]


In [27]:
gdp_and_internet_use_subset.head()

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
4,Afghanistan,2014,1795.735834,6.39
9,Afghanistan,2009,1502.354073,3.55
14,Afghanistan,2004,1025.208245,0.105809
21,Albania,2014,10700.993216,60.1
26,Albania,2009,9525.377979,41.2


In [29]:
gdp_and_internet_use_subset = gdp_and_internet_use_subset.reset_index(drop = True)
gdp_and_internet_use_subset.head()

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2014,1795.735834,6.39
1,Afghanistan,2009,1502.354073,3.55
2,Afghanistan,2004,1025.208245,0.105809
3,Albania,2014,10700.993216,60.1
4,Albania,2009,9525.377979,41.2


##### 17. Create three new data frames, one for 2004, one for 2009, and one for 2014. Give them meaningful names that aren't too long.