In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the internet-usage CSV with the Python parser engine.
# Only the first 4496 data rows are read (presumably to skip trailing
# non-data rows at the bottom of the file -- TODO confirm).
internet_df = pd.read_csv(
    '../data/internet_use.csv.csv',
    engine='python',
    nrows=4496,
)

In [None]:
# Preview the first five rows of the raw internet-usage data.
internet_df.head(n=5)

In [None]:
# Preview the last five rows to check how the file ends.
internet_df.tail(n=5)

In [None]:
# (rows, columns) of the loaded internet-usage data.
internet_df.shape

In [None]:
# Column dtypes as inferred by read_csv.
internet_df.dtypes

#### Drop the `Value Footnotes` column and rename the remaining three to 'Country', 'Year', and 'Internet_Users_Pct'.

In [None]:
# Drop the footnotes column, rename the remaining columns, and round the
# percentage to two decimals.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because 'Value Footnotes' is already gone.
internet_df = (
    internet_df
    .drop(columns=['Value Footnotes'], errors='ignore')
    .rename(columns={'Country or Area': 'Country', 'Value': 'Internet_Users_Pct'})
    .round({'Internet_Users_Pct': 2})
)
internet_df

#### Look at the number of observations in this dataset per year. What do you notice?

There is a significant drop-off in the number of observations in 1990.

In [None]:
# Number of observations per year; show the five most frequent years.
internet_df['Year'].value_counts().head(n=5)

In [None]:
# Observation counts per year, limited to the 43 most frequent years.
years_counted = internet_df['Year'].value_counts().head(n=43)

In [None]:
# Convert the value_counts Series into a one-column DataFrame.
years_counted = years_counted.to_frame()

In [None]:
# Move the year labels out of the index into a regular column.
years_counted = years_counted.reset_index(drop=False)

In [None]:
# Name the two columns positionally: first the year, then its count.
# A label-based rename ({'index': 'Year', 'Year': 'Counts'}) only works on
# pandas < 2.0; from 2.0 on, value_counts() yields columns 'Year'/'count',
# so the rename would turn the year column into 'Counts' and leave no
# 'Year' column for the sort that follows.
years_counted = years_counted.set_axis(['Year', 'Counts'], axis=1)

In [None]:
# Show the per-year counts from the most recent year backwards.
years_counted.sort_values('Year', ascending=False)

#### What is the first year to have a non-zero internet users percentage value?

1990

In [None]:
# Keep only the observations with a strictly positive internet users percentage.
has_users = internet_df['Internet_Users_Pct'].gt(0)
not_zero = internet_df[has_users]

In [None]:
# Earliest non-zero observations first; show the first 18 rows.
not_zero.sort_values(by='Year').head(n=18)

#### How does the distribution of internet users percent differ for 2000 and 2014?

In [None]:
# Years to compare.
my_list = [2000, 2014]

# Select the rows for those years via a temporary Year index, then restore
# Year as a regular column (rows come back grouped in the order of my_list).
multi_year = (
    internet_df
    .set_index('Year')
    .loc[my_list]
    .reset_index()
)
multi_year

In [None]:
# Compare the distribution of the internet users percentage between the two
# years: the percentage goes on the x-axis and the year is used as hue.
# (Putting Year on x and the percentage on y draws a 2-D histogram with only
# two x positions, which does not show the distributions.)
# Year is cast to str so seaborn treats it as a categorical hue.
ax = sns.histplot(
    data=multi_year.assign(Year=multi_year['Year'].astype(str)),
    x='Internet_Users_Pct',
    hue='Year',
    bins=20,
)
ax.set_title('Distribution of internet users percentage, 2000 vs 2014')
plt.show()

In [None]:
# Observations for the year 2000 only.
users_2000 = internet_df[internet_df['Year'].eq(2000)]

In [None]:
# Observations for the year 2014 only.
users_2014 = internet_df[internet_df['Year'].eq(2014)]

In [None]:
# Overlay the 2000 and 2014 distributions of the internet users percentage
# on one set of axes. The percentage is the x variable (a univariate
# histogram per year), both histograms share the same bins so they are
# directly comparable, and the legend is drawn so the labels are visible.
fig, ax = plt.subplots()
shared_hist_kwargs = dict(x='Internet_Users_Pct', bins=20, binrange=(0, 100), alpha=0.6, ax=ax)
sns.histplot(data=users_2000, color='skyblue', label='2000', **shared_hist_kwargs)
sns.histplot(data=users_2014, color='red', label='2014', **shared_hist_kwargs)
ax.set_title('Distribution of internet users percentage, 2000 vs 2014')
ax.legend(title='Year')
plt.show()

#### For how many countries was the percentage of internet users below 5% in 2014?

In [None]:
# Shape of the 2014 subset with fewer than 5% internet users;
# the first element is the number of matching rows.
below_five_pct = users_2014['Internet_Users_Pct'].lt(5)
users_2014[below_five_pct].shape

#### Merge the two DataFrames to one. Do this in a way that keeps **all rows** from each of the two DataFrames. Call the new DataFrame `gdp_and_internet_use`. Look at the first and last few rows to confirm that it merged correctly.

In [None]:
# Re-display the cleaned internet-usage frame before merging.
internet_df

In [None]:
# Load the GDP-per-capita CSV and preview its first two rows.
gdp_df = pd.read_csv(
    '../data/gdp_percapita.csv.csv',
)
gdp_df.head(n=2)

In [None]:
# Merge internet usage with GDP on the country name, keeping ALL rows from
# both frames (how='outer'); an inner join silently drops every country that
# appears in only one of the two datasets.
# NOTE(review): the join key is the country only, so each internet-use year
# of a country is paired with each GDP year of that country; the shared
# 'Year' column is suffixed by pandas into Year_x (internet) / Year_y (GDP).
gdp_and_internet_use = pd.merge(
    internet_df,
    gdp_df,
    left_on=['Country'],
    right_on=['Country or Area'],
    how='outer',
)
# 'Value' is the value column that comes from the GDP frame.
gdp_and_internet_use = gdp_and_internet_use.round({'Value': 2})
gdp_and_internet_use

#### Find the three countries with the highest internet users percentage in 2014. Use a seaborn FacetGrid (https://seaborn.pydata.org/generated/seaborn.FacetGrid.html) to compare how the GDP per capita has changed over time for these three countries. What do you notice?

In [None]:
# Rows whose internet-use year (Year_x) is 2014, reduced to the first row
# per country and re-indexed from zero.
internet_year_is_2014 = gdp_and_internet_use['Year_x'] == 2014
gdp_and_internet_use_2014 = (
    gdp_and_internet_use
    .loc[internet_year_is_2014]
    .drop_duplicates(['Country'])
    .reset_index(drop=True)
)
gdp_and_internet_use_2014

In [None]:
# Keep only the columns needed to rank countries by internet usage.
ranking_columns = ['Country', 'Internet_Users_Pct', 'Year_x']
gdp_and_internet_use_2014 = gdp_and_internet_use_2014[ranking_columns]
gdp_and_internet_use_2014

##### Countries with the highest internet users percentage in 2014

In [None]:
# Top three countries by internet users percentage (sort, then take three).
gdp_and_internet_use_2014.sort_values(by='Internet_Users_Pct', ascending=False).head(n=3)

In [None]:
# Same ranking obtained with nlargest.
gdp_and_internet_use_2014.nlargest(n=3, columns='Internet_Users_Pct')

In [None]:
# Countries to plot (hard-coded; presumably the top three found by the
# 2014 ranking -- verify against the ranking output).
top_countries = ['Iceland', 'Bermuda', 'Norway']

# GDP-per-capita time series for those countries. `country_list` is the
# DataFrame consumed by the plot; it is no longer used first as a plain list.
country_list = (
    gdp_and_internet_use
    .set_index('Country')
    .loc[top_countries]
    .reset_index()
    [['Country or Area', 'Year_y', 'Value']]
    # De-duplicate on the (country, GDP year) key. De-duplicating on 'Value'
    # alone would drop a legitimate observation whenever two different
    # country/year pairs happen to share the same rounded GDP value.
    .drop_duplicates(['Country or Area', 'Year_y'])
    .rename(columns={'Year_y': 'Year', 'Value': 'GDP_Per_Capita'})
)
country_list.head()

##### GDP per capita over time for the top 3 countries by internet users percentage

In [None]:
# One panel per country, each showing GDP per capita against year.
g = sns.FacetGrid(
    data=country_list,
    col="Country or Area",
    height=4,
    margin_titles=False,
)
g.map(plt.plot, 'Year', 'GDP_Per_Capita')
plt.show()

Around 2007, each country had a significant jump in its GDP per capita (`GDP_Per_Capita`).

In [None]:
# Display the current 2014 subset.
gdp_and_internet_use_2014

####  Subset `gdp_and_internet_use` to just the year 2014. Save this as a new dataframe named `gdp_and_internet_use_2014`.

In [None]:
# Subset to 2014 for BOTH measures.
# The merged frame carries two year columns: Year_x (internet usage) and
# Year_y (GDP). Because the merge joins on country only, every internet-use
# year is paired with every GDP year, so both columns must equal 2014.
# Filtering on Year_y alone (which is what two consecutive assignments did,
# the second overwriting the first) leaves Internet_Users_Pct from whichever
# internet-use year happens to come first for each country.
is_2014 = (gdp_and_internet_use['Year_x'] == 2014) & (gdp_and_internet_use['Year_y'] == 2014)
gdp_and_internet_use_2014 = (
    gdp_and_internet_use
    .loc[is_2014]
    .drop_duplicates(['Country'])
    .rename(columns={'Value': 'GDP_Per_Capita'})
    .reset_index(drop=True)
)
gdp_and_internet_use_2014

####  Create a plot which compares Internet Users Percentage and GDP per Capita for the year 2014. What do you notice from this plot? If you see any unusual points, investigate them.

In [None]:
# Scatter plot with a fitted regression line: internet users percentage (x)
# against GDP per capita (y) for the 2014 subset.
sns.regplot(
    data=gdp_and_internet_use_2014,
    x='Internet_Users_Pct',
    y='GDP_Per_Capita',
)
plt.show()

In [None]:
# Five rows with the highest GDP per capita.
gdp_and_internet_use_2014.sort_values(by='GDP_Per_Capita', ascending=False).head(n=5)

In [None]:
# Five rows with the highest internet users percentage.
gdp_and_internet_use_2014.sort_values(by='Internet_Users_Pct', ascending=False).head(n=5)

In [None]:
# Top 75 rows by internet users percentage, re-indexed so the index reads
# as a zero-based rank.
(
    gdp_and_internet_use_2014
    .sort_values(by='Internet_Users_Pct', ascending=False)
    .reset_index(drop=True)
    .head(n=75)
)

Luxembourg has the highest GDP per capita and is in the top 5 for internet users percentage. San Marino hovers right around the median for both `GDP_Per_Capita` and `Internet_Users_Pct`.

#### **Stretch Question:** Use the `qcut` function from pandas (https://pandas.pydata.org/docs/reference/api/pandas.qcut.html) to divide countries in `gdp_per_capita_2014` into three groups based on their GDP per capita values. Label these groups as "Low", "Medium", and "High". Put these labels in a new column, named "GDP_group".

In [None]:
# Candidate GDP-per-capita bin edges (six edges).
ranges = [0, 10_000, 20_000, 40_000, 60_000, 140_000]
# Labels for the three GDP groups, from lowest to highest.
GDP_group = ['Low', 'Medium', 'High']

In [None]:
# Split the countries into three equal-sized quantile groups (terciles) by
# GDP per capita, label them Low / Medium / High, and store the labels in a
# new 'GDP_group' column. The previous call used q=6, applied no labels and
# discarded the result.
# assign() returns a new frame, so re-running this cell is safe.
gdp_and_internet_use_2014 = gdp_and_internet_use_2014.assign(
    GDP_group=pd.qcut(
        gdp_and_internet_use_2014['GDP_Per_Capita'],
        q=3,
        labels=['Low', 'Medium', 'High'],
    )
)
gdp_and_internet_use_2014[['Country', 'GDP_Per_Capita', 'GDP_group']].head()