# An Exploration of UN Data (Gross Domestic Product and Internet Usage)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 4. Using the pandas `read_csv()` method, read the GDP dataset into your notebook as a DataFrame called `gdp_df`. Take a look at the first few and last few rows to familiarize yourself with what is contained in this dataset.

In [None]:
# Location of the GDP-per-capita CSV (absolute path on the author's machine).
gdp_csv_path = r"C:\Users\datan\OneDrive\Desktop\NSS\DDA14\Python\projects\gdp-and-internet-usage-data-nik\data\gdp_percapita.csv"
gdp_df = pd.read_csv(gdp_csv_path)

# Bare expression so the notebook renders the loaded frame.
gdp_df

# 5. How many rows and columns are in `gdp_df`? What are the data types of each column?

In [None]:
# (number of rows, number of columns) of the GDP frame.
gdp_df.shape

In [None]:
# Data type pandas assigned to each column of the GDP frame.
gdp_df.dtypes

# 6. Drop the `Value Footnotes` column and rename the remaining three to 'Country', 'Year', and 'GDP_Per_Capita'.

In [None]:
# Remove the footnotes column and give the remaining columns their short names.
# errors='ignore' makes this cell safe to re-run: on a second run the
# 'Value Footnotes' column is already gone, and rename() silently skips keys
# that no longer exist. 'Year' already has its final name, so it is not mapped.
gdp_df = (
    gdp_df
    .drop(columns=['Value Footnotes'], errors='ignore')
    .rename(columns={'Country or Area': 'Country', 'Value': 'GDP_Per_Capita'})
)

gdp_df

# 7. How many countries have data for all years? Which countries are missing many years of data? Look at the number of observations per year. What do you notice?

Find the unique years in the dataset, then group by country and count how many distinct years of data each country has.

In [None]:
# Number of distinct years each country appears in.
years_by_country = gdp_df.groupby('Country')['Year']
in_tha_country = years_by_country.nunique()

# Every distinct year value present anywhere in the dataset.
year_unique = gdp_df['Year'].unique()

in_tha_country

Filter countries having data for all years

In [None]:
# A country has data for all years when its count of distinct years equals the
# number of distinct years in the whole dataset. The per-country counts must be
# compared to that single number, not to the array of year values itself.
# nunique() is used on both sides so missing values are treated the same way.
total_years = gdp_df['Year'].nunique()
countries_and_years = in_tha_country[in_tha_country == total_years]

countries_and_years

# 8. In this question, you're going to create some plots to show the distribution of GDP per capita for the year 2020. Create a histogram, a density plot, a boxplot, and a violin plot.

In [None]:
# Keep only the rows recorded for the year 2020.
is_2020 = gdp_df['Year'] == 2020
gdp_2020 = gdp_df.loc[is_2020]

gdp_2020

Histogram

In [None]:
# Histogram of 2020 GDP per capita, split into 10 bins.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(gdp_2020['GDP_Per_Capita'], bins=10, ax=ax)
ax.set_title('Histogram of GDP per Capita for the Year 2020')
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Two-variable histogram: Year on the x axis, GDP per capita on the y axis.
sns.histplot(data=gdp_2020, x='Year', y='GDP_Per_Capita')
plt.show()

Density Plot

In [None]:
# Density (KDE) plot of 2020 GDP per capita.
# `fill=True` replaces the deprecated `shade=True` keyword, which seaborn has
# scheduled for removal; the filled-area appearance is the same.
plt.figure(figsize=(10, 6))
sns.kdeplot(gdp_2020['GDP_Per_Capita'], fill=True)
plt.title('Density Plot of GDP per Capita for the Year 2020')
plt.xlabel('GDP per Capita')
plt.ylabel('Density')
plt.show()

Boxplot

In [None]:
# Box plot of 2020 GDP per capita, with Year as the grouping axis.
sns.boxplot(data=gdp_2020, x='Year', y='GDP_Per_Capita')
plt.show()

Violin Plot

In [None]:
# Horizontal violin plot of 2020 GDP per capita.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x=gdp_2020['GDP_Per_Capita'], ax=ax)
ax.set_title('Violin Plot of GDP per Capita for the Year 2020')
ax.set_xlabel('GDP per Capita')
plt.show()

In [None]:
# Vertical violin plot: Year on the x axis, GDP per capita on the y axis.
sns.violinplot(data=gdp_2020, x='Year', y='GDP_Per_Capita')
plt.show()

# 9. What was the median GDP per capita value in 2020?

In [None]:
# Median of the 2020 GDP-per-capita values.
gdp_values_2020 = gdp_2020['GDP_Per_Capita']
median_per_capita_2020 = gdp_values_2020.median()

median_per_capita_2020

# 10. Create some visualizations to compare GDP per capita values for the years 1990, 2000, 2010, and 2020. Start by subsetting your data to just these 4 years into a new DataFrame named `gdp_decades`. Using this, create the following 4 plots: a boxplot, a barplot, a scatterplot, and a scatterplot with a trendline overlaid.


In [None]:
# Subset to the four decade-marker years used for the comparison plots.
decade_years = [1990, 2000, 2010, 2020]
gdp_decades = gdp_df.loc[gdp_df['Year'].isin(decade_years)]

Boxplot

In [None]:
# Draw one box per year so the distributions can actually be compared;
# passing the whole column at once pools every year into a single box.
years_in_plot = sorted(gdp_decades['Year'].unique())
gdp_by_year = [
    # Missing values are dropped because matplotlib cannot compute box
    # statistics for a sample containing NaN.
    gdp_decades.loc[gdp_decades['Year'] == year, 'GDP_Per_Capita'].dropna()
    for year in years_in_plot
]

plt.figure(figsize=(10, 6))
plt.boxplot(gdp_by_year)
plt.title('GDP per capita values')
plt.xlabel('Year')
plt.ylabel('GDP_Per_Capita')
# Boxes are placed at positions 1..N; label each position with its year.
plt.xticks(range(1, len(years_in_plot) + 1), years_in_plot);

In [None]:
# Box plot of GDP per capita for each of the selected years.
# Both axes must come from gdp_decades; taking y from gdp_2020 leaves every
# year other than 2020 without values.
sns.boxplot(x=gdp_decades['Year'], y=gdp_decades['GDP_Per_Capita'])
plt.show()

Barplot

In [None]:
# Plot one bar per year. Passing the raw rows to plt.bar draws one bar per
# country at the same x position, so the bars overlap and only the tallest
# is visible. Aggregating to the median gives a single, readable bar per year.
median_by_year = gdp_decades.groupby('Year')['GDP_Per_Capita'].median()

plt.figure(figsize=(10, 6))
# Years are converted to strings so the bars are evenly spaced categories
# rather than thin bars ten units apart on a numeric axis.
plt.bar(median_by_year.index.astype(str), median_by_year.values)
plt.title('Median GDP per capita by year')
plt.xlabel('Year')
plt.ylabel('GDP');

Scatterplot

In [None]:
# Scatter of every GDP-per-capita observation against its year.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(gdp_decades['Year'], gdp_decades['GDP_Per_Capita'])
ax.set_title('GDP per capita values')
ax.set_xlabel('Year')
ax.set_ylabel('GDP');

Scatterplot with trendline

In [None]:
# Scatter with a fitted regression line; ci=None turns off the confidence band.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=gdp_decades, x='Year', y='GDP_Per_Capita', ci=None, ax=ax)
ax.set_title('GDP per capita values')
ax.set_xlabel('Year')
ax.set_ylabel('GDP');

# 11. Which country was the first to have a GDP per capita greater than $100,000?

Filter rows with GDP per capita greater than $100,000

In [None]:
# Rows whose GDP per capita exceeds 100,000.
above_100k = gdp_df['GDP_Per_Capita'] > 100_000
one_hundred_k = gdp_df[above_100k]

Sort by year in ascending order

In [None]:
# Earliest years first, so the first country to pass the threshold is on top.
get_sorted = one_hundred_k.sort_values('Year', ascending=True)

get_sorted

# 12. Which country had the highest GDP per capita in 2020? Create a plot showing how this country's GDP per capita has changed over the timespan of the dataset.

In [None]:
# Row of the 2020 subset holding the largest GDP per capita.
top_row_label = gdp_2020['GDP_Per_Capita'].idxmax()
big_gdp = gdp_2020.loc[top_row_label]

big_gdp

In [None]:
# Full history (all years) for the country with the highest 2020 GDP per capita.
country_name = big_gdp['Country']
matches_country = gdp_df['Country'] == country_name
country_data = gdp_df[matches_country]

In [None]:
# Line chart of the selected country's GDP per capita across all its years.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(country_data['Year'], country_data['GDP_Per_Capita'], marker='o')
ax.set_title(f'GDP per Capita Over Time for {country_name}')
ax.set_xlabel('Year')
ax.set_ylabel('GDP per Capita')
ax.grid(True);

# 13. Which country had the lowest GDP per capita in 2020? Create a plot showing how this country's GDP per capita has changed over the timespan of the dataset.

In [None]:
# Row of the 2020 subset holding the smallest GDP per capita.
bottom_row_label = gdp_2020['GDP_Per_Capita'].idxmin()
little_gdp = gdp_2020.loc[bottom_row_label]

In [None]:
# Full history (all years) for the country with the lowest 2020 GDP per capita.
# Note: this rebinds country_name / country_data from the previous question.
country_name = little_gdp['Country']
matches_country = gdp_df['Country'] == country_name
country_data = gdp_df[matches_country]

In [None]:
# Line chart of the selected country's GDP per capita across all its years.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(country_data['Year'], country_data['GDP_Per_Capita'], marker='o')
ax.set_title(f'GDP per Capita Over Time for {country_name}')
ax.set_xlabel('Year')
ax.set_ylabel('GDP per Capita')
ax.grid(True);

#  **Bonus question:** Is it true in general that countries had a higher GDP per capita in 2020 than in 1990? Which countries had a lower GDP per capita in 2020 than in 1990?

In [None]:
# TODO: answer the bonus question by comparing each country's 1990 and 2020 GDP per capita.

# 14. Read in the internet use dataset into a DataFrame named `internet_df`.

In [None]:
# Location of the internet-usage CSV (absolute path on the author's machine).
internet_csv_path = r"C:\Users\datan\OneDrive\Desktop\NSS\DDA14\Python\projects\gdp-and-internet-usage-data-nik\data\internet_use.csv"
internet_df = pd.read_csv(internet_csv_path)

# Preview the first ten rows.
internet_df.head(10)

In [None]:
# Data type pandas assigned to each column of the internet-usage frame.
internet_df.dtypes

# 15. Drop the `Value Footnotes` column and rename the remaining three to 'Country', 'Year', and 'Internet_Users_Pct'.

In [None]:
# Remove the footnotes column and give the remaining columns their short names.
# errors='ignore' makes this cell safe to re-run: on a second run the
# 'Value Footnotes' column is already gone, and rename() silently skips keys
# that no longer exist. 'Year' already has its final name, so it is not mapped.
internet_df = (
    internet_df
    .drop(columns=['Value Footnotes'], errors='ignore')
    .rename(columns={'Country or Area': 'Country', 'Value': 'Internet_Users_Pct'})
)

internet_df

# 16. Look at the number of observations in this dataset per year. What do you notice?

In [None]:
# Number of rows recorded for each year.
rows_grouped_by_year = internet_df.groupby('Year')
year_observation = rows_grouped_by_year.size()

year_observation

# 17. What is the first year to have a non-zero internet users percentage value?

In [None]:
# Convert the percentage column to a numeric dtype so it can be compared with 0.
# NOTE(review): pd.to_numeric raises on values it cannot parse — confirm the column holds no text.
internet_df['Internet_Users_Pct'] = pd.to_numeric(internet_df['Internet_Users_Pct']) # overwrites the column in place

In [None]:
# Earliest row with a non-zero internet-user percentage.
# The column is named 'Year' (capital Y) after the rename step; sorting by
# 'year' raises a KeyError.
nonzero_usage = internet_df[internet_df['Internet_Users_Pct'] > 0]
no_zero = nonzero_usage.sort_values('Year').iloc[0]

no_zero

# 18. How does the distribution of internet users percent differ for 2000 and 2014?

In [None]:
# Split out the two years whose internet-usage distributions will be compared.
is_2000 = internet_df['Year'] == 2000
is_2014 = internet_df['Year'] == 2014
i2000 = internet_df[is_2000]
i2014 = internet_df[is_2014]

In [None]:
# Display the rows for the year 2000.
i2000

# 19. For how many countries was the percentage of internet users below 5% in 2014?

# 20. Merge the two DataFrames to one. Do this in a way that keeps **all rows** from each of the two DataFrames. Call the new DataFrame `gdp_and_internet_use`. Look at the first and last few rows to confirm that it merged correctly.