In [138]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.font_manager
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
import plotly.express as px
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.cm as cm
import seaborn as sns

In [None]:
# Mount Google Drive into the Colab filesystem so the CSV files under
# /content/drive can be read with ordinary file paths.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Load Colab's data_table IPython extension (Colab-only; it is expected to
# render displayed DataFrames as interactive tables).
%load_ext google.colab.data_table

In [141]:
# Is there a relation between a country's Gross Domestic Product (GDP) and its income inequality?
# Be aware of the difference between correlation and causation here. A might cause B. B might cause A. But both A and B could be caused by an unknown C as well.
# One way to express income inequality is to look at a country's "Gini coefficient" (also known as "Gini index"). You can find a dataset of Gini Coefficients here(opens in a new tab).
# You can find a dataset with historical GDP data here(opens in a new tab).
# To be able to answer this question you would want to calculate the "correlation coefficient" of the GDP and the Gini coefficient. But before you can do that you may need to resample the data so a correlation coefficient can be calculated.


In [None]:
# Read the inequality dataset (first dataframe) from the mounted Drive.
# The file is decoded as latin-1.
income_csv_path = "/content/drive/MyDrive/RawData/inequality.csv"
income = pd.read_csv(income_csv_path, encoding='latin-1')
income

In [None]:
# Keep every row, and only the contiguous run of columns from "Country" up to
# and including the before-tax Gini coefficient (label slices are inclusive).
income_column_range = slice("Country", "Gini coefficient (before tax) (World Inequality Database)")
income_filtered = income.loc[0:, income_column_range]
income_filtered

In [None]:
# Read the Maddison GDP-per-capita dataset (second dataframe) from the mounted Drive.
# The file is decoded as latin-1.
gdp_csv_path = "/content/drive/MyDrive/RawData/gdp-per-capita-maddison.csv"
gdp = pd.read_csv(gdp_csv_path, encoding='latin-1')
gdp

In [None]:
# Keep every row, and only the contiguous run of columns from "Entity" up to
# and including "GDP per capita" (label slices are inclusive).
gdp_column_range = slice("Entity", "GDP per capita")
gdp_filtered = gdp.loc[0:, gdp_column_range]
gdp_filtered

In [None]:
# Give the GDP frame the same key column name ('Country') as the income frame
# so both can be joined on identical column names.
gdp_column_renames = {'Entity': 'Country'}
gdp_filtered = gdp_filtered.rename(columns=gdp_column_renames)
gdp_filtered

In [None]:
# Combine the two frames on their shared key columns. This is an inner join:
# only (Country, Year) pairs that occur in BOTH frames are kept.
join_keys = ['Country', 'Year']
merge_result = gdp_filtered.merge(income_filtered, on=join_keys, how='inner')
merge_result

In [None]:
# Drop every row that has a missing value in any column.
result_filtered = merge_result.dropna(axis=0, how='any')
result_filtered

In [None]:
# Shorten the long Gini column name.
# Rebind the name instead of using inplace=True: result_filtered was derived
# from another frame, and mutating such a frame in place can raise
# SettingWithCopyWarning on some pandas versions. rename() ignores labels
# that are absent, so re-running this cell is harmless.
result_filtered = result_filtered.rename(
    columns={'Gini coefficient (before tax) (World Inequality Database)': 'Gini coefficient'}
)
result_filtered

In [None]:
# Set the 'Country' and 'Year' columns as the index of the dataframe.
# Guarded so that re-running this cell is a no-op instead of a KeyError:
# once the columns have become index levels, set_index can no longer find them.
if not {'Country', 'Year'}.issubset(result_filtered.index.names):
    result_filtered = result_filtered.set_index(['Country', 'Year'])
result_filtered

In [151]:
# Start from an empty frame that already has the result columns.
correlation_columns = ['Country', 'Year', 'Pearson Correlation']
correlation_result = pd.DataFrame(columns=correlation_columns)

In [152]:
# Split the data into one group per country ('Country' is resolved by name).
grouped = result_filtered.groupby(by='Country')

In [None]:
# For each country, compute ONE Pearson correlation between GDP per capita and
# the Gini coefficient across all of that country's years, and repeat that
# value on every (Country, Year) row of the country.
#
# The per-country frames are collected in a list and concatenated once:
# - re-running this cell rebuilds correlation_result from scratch instead of
#   appending a second copy of every row to the existing frame;
# - it avoids the quadratic cost of calling pd.concat inside the loop;
# - numeric columns keep their dtypes because nothing is concatenated onto an
#   empty object-dtype frame.
correlation_frames = []
for country, group in grouped:
    correlation = group['GDP per capita'].corr(group['Gini coefficient'], method='pearson')
    years = group.index.get_level_values('Year')
    correlation_frames.append(
        pd.DataFrame({'Country': country, 'Year': years, 'Pearson Correlation': correlation})
    )

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there are no groups at all.
correlation_result = (
    pd.concat(correlation_frames, ignore_index=True)
    if correlation_frames
    else pd.DataFrame(columns=['Country', 'Year', 'Pearson Correlation'])
)

In [154]:
# Replace the index with a fresh 0..n-1 range, discarding the old index values.
correlation_result = correlation_result.reset_index(drop=True)

In [None]:
# Display the resulting DataFrame: one row per (Country, Year), where each
# country's Pearson correlation is repeated on all of that country's rows.
correlation_result

In [None]:
# Determine which year appears on the most rows. mode() returns a Series
# (several values when there is a tie); the first entry is used.
year_modes = correlation_result['Year'].mode()
most_common_year = year_modes.iloc[0]
most_common_year

In [157]:
# Keep only the rows of the most common year, restricted to the key columns
# and the Pearson correlation.
is_most_common_year = correlation_result['Year'] == most_common_year
filtered_result = correlation_result.loc[is_most_common_year, ['Country', 'Year', 'Pearson Correlation']]

In [158]:
# Merge with the original dataframe to include 'GDP per capita' and 'Gini coefficient'.
# Only the key and correlation columns of filtered_result are passed in, so
# re-running this cell does not merge already-merged value columns a second
# time (which would produce '_x'/'_y' suffixed duplicates).
filtered_result = pd.merge(
    filtered_result[['Country', 'Year', 'Pearson Correlation']],
    result_filtered,
    on=['Country', 'Year'],
    how='inner',
)

In [159]:
# Put the key columns first, then the two measures, then the correlation.
column_order = ['Country', 'Year', 'GDP per capita', 'Gini coefficient', 'Pearson Correlation']
filtered_result = filtered_result[column_order]

In [None]:
# Display the rows for the most common year, with GDP per capita, Gini
# coefficient and Pearson Correlation columns.
filtered_result

In [None]:
# Scatter plot of GDP per capita vs. Gini coefficient for the most common year
fig, ax = plt.subplots(figsize=(25, 15))

gdp_values = filtered_result['GDP per capita']
gini_values = filtered_result['Gini coefficient']
scatter = ax.scatter(gdp_values, gini_values, alpha=0.8, s=50)

# Add country labels next to their points
for country, gdp_value, gini_value in zip(filtered_result['Country'], gdp_values, gini_values):
    ax.annotate(country, (gdp_value, gini_value), fontsize=11)

# Least-squares trend line. A line joining (min x, min y) to (max x, max y)
# always slopes upward regardless of the data, so it says nothing about the
# relationship; a fitted line does. A fit needs at least two distinct x values.
if gdp_values.nunique() > 1:
    slope, intercept = np.polyfit(gdp_values.astype(float), gini_values.astype(float), 1)
    x_range = np.array([gdp_values.min(), gdp_values.max()], dtype=float)
    ax.plot(x_range, slope * x_range + intercept, linestyle='--', color='red', alpha=0.5)

# The year is taken from most_common_year rather than hard-coded, so the title
# always matches the rows that were actually plotted.
ax.set_title(f'GDP per capita vs. Gini coefficient in {most_common_year}')
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Gini coefficient')
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()