In [694]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.font_manager
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
import plotly.express as px
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.cm as cm

In [695]:
# Mount Google Drive at /content/drive; the CSV files read below live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [696]:
# Load Colab's `data_table` IPython extension.
# NOTE(review): presumably this makes DataFrames render as interactive tables — confirm.
%load_ext google.colab.data_table

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


In [697]:
# Is there a relation between a country's Gross Domestic Product (GDP) and its income inequality?
# Be aware of the difference between correlation and causation here. A might cause B. B might cause A. But both A and B could be caused by an unknown C as well.
# One way to express income inequality is to look at a country's "Gini coefficient" (also known as "Gini index"). You can find a dataset of Gini coefficients here (opens in a new tab).
# You can find a dataset with historical GDP data here (opens in a new tab).
# To be able to answer this question you would want to calculate the "correlation coefficient" of the GDP and the Gini coefficient. But before you can do that you may need to resample the data so a correlation coefficient can be calculated.


In [None]:
# Dataframe 1: read the inequality CSV from the mounted Drive (latin-1 encoded)
# and show it.
income = pd.read_csv(
    "/content/drive/MyDrive/RawData/inequality.csv",
    encoding="latin-1",
)
income

In [None]:
# Dataframe 1: keep the contiguous run of columns from "Country" through the
# pre-tax Gini coefficient (label slices in .loc include both endpoints).
income_filtered = income.loc[
    0:,
    "Country":"Gini coefficient (before tax) (World Inequality Database)",
]
income_filtered

In [None]:
# Dataframe 2: read the Maddison GDP-per-capita CSV from the mounted Drive
# (latin-1 encoded) and show it.
gdp = pd.read_csv(
    "/content/drive/MyDrive/RawData/gdp-per-capita-maddison.csv",
    encoding="latin-1",
)
gdp

In [None]:
# Dataframe 2: keep the contiguous run of columns from "Entity" through
# "GDP per capita" (label slices in .loc include both endpoints).
gdp_filtered = gdp.loc[
    0:,
    "Entity":"GDP per capita",
]
gdp_filtered

In [None]:
# Give the GDP table's "Entity" column the same name ("Country") that the
# inequality table uses, so both frames share their key column names.
gdp_filtered = gdp_filtered.rename({"Entity": "Country"}, axis="columns")
gdp_filtered

In [None]:
# Combine the GDP and inequality tables on (Country, Year).
# how="inner" keeps only the Country/Year pairs that occur in BOTH frames;
# pairs present in just one of them are left out of the result.
merge_result = gdp_filtered.merge(
    income_filtered,
    how="inner",
    on=["Country", "Year"],
)
merge_result

In [None]:
# Discard every row that has a NaN in any column.
result_filtered = merge_result.dropna(axis=0, how="any")
result_filtered

In [None]:
# Shorten the long World Inequality Database column name to "Gini coefficient".
# The result is re-assigned instead of renaming in place: result_filtered is
# derived from dropna(), and in-place edits on such derived frames can raise
# SettingWithCopyWarning on some pandas versions.
result_filtered = result_filtered.rename(
    columns={'Gini coefficient (before tax) (World Inequality Database)': 'Gini coefficient'}
)

In [706]:
# Use (Country, Year) as the row index of the dataframe.
# set_index() consumes the two columns, so the call is guarded: re-running this
# cell once the index is already in place would otherwise raise a KeyError.
index_columns = ['Country', 'Year']
if list(result_filtered.index.names) != index_columns:
    result_filtered = result_filtered.set_index(index_columns)

In [707]:
# Start from an empty results table that already has the three output columns.
correlation_result = pd.DataFrame(
    columns=[
        "Country",
        "Year",
        "Pearson Correlation",
    ]
)

In [708]:
# Split the cleaned data into one group per country.
grouped = result_filtered.groupby(by="Country")

In [None]:
# Compute ONE Pearson coefficient per country (GDP per capita vs. Gini
# coefficient, over all of that country's rows) and repeat it on a row for
# every year in which the country has data.
#
# The per-country frames are collected in a list and concatenated once:
#   * re-running this cell rebuilds the table instead of appending duplicates,
#   * it avoids the quadratic cost of calling pd.concat inside the loop,
#   * it avoids concatenating onto an empty object-dtype frame, which newer
#     pandas versions warn about.
country_frames = []
for country, group in grouped:
    correlation = group['GDP per capita'].corr(group['Gini coefficient'], method='pearson')
    years = group.index.get_level_values('Year')
    country_frames.append(
        pd.DataFrame({'Country': country, 'Year': years, 'Pearson Correlation': correlation})
    )

if country_frames:
    correlation_result = pd.concat(country_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty table with the
    # expected columns.
    correlation_result = pd.DataFrame(columns=['Country', 'Year', 'Pearson Correlation'])

In [710]:
# Replace the row labels with a fresh 0..n-1 index, discarding the old labels.
correlation_result = correlation_result.reset_index(drop=True)

In [None]:
# Display the resulting dataframe: columns Country, Year and Pearson Correlation
correlation_result

In [None]:
# Year that occurs on the most rows. mode() can return several tied values;
# the first one is used.
most_common_year = correlation_result["Year"].mode().iat[0]
most_common_year

In [None]:
# Keep only the rows for the most common year, and of those only the country
# name and its Pearson correlation.
filtered_result = correlation_result.loc[
    correlation_result["Year"] == most_common_year,
    ["Country", "Pearson Correlation"],
]
filtered_result

In [714]:
# Keep the `top` countries with the HIGHEST Pearson correlation.
# The previous ascending sort followed by [:top] kept the lowest values
# whenever there were more than `top` rows.
# The selection is then put back in ascending order, because the plotting cell
# inverts the x-axis to show the strongest positive correlation on the left.
# NaN correlations sort last in both passes, so they are the first to be cut.
top = 100
filtered_result = (
    filtered_result
    .sort_values(by='Pearson Correlation', ascending=False)
    .head(top)
    .sort_values(by='Pearson Correlation', ascending=True)
)

In [715]:
# Color coding based on the Pearson Correlation.
# A colormap called with floats expects values in [0, 1], while correlations
# lie in [-1, 1]. Map them through Normalize(-1, 1) first (the same scale the
# colorbar in the plotting cell uses); otherwise every negative value is
# clipped to the same end-of-scale red and the bars disagree with the colorbar.
colors = cm.RdYlGn(
    plt.Normalize(vmin=-1, vmax=1)(
        filtered_result['Pearson Correlation'].to_numpy(dtype=float)
    )
)

In [716]:
# Add categories based on the Pearson Correlation values.
# Each value is placed in a 0.2-wide band that is closed on the left
# (e.g. 0.6 <= r < 0.8 -> '0.6 - 0.8'). The outermost edges are infinite so a
# value of exactly 1.0 (or one marginally beyond ±1 from rounding) still lands
# in an end band.
# An undefined (NaN) correlation now stays NaN instead of being labelled
# '-1.0 - -0.8', which is what the former if/elif chain's final `else` did.
category_edges = [-np.inf, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8, np.inf]
category_labels = [
    '-1.0 - -0.8',
    '-0.8 - -0.6',
    '-0.6 - -0.4',
    '-0.4 - -0.2',
    '-0.2 - 0.0',
    '0.0 - 0.2',
    '0.2 - 0.4',
    '0.4 - 0.6',
    '0.6 - 0.8',
    '0.8 - 1.0',
]

# `categories` stays a plain list (one label per row, in row order).
categories = pd.cut(
    pd.to_numeric(filtered_result['Pearson Correlation']),
    bins=category_edges,
    labels=category_labels,
    right=False,  # bands are [lower, upper), matching the original `>=` tests
).astype(object).tolist()

# assign() returns a new frame, so no column is written into a slice of another
# dataframe (which can raise SettingWithCopyWarning).
filtered_result = filtered_result.assign(Category=categories)

In [None]:
# Bar chart of the Pearson correlation between GDP per capita and the Gini
# coefficient, one bar per country in `filtered_result` (the countries that
# have a row for `most_common_year`, the year covering the most countries).
# The closer a bar is to +1 or -1, the stronger the linear relationship between
# a country's Gross Domestic Product (GDP) and its income inequality; the sign
# gives the direction and values near 0 indicate a weak relationship.
fig, ax = plt.subplots(figsize=(25, 12))
bars = ax.bar(filtered_result['Country'], filtered_result['Pearson Correlation'], color=colors)
ax.axhline(0, color='gray', linestyle='--')
ax.set_ylabel('Pearson Correlation')
# Use the computed year in the title instead of a hard-coded one.
ax.set_title(f'Pearson Correlation for {most_common_year}')
# filtered_result is sorted ascending; inverting the axis puts the highest
# correlation on the left.
ax.invert_xaxis()
ax.tick_params(axis='x', labelrotation=75)
ax.grid(axis='both', linestyle='--', alpha=0.8)
fig.tight_layout()

# Add a legend for the color usage with a specific position and width for the color bar
sm = cm.ScalarMappable(cmap=cm.RdYlGn, norm=plt.Normalize(vmin=-1, vmax=1))
sm.set_array([])
# The colorbar gets its own axes just past the right edge of the figure;
# it is added after tight_layout() so the layout pass leaves it alone.
cax = fig.add_axes([1.0, 0.15, 0.02, 0.7])
cbar = fig.colorbar(sm, cax=cax)
cbar.set_label('Correlation Strength')

plt.show()