In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import functions as func

In [2]:
# read in csv file and creating an original df which we wont change. dropping certain unnecessary columns and renaming the necessary columns.

csv_file_path = r'..\Data\GDP_growth\real-gdp-growth.csv'
df_gdp_rate_original = pd.read_csv(csv_file_path)
df_gdp_rate = df_gdp_rate_original.copy()
df_gdp_rate = df_gdp_rate.drop(columns=['Gross domestic product, constant prices - Percent change - Forecasts'])
df_gdp_rate.rename(columns={'Entity': 'country', 'Gross domestic product, constant prices - Percent change - Observations' : 'gdp_change_%_observation', 'Year' : 'year'}, inplace=True)

In [3]:
# Because the education dataset only has data per 5 years we change the Gdp set to 5 years intervals. We adjust the change in rate to sum up all increases so we have te correct data.

df_gdp_rate['year'] = df_gdp_rate['year'] // 5 * 5
df_gdp_rate = df_gdp_rate.groupby(['country', 'year']).agg({'gdp_change_%_observation': 'sum'}).reset_index()

In [4]:
# read in csv file and creating an original df which we wont change. Dropping unnecessary columns.

csv_file_path = r'..\Data\Education\OUP_long_MF2564_v1.csv'
df_edu_original = pd.read_csv(csv_file_path)
df_edu = df_edu_original.copy()
df_edu = df_edu.drop(columns=['BLcode', 'WBcode' , 'region_code', 'sex'] , errors='ignore')


df_edu = func.edu_perc(df_edu)
# df_edu['yr_sch_%'] = df_edu.groupby('country')['yr_sch'].pct_change() * 100
# df_edu['yr_sch_pri_%'] = df_edu.groupby('country')['yr_sch_pri'].pct_change() * 100
# df_edu['yr_sch_sec_%'] = df_edu.groupby('country')['yr_sch_sec'].pct_change() * 100
# df_edu['yr_sch_ter_%'] = df_edu.groupby('country')['yr_sch_ter'].pct_change() * 100
# df_edu['lp_%'] = df_edu.groupby('country')['lp'].pct_change() * 100
# df_edu['lpc_%'] = df_edu.groupby('country')['lpc'].pct_change() * 100
# df_edu['lsc_%'] = df_edu.groupby('country')['lsc'].pct_change() * 100
# df_edu['lhc_%'] = df_edu.groupby('country')['lhc'].pct_change() * 100
# df_edu['pop_%'] = df_edu.groupby('country')['pop'].pct_change() * 100

df_edu = df_edu[df_edu['year'] >= 1980]
df_edu.head()

KeyError: "['gdp', 'gdp_%_change', 'gdp_per_capita', 'gdp_%_change_capita'] not in index"

In [None]:
# Merging Gdp and Education data sets on country and year, dropping unnecessary columns. Also resetting the index on country

merged_df = pd.merge(df_gdp_rate, df_edu, on=['country', 'year'], how='inner')
merged_df = merged_df.drop(columns=['BLcode', 'WBcode' , 'region_code', 'sex', 'Code'] , errors='ignore')
pd.set_option('display.max_columns', None)
merged_df.set_index('country', inplace=True)
merged_df.head(10)

In [None]:
# Checking for null and if value which will be problematic for calculations/visualization later.
print(merged_df['gdp_change_%_observation'].isnull().sum())
print(np.isinf(merged_df['gdp_change_%_observation']).sum())

print(merged_df['lhc_%'].isnull().sum())
print(np.isinf(merged_df['lhc_%']).sum())

In [None]:


merged_df.dropna(subset=['gdp_change_%_observation', 'lhc_%'], inplace=True)
merged_df = merged_df[~np.isinf(merged_df['lhc_%'])]

correlation = merged_df['yr_sch_%'].corr(merged_df['gdp_change_%_observation'])
print(correlation)

In [None]:
df_clean = merged_df.dropna()
df_clean = df_clean.replace([np.inf, -np.inf], np.nan).dropna()
X = df_clean['gdp_change_%_observation']
y = df_clean['lpc_%']

X = sm.add_constant(X)


model = sm.OLS(y, X).fit()
print(model.summary())


plt.scatter(X['gdp_change_%_observation'], y, alpha=0.5, label='Data')
plt.plot(X['gdp_change_%_observation'], model.predict(X), color='red', label='Regression Line')
plt.xlabel('gdp_change_%_observation')
plt.ylabel('lpc_%')
plt.title('Linear Regression Plot')
plt.legend()
plt.show()

In [None]:
merged_df.columns


In [None]:
df_clean = merged_df.dropna()
df_clean = df_clean.replace([np.inf, -np.inf], np.nan).dropna()

z_scores_gdp = stats.zscore(df_clean['gdp_change_%_observation'])
z_scores_lpc = stats.zscore(df_clean['lpc_%'])
threshold = 3

merged_df_no_extreme = df_clean[(abs(z_scores_gdp) < threshold) & (abs(z_scores_lpc) < threshold)]

X = merged_df_no_extreme['gdp_change_%_observation']
y = merged_df_no_extreme['lpc_%']

X = sm.add_constant(X)


model = sm.OLS(y, X).fit()
print(model.summary())


plt.scatter(X['gdp_change_%_observation'], y, alpha=0.5, label='Data')
plt.plot(X['gdp_change_%_observation'], model.predict(X), color='red', label='Regression Line')
plt.xlabel('gdp_change_%_observation')
plt.ylabel('lpc_%')
plt.title('Linear Regression Plot')
plt.legend()
plt.show()

In [None]:
correlation = merged_df_no_extreme['gdp_change_%_observation'].corr(merged_df_no_extreme['lpc_%'])
print(correlation)

Overall, while there appears to be a positive relationship between the percentage change in GDP and the percentage change in people who finished primary school, the low R-squared value and the relatively high p-value indicate that the relationship may not be very strong or statistically significant. 