# ESTIMATIONS AND RESULTS

# Reset everything and redo data cleaning

In [474]:
%reset -f

In [475]:
import pandas as pd 
import numpy as np 
import numpy.linalg as la
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from scipy.stats import norm
from sklearn.preprocessing import PolynomialFeatures

# Reads same data as before

In [476]:
dat = pd.read_csv('growth.csv')
lbldf = pd.read_csv('labels.csv', index_col='variable')
lbl_all = lbldf.label.to_dict() # as a dictionary
print(f'The data contains {dat.shape[0]} rows (countries) and {dat.shape[1]} columns (variables).')

The data contains 214 rows (countries) and 85 columns (variables).


# Defines variable groups

In [477]:
# all available variables
# 'demCGV', 'demBMR',
vv_institutions = ['marketref', 'dem', 'demreg']
vv_geography = [
        'tropicar','distr', 'distcr', 'distc','suitavg','temp', 'suitgini', 'elevavg', 'elevstd',
        'kgatr', 'precip', 'area', 'abslat', 'cenlong', 'area_ar', 'rough','landlock', 
        'africa',  'asia', 'oceania', 'americas' # 'europe' is the reference
]
#'pdiv_aa', 'pdivhmi', 'pdivhmi_aa'
# vv_historical = ['pd1000', 'pd1500', 'pop1000', 'pop1500', 'ln_yst']

vv_geneticdiversity = ['pdiv'] # these are often missing: ['pd1', 'pop1']
vv_religion = ['pprotest', 'pcatholic', 'pmuslim']
# 'uvdamage'
vv_danger = ['yellow', 'malfal'] # Diseases
vv_resources = ['oilres', 'goldm', 'iron', 'silv', 'zinc']
vv_educ = ['ls_bl', 'lh_bl'] # secondary, tertiary: we exclude 'lp_bl' (primary) to avoid rank failure 
vv_popgrowth = ['pop_growth']

#'historical': vv_historical,
vv_all = {'institutions': vv_institutions, 
          'geography': vv_geography, 
          'geneticdiversity': vv_geneticdiversity,
          'religion': vv_religion,
          'danger':vv_danger, 
          'resources':vv_resources,
          'popgrowth': vv_popgrowth,
        'education': vv_educ
         }
list_of_lists = vv_all.values()
vv_all['all'] = [v for sublist in list_of_lists for v in sublist]

# Variables we use
# Institutions, geography, religion, danger (health), resources, education, pop growth

# Code job for Lucas
# Use variables choosen
# Graph Lasso with different penalty levels
# Lasso with CV, BRT and BCCH at difference penalty levels like in week 4 exercise
# Use BRT and BCCH optimal penalty level and use dobbel lasso and post dobbelt lasso.



In [478]:
# convenient to keep a column of ones in the dataset
dat['constant'] = np.ones((dat.shape[0],))

In [479]:
# 1. avoiding missings 
I = dat[['gdp_growth', 'lgdp_initial']].notnull().all(axis=1)

# Define what we want to estimate
xs = ['constant','lgdp_initial']


# 2. extract dataset 
y = dat.loc[I, 'gdp_growth'].values.reshape((-1,1)) * 100.0
X = dat.loc[I, ['constant','lgdp_initial']].values

# 3. run OLS
betahat = np.linalg.inv(X.T @ X) @ X.T @ y
print(betahat)

tab = pd.DataFrame({'beta': betahat.flatten()}, index=xs)
tab

[[ 2.24792211]
 [-0.08713416]]


Unnamed: 0,beta
constant,2.247922
lgdp_initial,-0.087134


# Defines X and variables

In [480]:
controls = vv_all['geography'] + vv_all['institutions'] + vv_all['religion'] + vv_all['danger'] + vv_all['resources'] + vv_all['popgrowth'] + vv_all['education']
vars = ['lgdp_initial'] + controls
all_vars = ['gdp_growth'] + vars 

I = dat[all_vars].notnull().all(1)

In [481]:
y = dat.loc[I,'gdp_growth'].values.reshape((-1,1)) * 100. #easier to read output when growth is in 100%
d = dat.loc[I ,'lgdp_initial']
X = dat.loc[I, vars].values
Z_basic = dat.loc[I , controls].values

In [482]:
Z = PolynomialFeatures(degree=2, include_bias=False).fit_transform(Z_basic)

# Makes Standarization

In [483]:
# Create a function for standardizing
def standardize(X):

    X_stan = (X - np.mean(X, axis=0))/np.std(X, axis=0, ddof=1)
    return X_stan

# Standardize data
X_stan = standardize(X)
d_stan = standardize(d)
Z_stan = standardize(Z_basic)
Z_pol_stan = standardize(Z)


  X_stan = (X - np.mean(X, axis=0))/np.std(X, axis=0, ddof=1)


In [484]:
# Tests
print(f'Mean y = {y.mean(): 5.2f}% growth per year')
print(f'Mean d = {d.mean(): 5.2f} log initial GDP per capita')


Mean y =  1.58% growth per year
Mean d =  7.90 log initial GDP per capita


# Check rank

In [485]:
assert np.linalg.matrix_rank(X) == X.shape[1], f'X does not have full rank'

In [486]:
# First let's check the dimensions of X
print(f"X shape: {X.shape}")

# Check for constant columns
constant_cols = np.where(np.std(X, axis=0) == 0)[0]
print(f"Constant columns: {[vars[i] for i in constant_cols]}")

# Check correlations
corr_matrix = pd.DataFrame(X, columns=vars).corr()
high_corr = np.where(np.abs(corr_matrix) > 0.90)
high_corr_pairs = [(vars[i], vars[j]) for i, j in zip(*high_corr) if i < j]
print("\nHighly correlated pairs (>0.90):")
for pair in high_corr_pairs:
    print(f"{pair[0]} - {pair[1]}")


X shape: (72, 38)
Constant columns: []

Highly correlated pairs (>0.90):


# 1: OLS

In [487]:
N = X.shape[0]

In [488]:
# Add a constant to X
xx = np.column_stack((np.ones(N),X))

# Reshape y
yy = np.array(y).reshape(-1,1)

# Calculate OLS estimate
coefs_OLS = la.inv(xx.T@xx)@xx.T@yy
alpha_OLS = coefs_OLS[1][0]

# Calculate residuals
res_OLS = yy - xx@coefs_OLS

# Display alpha
print("alpha_OLS = ",alpha_OLS.round(6))

alpha_OLS =  -0.922987


# Double Post Lasso no polynomial

Define BRT penalty

In [489]:
# Make a function that calculates BRT. Hint: You implemented a version of this last week
def BRT(X_tilde,y):
    (N,p) = X_tilde.shape
    sigma = np.std(y, ddof=1)
    c=1.1
    alpha=0.05

    penalty_BRT= (sigma*c)/np.sqrt(N)*norm.ppf(1-alpha/(2*p))

    return penalty_BRT

In [490]:
# Calculate BRT
penalty_BRTyx = BRT(X_stan, y)
print("lambda_BRT =",penalty_BRTyx.round(2))

lambda_BRT = 0.6


BCCH

In [491]:
def BCCH(X,y):

    n,p = X.shape
    c = 1.1; alpha = 0.05

    coef_pilot = Lasso(alpha).fit(X,y).coef_
    coef_intercept = Lasso(alpha).fit(X,y).intercept_
    pred = (coef_intercept + X@coef_pilot)

    res = y - pred

    resXscale = (np.max((X.T ** 2.0) @ (res ** 2.0) / n)) ** 0.5
    lambda_bcch = c*norm.ppf(1.0-alpha/(2.0*p))*resXscale/np.sqrt(n)    
    penalty_BCCH = lambda_bcch

    return penalty_BCCH  

In [492]:
# Calculate BCCH
penalty_BCCH  = BCCH(X_stan, y)
print("lambda_BCCH =",penalty_BCCH.round(2))

lambda_BCCH = 2.41


In [493]:
# Run Lasso 
fit_BRTyx = Lasso(penalty_BRTyx, max_iter=10000).fit(X_stan, y)
coefs = fit_BRTyx.coef_

# Calculate residuals
resyx = y - fit_BRTyx.predict(X_stan)

# Calculate Y - Z@gamma (epsilon + alpha*d)
# Hint: You only need the variables given to you in this cell, in addition
# to a standardized data set you made previoously.
resyxz = resyx + d_stan*coefs[0]

# Display first coefficient
print("First coefficient =",coefs[0].round(5))

ValueError: Data must be 1-dimensional, got ndarray of shape (72, 72) instead

Calculates BRT for treatment

In [None]:
# Calculate BRT
penalty_BRTdz = BRT(Z_stan, d)

In [None]:
# Run Lasso
fit_BRTdz = Lasso(penalty_BRTdz, max_iter=10000).fit(Z_stan, d)
coefs=fit_BRTdz.coef_

# Calculate residuals
resdz=d-fit_BRTdz.predict(Z_stan)

# Display first coefficient
print("First coefficient =",coefs[0].round(5))

First coefficient = 0.0


Estimates alpha

In [None]:
# Calculate alpha
num = resdz@resyxz
denom = resdz@d
alpha_PDL = num/denom

# Display alpha
print("alpha_PDL = ",alpha_PDL.round(2))

alpha_PDL =  [-0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16
 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16
 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16
 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16
 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16
 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16 -0.16]


# Double Lasso with Polynomial

# Post partial lasso with no polynomial

# Post partial lasso with polynomial