## Task 1

### Olley-Pakes

In [1]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.optimize import minimize, least_squares

In [2]:
oil = pd.read_stata("C:/Users/Popov/Documents/Studies/NES_studies/Stata/EIO/HA/HA1/oil.dta")

In [3]:
oil.head(3)

Unnamed: 0,okpo,companyname,year,fixedassets,sales,profit,profitaftertax,employees,wages,postcode,region,okved2008
0,75775799,rekaveri,2008.0,5.684062,0.0,0.0,0.0,2.0,,169710,Komi Republic,111011
1,75775799,rekaveri,2009.0,0.264514,0.0,0.0,-484.522644,2.0,,169710,Komi Republic,111011
2,75775799,rekaveri,2010.0,0.0,0.0,0.0,-418.413879,2.0,,169710,Komi Republic,111011


#### Data cleaning

In [4]:
oil.describe()

Unnamed: 0,year,fixedassets,sales,profit,profitaftertax,employees,wages,postcode,okved2008
count,6464.0,6464.0,6464.0,6125.0,6464.0,4811.0,1601.0,6472.0,6472.0
mean,2009.148824,388434.2,281676.2,173820.1,44476.63,674.4656,19354.84,369122.509116,111008.271477
std,2.640536,4488343.0,3006524.0,2892796.0,416073.8,4901.627774,118949.7,210124.791289,6.775765
min,2005.0,0.0,-41.76698,-95583.96,-5228478.0,1.0,0.03474635,100000.0,111000.0
25%,2007.0,0.244437,0.0,0.0,-29.42076,2.0,123.6998,125239.0,111000.0
50%,2009.0,625.0586,108.4843,0.0,0.0,32.0,1033.553,410056.0,111011.0
75%,2012.0,19562.64,12354.63,2281.163,339.23,206.0,5235.352,625000.0,111011.0
max,2014.0,181367100.0,143419300.0,143419300.0,11243780.0,103514.0,2103050.0,694460.0,111030.0


##### Define investment

In [5]:
oil['investment'] = 0

for i in range(oil.shape[0]):
    # Current fixed assets
    current_assets = oil['fixedassets'][i]
    
    # Next year's fixed assets for the same 'okpo'
    next_year_assets = oil.loc[(oil['okpo'] == oil['okpo'][i]) & (oil['year'] == (oil['year'][i] + 1)), 'fixedassets']
    
    # Calculate investment or assign NaN
    if next_year_assets.shape[0] > 0:
        oil['investment'][i] = max(0, next_year_assets.iloc[0] - current_assets)
    else:
        oil['investment'][i] = np.NaN

In [6]:
print(oil['investment'].describe())

count    4.566000e+03
mean     9.656358e+04
std      1.756652e+06
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.410462e+03
max      8.463550e+07
Name: investment, dtype: float64


##### Clean data to allow for logarithms

In [7]:
oil.replace([np.inf, -np.inf], np.nan, inplace=True)

##### Create exponents for our polynoms

In [8]:
oil['lnQ'] = np.log(oil['sales'])

oil['lnL'] = np.log(oil['employees'])

oil['lnI'] = np.log(oil['investment'])
oil['lnI_2'] = oil['lnI']**2
oil['lnI_3'] = oil['lnI']**3

oil['lnK'] = np.log(oil['fixedassets'])
oil['lnK_2'] = oil['lnK']**2
oil['lnK_3'] = oil['lnK']**3

Create lagged values

In [9]:
oil['lnQ_L1'] = 0
oil['lnL_L1'] = 0
oil['lnK_L1'] = 0

for i in range(oil.shape[0]):
    lnK_L1 = oil.loc[(oil['okpo'] == oil['okpo'][i]) & (oil['year'] == (oil['year'][i]-1)), 'lnK']
    if lnK_L1.shape[0] > 0:
        oil['lnK_L1'][i] = lnK_L1.iloc[0]
    else:
        oil['lnK_L1'][i] = np.NaN
    
    lnL_L1 = oil.loc[(oil['okpo'] == oil['okpo'][i]) & (oil['year'] == (oil['year'][i]-1)), 'lnL']
    if lnL_L1.shape[0] > 0:
        oil['lnL_L1'][i] = lnL_L1.iloc[0]
    else:
        oil['lnL_L1'][i] = np.NaN
    
    lnQ_L1 = oil.loc[(oil['okpo'] == oil['okpo'][i]) & (oil['year'] == (oil['year'][i]-1)), 'lnQ']
    if lnQ_L1.shape[0] > 0:
        oil['lnQ_L1'][i] = lnQ_L1.iloc[0]
    else:
        oil['lnQ_L1'][i] = np.NaN

In [10]:
oil.head(5)

Unnamed: 0,okpo,companyname,year,fixedassets,sales,profit,profitaftertax,employees,wages,postcode,...,lnL,lnI,lnI_2,lnI_3,lnK,lnK_2,lnK_3,lnQ_L1,lnL_L1,lnK_L1
0,75775799,rekaveri,2008.0,5.684062,0.0,0.0,0.0,2.0,,169710,...,0.693147,-inf,inf,-inf,1.737666,3.019484,5.246854,,,
1,75775799,rekaveri,2009.0,0.264514,0.0,0.0,-484.522644,2.0,,169710,...,0.693147,-inf,inf,-inf,-1.329863,1.768535,-2.35191,-inf,0.693147,1.737666
2,75775799,rekaveri,2010.0,0.0,0.0,0.0,-418.413879,2.0,,169710,...,0.693147,,,,-inf,inf,-inf,-inf,0.693147,-1.329863
3,74066994,unistroi,2005.0,0.38221,18.276581,18.276581,0.451703,15.0,,103055,...,2.70805,,,,-0.961785,0.925031,-0.889682,,,
4,73614678,interneft,2007.0,0.0,0.0,0.0,0.0,,,460000,...,,,,,-inf,inf,-inf,,,


### First stage ( $q_jt$ )

In [11]:
samp = oil[['okpo', 'year','lnQ', 'lnL','lnK','lnI', 'lnQ_L1', 'lnL_L1', 'lnK_L1', 'lnK_2', 'lnI_2', 'lnK_3', 'lnI_3']]
samp.replace([np.inf, -np.inf], np.nan, inplace=True)
samp = samp.dropna()
samp.reset_index(drop=True, inplace=True) 

In [12]:
samp.head(3)

Unnamed: 0,okpo,year,lnQ,lnL,lnK,lnI,lnQ_L1,lnL_L1,lnK_L1,lnK_2,lnI_2,lnK_3,lnI_3
0,51006317,2009.0,7.528523,3.7612,8.870316,6.434381,7.948603,3.688879,8.897689,78.682495,41.401261,697.938599,266.39149
1,51006317,2012.0,8.111992,3.688879,8.663717,9.879489,8.367654,3.931826,8.855697,75.059998,97.604313,650.298584,964.280779
2,82645181,2009.0,3.297558,2.197225,5.598308,5.735543,0.283234,1.098612,4.707927,31.341047,32.896454,175.456833,188.679027


In [13]:
Y = samp['lnQ']
X = samp[['lnL', 'lnK','lnI', 'lnK_2', 'lnI_2', 'lnK_3', 'lnI_3']]
X = sm.add_constant(X)

FS = sm.OLS(Y,X).fit(cov_type="hc1")

print(FS.summary())

                            OLS Regression Results                            
Dep. Variable:                    lnQ   R-squared:                       0.669
Model:                            OLS   Adj. R-squared:                  0.667
Method:                 Least Squares   F-statistic:                     286.5
Date:                Fri, 04 Oct 2024   Prob (F-statistic):          1.13e-235
Time:                        17:19:37   Log-Likelihood:                -2106.2
No. Observations:                1027   AIC:                             4228.
Df Residuals:                    1019   BIC:                             4268.
Df Model:                           7                                         
Covariance Type:                  hc1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.2303      0.577      5.601      0.0

### Second stage ( $q_{jt} - \hat{\beta} I_{jt}$ )

In [14]:
samp['phi_t'] = samp['lnQ'] - FS.params[1]*samp['lnL']
samp['phi_L1'] = samp['lnQ_L1'] - FS.params[1]*samp['lnL_L1']

def get_resid(Coefs): #add const
    pi_k = Coefs[0:3]
    alpha_K = Coefs[3]
    residuals = samp['phi_t'] - 3.2303 - alpha_K*samp['lnK'] - \
    pi_k[0]*(samp['lnQ_L1'] - 3.2303 -alpha_K*samp['lnK']) \
    + pi_k[1]*(samp['lnQ_L1'] - 3.2303 -alpha_K*samp['lnK'])**2 \
    + pi_k[2]*(samp['lnQ_L1'] - 3.2303 -alpha_K*samp['lnK'])**3
    return np.array(residuals)

def Opt(X0):
    result = least_squares(
        fun=get_resid,
        x0=X0, bounds=(0, 1), ftol=1e-8, xtol=1e-8, gtol=1e-8
    )
    return result.x

In [15]:
Opt([0.1, 0.2, 0.5, 0])

array([2.62681758e-01, 5.52617653e-02, 8.86549942e-43, 3.28530043e-01])

## Task 3

### Logit


In [16]:
ins = pd.read_stata \
    ("C:/Users/Popov/Documents/Studies/NES_studies/Stata/EIO/HA/HA1/insured.dta")

In [17]:
ins.head(3)

Unnamed: 0,healthy,age,male,insured,deg_nd,deg_ged,deg_hs,deg_ba,deg_ma,deg_phd,...,married,selfemp,familysz,reg_ne,reg_mw,reg_so,reg_we,race_bl,race_ot,race_wht
0,1.0,31.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,31.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,54.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [18]:
ins.describe()

Unnamed: 0,healthy,age,male,insured,deg_nd,deg_ged,deg_hs,deg_ba,deg_ma,deg_phd,...,married,selfemp,familysz,reg_ne,reg_mw,reg_so,reg_we,race_bl,race_ot,race_wht
count,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,...,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0,8802.0
mean,0.928539,38.936832,0.526358,0.801182,0.12713,0.04249,0.503749,0.175983,0.059532,0.015337,...,0.617246,0.121677,3.093502,0.191093,0.229834,0.349352,0.229721,0.12304,0.041468,0.835492
std,0.257608,11.110823,0.499333,0.399134,0.333138,0.201716,0.500014,0.380827,0.236631,0.122898,...,0.486087,0.326931,1.559633,0.393184,0.42075,0.476793,0.420677,0.328502,0.199381,0.370757
min,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,39.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,48.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,1.0,62.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Define the dependent and independent variables
X = ins[['healthy', 'age', 'male', 'deg_nd', 'deg_ged', 'deg_hs', 'deg_ba', 'deg_ma', 'deg_phd', 
        'married', 'selfemp', 'familysz', 'reg_ne', 'reg_mw', 'reg_so', 'race_bl', 'race_ot']]
y = ins['insured']

# Add a constant 
X = sm.add_constant(X)

# Fit the logit
logit_model = sm.Logit(y, X).fit(cov_type='HC2')

# Print the summary
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.427051
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                insured   No. Observations:                 8802
Model:                          Logit   Df Residuals:                     8784
Method:                           MLE   Df Model:                           17
Date:                Fri, 04 Oct 2024   Pseudo R-squ.:                  0.1438
Time:                        17:19:38   Log-Likelihood:                -3758.9
converged:                       True   LL-Null:                       -4390.1
Covariance Type:                  HC2   LLR p-value:                5.516e-258
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5547      0.218      2.544      0.011       0.127       0.982
healthy        0.2562      0.

In [20]:
# Get the LaTeX output
latex_output = logit_model.summary().as_latex()

with open("logit_results.tex", "w") as f:
    f.write(latex_output)