# **QUESTION 1.a**

In [None]:
######################################### PART A #########################################
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels import PanelOLS
import numpy as np
import patsy
from numpy.polynomial.polynomial import polyfit

# 1. Read the data
df = pd.read_csv("GMdata.csv",sep='\t')

# 2. Basic summary
print("\nDescriptive statistics:")
print(df.describe())

# 3. Check the time dimension per firm
#    Count how many years each firm has
year_counts = df.groupby('index')['yr'].nunique()

# 4. Identify the balanced panel
#    Suppose we expect 4 unique years (73, 78, 83, 88) for each firm
balanced_firms = year_counts[year_counts == 4].index

# Create separate dataframes
df_balanced = df[df['index'].isin(balanced_firms)].copy()
df_unbalanced = df[~df['index'].isin(balanced_firms)].copy()

print("\nNumber of firms in balanced panel:", len(balanced_firms))
print("Number of firms (total) in unbalanced panel:", df['index'].nunique())

# 5. Summarize balanced and unbalanced panels
print("\nBalanced panel summary:")
print(df_balanced.describe())

print("\nUnbalanced panel summary:")
print(df_unbalanced.describe())



Descriptive statistics:
             index         sic3           yr        ldsal         lemp  \
count  2971.000000  2971.000000  2971.000000  2971.000000  2971.000000   
mean    696.203299   331.455739    80.489061     5.673087     1.259177   
std     404.779371    51.952189     5.351874     1.960717     1.775248   
min       1.000000   200.000000    73.000000    -0.857349    -3.772261   
25%     343.500000   286.000000    78.000000     4.250526    -0.024805   
50%     696.000000   356.000000    78.000000     5.529348     1.114157   
75%    1048.000000   367.000000    83.000000     7.083825     2.631889   
max    1400.000000   399.000000    88.000000    11.698400     6.732211   

             ldnpt        ldrst        ldrnd        ldinv  
count  2971.000000  2971.000000  2971.000000  2971.000000  
mean      4.468996     3.400962     1.787530     2.674828  
std       2.216520     2.028775     2.052410     2.170476  
min      -1.389284    -4.287164    -5.313206    -3.844328  
25%     

# **QUESTION 1.b**

In [35]:
# Balanced panel regression (no dummies)
model_bal_basic = smf.ols("ldsal ~ lemp + ldnpt",
                          data=df_balanced).fit()
print("OLS (Balanced), no dummies:")
print(model_bal_basic.summary())

OLS (Balanced), no dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.913
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     4496.
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:40:05   Log-Likelihood:                -688.18
No. Observations:                 856   AIC:                             1382.
Df Residuals:                     853   BIC:                             1397.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7235   

In [36]:
# Unbalanced panel regression (no dummies)
model_unbal_basic = smf.ols("ldsal ~ lemp + ldnpt",
                            data=df_unbalanced).fit()
print("\nOLS (Unbalanced), no dummies:")
print(model_unbal_basic.summary())


OLS (Unbalanced), no dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     9519.
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:40:34   Log-Likelihood:                -1780.4
No. Observations:                2115   AIC:                             3567.
Df Residuals:                    2112   BIC:                             3584.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.2322

In [41]:
#Example with balanced panel and dummies
model_bal_dummies = smf.ols("ldsal ~ lemp + ldnpt + C(yr)+C(sic3)",
                            data=df_balanced).fit()
print("\nOLS (Balanced), with dummies:")
print(model_bal_dummies.summary())


OLS (Balanced), with dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.961
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                     323.0
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:44:12   Log-Likelihood:                -343.66
No. Observations:                 856   AIC:                             811.3
Df Residuals:                     794   BIC:                             1106.
Df Model:                          61                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept    

In [42]:
# Example with unbalanced panel and dummies:
model_unbal_dummies = smf.ols("ldsal ~ lemp + ldnpt + C(yr) + C(sic3)",
                              data=df_unbalanced).fit()
print("\nOLS (Unbalanced), with year & industry dummies:")
print(model_unbal_dummies.summary())


OLS (Unbalanced), with year & industry dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     240.7
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:44:52   Log-Likelihood:                -1293.5
No. Observations:                2115   AIC:                             2835.
Df Residuals:                    1991   BIC:                             3537.
Df Model:                         123                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

# **QUESTION 1.c**

In [38]:

# First, we need a multi-index: (firm, year)
df_panel = df_unbalanced.set_index(['index','yr']).sort_index()

# The PanelOLS formula approach:
# Here 'ldsal' is the dependent variable,
# 'lemp' and 'ldpt' are regressors, plus we include "EntityEffects" and "TimeEffects".
fe_model = PanelOLS.from_formula(
    formula='ldsal ~ 1 + lemp + ldnpt + EntityEffects + TimeEffects',
    data=df_panel
).fit()
print(fe_model)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ldsal   R-squared:                        0.5329
Estimator:                   PanelOLS   R-squared (Between):              0.8836
No. Observations:                2115   R-squared (Within):               0.5269
Date:                Thu, Feb 27 2025   R-squared (Overall):              0.8767
Time:                        01:40:52   Log-likelihood                    299.68
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      527.17
Entities:                        1186   P-value                           0.0000
Avg Obs:                       1.7833   Distribution:                   F(2,924)
Min Obs:                       1.0000                                           
Max Obs:                       3.0000   F-statistic (robust):             527.17
                            

In [39]:
#Predict probability of survival of a firm using probit model

# 1. Define the dependent variable
df['survived'] = (df['ldsal'] > 0).astype(int)

# 2. Define the model
model_probit = smf.probit("survived ~ lemp + ldnpt", data=df).fit()
print("\nProbit model:")
print(model_probit.summary())

# 3. Predict the probability of survival
df['survival_prob'] = model_probit.predict(df)

# 4. Check the predictions
print("\nPredictions:")
print(df[['index','yr','survived','survival_prob']].head(10))

#summary of survival probability
print("\nSummary of survival probability:")
print(df['survival_prob'].describe())

Optimization terminated successfully.
         Current function value: 0.003901
         Iterations 14

Probit model:
                          Probit Regression Results                           
Dep. Variable:               survived   No. Observations:                 2971
Model:                         Probit   Df Residuals:                     2968
Method:                           MLE   Df Model:                            2
Date:                Thu, 27 Feb 2025   Pseudo R-squ.:                  0.5108
Time:                        01:41:17   Log-Likelihood:                -11.590
converged:                       True   LL-Null:                       -23.693
Covariance Type:            nonrobust   LLR p-value:                 5.544e-06
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.2195      1.529      3.414      0.001       2.223       8.216
lemp         

In [None]:


# Assume we have: y = ldsal, labor = lemp, capital = ldpt, investment = ldinv.
# 1) First stage: y - beta_l * l = phi(k, i) + eps
#    Where phi(...) is approximated by a polynomial or spline in k and i.

# We guess an initial labor coefficient from OLS or from prior knowledge:
initial_beta_l = 0.5  # example placeholder

# Construct the dependent variable for the first stage
df_unbalanced['y_tilde'] = df_unbalanced['ldsal'] - initial_beta_l * df_unbalanced['lemp']

# Let's do a simple polynomial in capital and investment:
df_unbalanced['k'] = df_unbalanced['ldpt']
df_unbalanced['i'] = df_unbalanced['ldinv']

# Example: a second-order polynomial in (k, i)
#  y_tilde = a0 + a1*k + a2*i + a3*k^2 + a4*i^2 + a5*k*i + error
first_stage_model = smf.ols("y_tilde ~ k + i + I(k**2) + I(i**2) + I(k*i)",
                            data=df_unbalanced).fit()
print(first_stage_model.summary())

# Predicted "phi_hat"
df_unbalanced['phi_hat'] = first_stage_model.fittedvalues

# 2) Second stage: identify the capital coefficient by projecting out the expected productivity
#    and controlling for survival or selection. In OP, we use the "phi_hat" and "phi_hat_lag"
#    plus survival corrections. The details can be quite involved.

# For illustration:
df_unbalanced['phi_hat_lag'] = df_unbalanced.groupby('index')['phi_hat'].shift(1)

# Then we might run:
second_stage_model = smf.ols("ldsal ~ lemp + ldpt + phi_hat_lag",
                             data=df_unbalanced.dropna()).fit()
print(second_stage_model.summary())

# The coefficient on ldpt would be your OP estimate of capital's coefficient.
# If you want to incorporate selection controls, you'd add an inverse Mills ratio
# from a survival probit, or a polynomial in phi_hat_lag. This is just a skeleton.

NameError: name 'df_unbalanced' is not defined

In [8]:
# 1) Suppose second_stage_model from OP gave us fitted values or residuals as firm TFP
df_unbalanced['tfp_hat'] = second_stage_model.resid  # or fitted productivity measure

# 2) Compute firm-year market share within each SIC3:
df_unbalanced['sector_sales'] = df_unbalanced.groupby(['sic3','yr'])['ldsal'].transform(lambda x: x.sum())
# CAREFUL: 'ldsal' is log(sales). We might want actual sales, not logs.
# If you only have logs, do something like:
df_unbalanced['sales'] = df_unbalanced['ldsal'].apply(np.exp)  # approximate real sales
df_unbalanced['sector_sales'] = df_unbalanced.groupby(['sic3','yr'])['sales'].transform('sum')
df_unbalanced['mshare'] = df_unbalanced['sales'] / df_unbalanced['sector_sales']

# 3) Weighted TFP at sector-year level
df_unbalanced['weighted_tfp'] = df_unbalanced['mshare'] * df_unbalanced['tfp_hat']

# 4) Aggregate by sector-year
sector_year_tfp = df_unbalanced.groupby(['sic3','yr'])['weighted_tfp'].sum().reset_index()
sector_year_tfp.rename(columns={'weighted_tfp':'aggregate_tfp'}, inplace=True)

print(sector_year_tfp)

NameError: name 'second_stage_model' is not defined