Question 1


In [None]:
######################################### PART A #########################################
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels import PanelOLS
import numpy as np
import patsy
from numpy.polynomial.polynomial import polyfit

# 1. Read the data (the file is tab-separated)
df = pd.read_csv("GMdata.csv", sep='\t')

# 2. Basic summary
print("\nDescriptive statistics:")
print(df.describe())

# 3. Check the time dimension per firm
#    Count how many distinct years each firm ('index') is observed in
year_counts = df.groupby('index')['yr'].nunique()

# 4. Identify the balanced panel
#    A firm is "balanced" when it is observed in all 4 waves (73, 78, 83, 88)
balanced_firms = year_counts[year_counts == 4].index

# Create separate dataframes.
# The unbalanced panel is the FULL sample (every firm, however many waves it
# has); the balanced panel is the subset of firms present in every wave.
# Previously df_unbalanced held only the firms *not* in the balanced panel,
# which contradicted the firm count printed below (df['index'].nunique()) and
# removed all always-surviving firms from the later survival analysis.
df_balanced = df[df['index'].isin(balanced_firms)].copy()
df_unbalanced = df.copy()

print("\nNumber of firms in balanced panel:", len(balanced_firms))
print("Number of firms (total) in unbalanced panel:", df_unbalanced['index'].nunique())

# 5. Summarize balanced and unbalanced panels
print("\nBalanced panel summary:")
print(df_balanced.describe())

print("\nUnbalanced panel summary:")
print(df_unbalanced.describe())


Question 2

In [None]:
# Pooled OLS on the balanced panel: ldsal on lemp and ldnpt,
# with no year or industry dummies.
model_bal_basic = smf.ols(
    formula="ldsal ~ lemp + ldnpt",
    data=df_balanced,
).fit()
print("OLS (Balanced), no dummies:", model_bal_basic.summary(), sep="\n")

In [None]:
# Same pooled specification (ldsal on lemp and ldnpt, no dummies),
# estimated on the unbalanced panel.
model_unbal_basic = smf.ols(
    formula="ldsal ~ lemp + ldnpt",
    data=df_unbalanced,
).fit()
print("\nOLS (Unbalanced), no dummies:", model_unbal_basic.summary(), sep="\n")

In [None]:
# Balanced panel again, now adding categorical dummies for
# year (yr) and industry (sic3) to the regression.
model_bal_dummies = smf.ols(
    formula="ldsal ~ lemp + ldnpt + C(yr)+C(sic3)",
    data=df_balanced,
).fit()
print("\nOLS (Balanced), with dummies:", model_bal_dummies.summary(), sep="\n")

In [None]:
# Unbalanced panel with categorical dummies for year (yr) and industry (sic3).
model_unbal_dummies = smf.ols(
    formula="ldsal ~ lemp + ldnpt + C(yr) + C(sic3)",
    data=df_unbalanced,
).fit()
print(
    "\nOLS (Unbalanced), with year & industry dummies:",
    model_unbal_dummies.summary(),
    sep="\n",
)

Question 3

In [None]:
# Firm and time fixed effects on the unbalanced panel, implemented as an OLS
# with one dummy per firm (C(index)) and one per year (C(yr)).
model_unbal_fe = smf.ols(
    formula="ldsal ~ lemp + ldnpt + C(yr) + C(index)",
    data=df_unbalanced,
).fit()
print(
    "\nPanel OLS (Unbalanced), with firm and time fixed effects:",
    model_unbal_fe.summary(),
    sep="\n",
)
# Last expression of the cell: summary statistics of the estimation sample.
df_unbalanced.describe()

Question 4

In [None]:
###################### Creating survival variable ######################

# 1. Sort by firm and year so that, within a firm, the following row is the
#    firm's next observed year.
df_unbalanced = df_unbalanced.sort_values(["index", "yr"])

# 2. Identify the next year in which the firm appears:
#    shift 'yr' by -1 within each firm. The last row of every firm gets NaN.
df_unbalanced['next_yr'] = df_unbalanced.groupby('index')['yr'].shift(-1)

# 3. The waves are 5 years apart, so a firm "survives" when its next observed
#    year is exactly yr + 5. A NaN next_yr (firm's last row) or a gap of more
#    than one wave compares False and therefore yields 0.
df_unbalanced['survive_next'] = (
    df_unbalanced['next_yr'] == (df_unbalanced['yr'] + 5)
).astype(int)

# survive_next is an integer 0/1 column and can never be missing, so no
# dropna step is needed here (the former dropna on this column was a no-op).
# Rows whose survival cannot be defined are the ones in the final wave.

print(df_unbalanced[['index', 'yr', 'survive_next']].head(20))




In [None]:
# Drop observations for yr==88: survival into the next wave cannot be
# observed for the last wave of the sample.
# .copy() makes df_unbalanced1 an independent frame, so the columns that are
# assigned to it later do not raise SettingWithCopyWarning or depend on
# whether pandas returned a view of df_unbalanced.
df_unbalanced1 = df_unbalanced[df_unbalanced['yr'] != 88].copy()

print(df_unbalanced1[['index', 'yr', 'survive_next']].head(15))

In [None]:
# Probit of next-wave survival on ldnpt and ldinv, with year and
# industry (sic3) dummies.
model_formula = "survive_next ~ ldnpt + ldinv + C(yr) + C(sic3)"
probit_model = smf.probit(formula=model_formula, data=df_unbalanced1).fit()
print(probit_model.summary())

# Store the fitted survival probabilities on the estimation frame and show
# their distribution as the cell output.
df_unbalanced1['predicted_survival'] = probit_model.predict(df_unbalanced1)
df_unbalanced1['predicted_survival'].describe()

Question 5

In [None]:
# Short aliases used in the formulas below. These are copies of the existing
# columns (the originals are kept): k = ldnpt (capital), i = ldinv (investment).
df_unbalanced1["k"] = df_unbalanced1["ldnpt"]
df_unbalanced1["i"] = df_unbalanced1["ldinv"]

# 1) Labor coefficient from an OLS of ldsal on lemp plus a second-order
#    polynomial in (k, i).
naive_ols = smf.ols(
    formula="ldsal ~ lemp + k + i + I(k**2) + I(i**2) + I(k*i)",
    data=df_unbalanced1,
).fit()
beta_l_guess = naive_ols.params["lemp"]
print("Naive OLS labor coefficient:", beta_l_guess)

# 2) Net out the labor contribution: y_tilde = y - beta_l * l
df_unbalanced1["y_tilde"] = df_unbalanced1["ldsal"].sub(
    beta_l_guess * df_unbalanced1["lemp"]
)

# Survival probabilities from the probit fitted earlier
df_unbalanced1['predicted_survival'] = probit_model.predict(df_unbalanced1)

# Regress y_tilde on k and the predicted survival probability
model_stage1 = smf.ols(
    formula="y_tilde ~ k + predicted_survival",
    data=df_unbalanced1,
).fit()
print(model_stage1.summary())









In [None]:

# Lagged survival probability: the value from the firm's previous row.
# NOTE(review): this relies on df_unbalanced1 being ordered by year within
# firm (it inherits the sort applied when survive_next was built) and treats
# the previous row as the previous wave — confirm there are no gaps.
df_unbalanced1["predicted_survival_lag"] = df_unbalanced1.groupby("index")["predicted_survival"].shift(1)

# We'll define the second-stage dependent variable:
#   y_tilde = beta_0 + beta_k * k_t +  E[omega_t|omega_{t-1}] + ...
# We don't necessarily estimate a separate intercept if we let E[omega_t|omega_{t-1}] handle it.

# The first observation of each firm has no lag, so it is dropped.
# Rebind instead of dropna(inplace=True): mutating a derived frame in place
# can raise SettingWithCopyWarning and hides the state change from the reader.
df_unbalanced1 = df_unbalanced1.dropna(subset=["predicted_survival_lag"])

# Second stage: second-order polynomial in (k, i) plus the current and lagged
# survival probabilities, both entering linearly.
model_stage2 = smf.ols(
    formula="y_tilde ~ k + i + I(k**2) + I(i**2) + I(k*i) + I(predicted_survival) + I(predicted_survival_lag)",
    data=df_unbalanced1
).fit()
print(model_stage2.summary())


beta_k_op = model_stage2.params["k"]
print("Olley-Pakes capital coefficient (selection):", beta_k_op)

In [None]:
def _op_decomposition(panel):
    """Add share columns to ``panel`` and return the per-year decomposition terms.

    Two columns are written to ``panel`` in place:

    * ``market_share``: the firm's ``ldsal`` divided by the total ``ldsal``
      of the SAME year, so shares sum to 1 within each year. (Dividing by the
      sum over all years, as was done before, makes the per-year aggregate
      below a fraction of a pooled total rather than a yearly weighted average.)
    * ``weighted_productivity``: ``market_share * ldsal``.

    NOTE(review): ``ldsal`` is used both as the share base and as the
    productivity measure. If it is log sales, shares are normally computed
    from levels (``np.exp(ldsal)``) and productivity from an estimated
    residual — confirm the intended definition.

    Parameters
    ----------
    panel : pandas.DataFrame
        Must contain the columns ``yr``, ``ldsal`` and ``ldnpt``.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Both indexed by ``yr``: the yearly sum of ``weighted_productivity``,
        and the yearly sample covariance between ``ldnpt`` and
        ``weighted_productivity``.
    """
    yearly_total = panel.groupby('yr')['ldsal'].transform('sum')
    panel['market_share'] = panel['ldsal'] / yearly_total
    panel['weighted_productivity'] = panel['market_share'] * panel['ldsal']

    aggregate = panel.groupby('yr')['weighted_productivity'].sum()

    # Select the two columns explicitly so the grouping column is not passed
    # to the applied function.
    # NOTE(review): the result is labelled "cov_capital_market_share" but the
    # operands are ldnpt and weighted_productivity — confirm which pair is meant.
    covariance = panel.groupby('yr')[['ldnpt', 'weighted_productivity']].apply(
        lambda g: np.cov(g['ldnpt'], g['weighted_productivity'])[0, 1]
    )
    return aggregate, covariance


# OP decomposition for all sectors (adds market_share and
# weighted_productivity columns to df_unbalanced)
aggregate_productivity, cov_capital_market_share = _op_decomposition(df_unbalanced)

op_decomposition_all = pd.DataFrame({
    'aggregate_productivity': aggregate_productivity,
    'cov_capital_market_share': cov_capital_market_share
})

print("OP Decomposition for all sectors:")
print(op_decomposition_all)

# Repeat the analysis for sic3 == 357; shares are recomputed within this
# industry on an independent copy of the rows.
df_sic3_357 = df_unbalanced[df_unbalanced['sic3'] == 357].copy()

aggregate_productivity_sic3_357, cov_capital_market_share_sic3_357 = _op_decomposition(df_sic3_357)

# OP decomposition for sic3 357
op_decomposition_sic3_357 = pd.DataFrame({
    'aggregate_productivity': aggregate_productivity_sic3_357,
    'cov_capital_market_share': cov_capital_market_share_sic3_357
})

print("\nOP Decomposition for sic3 357:")
print(op_decomposition_sic3_357)
