# **QUESTION 1.a**

In [None]:
######################################### PART A #########################################
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels import PanelOLS
import numpy as np
import patsy
from numpy.polynomial.polynomial import polyfit

# 1. Read the data
df = pd.read_csv("GMdata.csv",sep='\t')

# 2. Basic summary
print("\nDescriptive statistics:")
print(df.describe())

# 3. Check the time dimension per firm
#    Count how many years each firm has
year_counts = df.groupby('index')['yr'].nunique()

# 4. Identify the balanced panel
#    Suppose we expect 4 unique years (73, 78, 83, 88) for each firm
balanced_firms = year_counts[year_counts == 4].index

# Create separate dataframes
df_balanced = df[df['index'].isin(balanced_firms)].copy()
df_unbalanced = df[~df['index'].isin(balanced_firms)].copy()

print("\nNumber of firms in balanced panel:", len(balanced_firms))
print("Number of firms (total) in unbalanced panel:", df['index'].nunique())

# 5. Summarize balanced and unbalanced panels
print("\nBalanced panel summary:")
print(df_balanced.describe())

print("\nUnbalanced panel summary:")
print(df_unbalanced.describe())



Descriptive statistics:
             index         sic3           yr        ldsal         lemp  \
count  2971.000000  2971.000000  2971.000000  2971.000000  2971.000000   
mean    696.203299   331.455739    80.489061     5.673087     1.259177   
std     404.779371    51.952189     5.351874     1.960717     1.775248   
min       1.000000   200.000000    73.000000    -0.857349    -3.772261   
25%     343.500000   286.000000    78.000000     4.250526    -0.024805   
50%     696.000000   356.000000    78.000000     5.529348     1.114157   
75%    1048.000000   367.000000    83.000000     7.083825     2.631889   
max    1400.000000   399.000000    88.000000    11.698400     6.732211   

             ldnpt        ldrst        ldrnd        ldinv  
count  2971.000000  2971.000000  2971.000000  2971.000000  
mean      4.468996     3.400962     1.787530     2.674828  
std       2.216520     2.028775     2.052410     2.170476  
min      -1.389284    -4.287164    -5.313206    -3.844328  
25%     

# **QUESTION 1.b**

In [35]:
# Balanced panel regression (no dummies)
model_bal_basic = smf.ols("ldsal ~ lemp + ldnpt",
                          data=df_balanced).fit()
print("OLS (Balanced), no dummies:")
print(model_bal_basic.summary())

OLS (Balanced), no dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.913
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     4496.
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:40:05   Log-Likelihood:                -688.18
No. Observations:                 856   AIC:                             1382.
Df Residuals:                     853   BIC:                             1397.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7235   

In [36]:
# Unbalanced panel regression (no dummies)
model_unbal_basic = smf.ols("ldsal ~ lemp + ldnpt",
                            data=df_unbalanced).fit()
print("\nOLS (Unbalanced), no dummies:")
print(model_unbal_basic.summary())


OLS (Unbalanced), no dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     9519.
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:40:34   Log-Likelihood:                -1780.4
No. Observations:                2115   AIC:                             3567.
Df Residuals:                    2112   BIC:                             3584.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.2322

In [41]:
#Example with balanced panel and dummies
model_bal_dummies = smf.ols("ldsal ~ lemp + ldnpt + C(yr)+C(sic3)",
                            data=df_balanced).fit()
print("\nOLS (Balanced), with dummies:")
print(model_bal_dummies.summary())


OLS (Balanced), with dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.961
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                     323.0
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:44:12   Log-Likelihood:                -343.66
No. Observations:                 856   AIC:                             811.3
Df Residuals:                     794   BIC:                             1106.
Df Model:                          61                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept    

In [45]:
# Example with unbalanced panel and dummies:
model_unbal_dummies = smf.ols("ldsal ~ lemp + ldnpt + C(yr) + C(sic3)",
                              data=df_unbalanced).fit()
print("\nOLS (Unbalanced), with year & industry dummies:")
print(model_unbal_dummies.summary())


OLS (Unbalanced), with year & industry dummies:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     240.7
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        02:08:54   Log-Likelihood:                -1293.5
No. Observations:                2115   AIC:                             2835.
Df Residuals:                    1991   BIC:                             3537.
Df Model:                         123                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

# **QUESTION 1.c**

In [50]:
#Estimate the model with firm and time fixed effects (Unbalanced)
model_unbal_fe = smf.ols("ldsal ~ lemp + ldnpt + C(yr) + C(index)",
                                       data=df_unbalanced).fit()
print("\nPanel OLS (Unbalanced), with firm and time fixed effects:")
print(model_unbal_fe.summary())


Panel OLS (Unbalanced), with firm and time fixed effects:
                            OLS Regression Results                            
Dep. Variable:                  ldsal   R-squared:                       0.986
Model:                            OLS   Adj. R-squared:                  0.968
Method:                 Least Squares   F-statistic:                     54.81
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        02:10:21   Log-Likelihood:                 299.68
No. Observations:                2115   AIC:                             1783.
Df Residuals:                     924   BIC:                             8520.
Df Model:                        1190                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [52]:
df_unbalanced.describe()

Unnamed: 0,index,sic3,yr,ldsal,lemp,ldnpt,ldrst,ldrnd,ldinv
count,2115.0,2115.0,2115.0,2115.0,2115.0,2115.0,2115.0,2115.0,2115.0
mean,698.167376,332.691726,80.484634,5.170738,0.792253,3.883376,2.800018,1.207135,2.11057
std,402.843584,53.362947,5.252303,1.777286,1.614334,1.999867,1.738447,1.772027,1.950589
min,1.0,200.0,73.0,-0.857349,-3.772261,-1.389284,-4.287164,-5.313206,-3.844328
25%,345.0,289.0,78.0,3.940751,-0.379067,2.562408,1.554264,-0.054967,0.795217
50%,708.0,356.0,78.0,5.077045,0.672944,3.692343,2.733376,1.177518,2.01307
75%,1054.5,367.0,83.0,6.348836,1.94591,5.194506,3.916602,2.401783,3.414303
max,1400.0,399.0,88.0,11.30435,5.673323,11.06041,8.217877,6.946921,8.988533


# **QUESTION 1.d**

In [55]:
###################### Creating survival variable ######################

# 1. Sort by firm and year
df_unbalanced = df_unbalanced.sort_values(["index","yr"])

# 2. Identify the next year in which the firm appears
#    groupby("index") and shift() the 'yr' column by -1 to get the next row's year
df_unbalanced['next_yr'] = df_unbalanced.groupby('index')['yr'].shift(-1)

# Because your years jump in increments of 5, define survive_next=1 
# if next_yr == yr + 5 (i.e., the next wave). 0 otherwise.
df_unbalanced['survive_next'] = (
    df_unbalanced['next_yr'] == (df_unbalanced['yr'] + 5)
).astype(int)

# For the last observation of each firm (or if a firm does not appear in the next wave),
# survive_next will be 0. If there's no subsequent row for that firm, next_yr is NaN.

# 3. Drop rows with missing survive_next if you prefer to keep only firm-years
#    that can define survival. (Optional)
df_unbalanced = df_unbalanced.dropna(subset=['survive_next'])

print(df_unbalanced[['index','yr','survive_next']].head(15))


    index  yr  survive_next
0       1  73             1
1       1  78             1
2       1  83             0
3       2  78             1
4       2  83             1
5       2  88             0
6       3  78             1
7       3  83             0
8       4  73             1
9       4  78             1
10      4  83             0
11      5  73             0
12      6  88             0
13      7  73             1
14      7  78             1


In [75]:
# Probit depends on your chosen regressors. Suppose you want:
#   survive_next = f(ldinv, ldpt, year dummies, industry dummies)
# Using formula API with statsmodels:

model_formula = "survive_next ~ ldnpt + ldinv + C(yr) + C(sic3)"
probit_model = smf.probit(model_formula, data=df_unbalanced).fit()
print(probit_model.summary())

df_unbalanced['predicted_survival'] = probit_model.predict(df_unbalanced)
df_unbalanced['predicted_survival'].describe()

         Current function value: 0.492959
         Iterations: 35
                          Probit Regression Results                           
Dep. Variable:           survive_next   No. Observations:                 2115
Model:                         Probit   Df Residuals:                     1991
Method:                           MLE   Df Model:                          123
Date:                Thu, 27 Feb 2025   Pseudo R-squ.:                  0.2703
Time:                        02:31:46   Log-Likelihood:                -1042.6
converged:                      False   LL-Null:                       -1428.9
Covariance Type:            nonrobust   LLR p-value:                 1.010e-94
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.4917      0.350      1.406      0.160      -0.194       1.177
C(yr)[T.78]       -0.3951      0.084     -4.710      



count    2.115000e+03
mean     4.064520e-01
std      2.608344e-01
min      6.287224e-95
25%      2.264670e-01
50%      4.692472e-01
75%      6.044180e-01
max      1.000000e+00
Name: predicted_survival, dtype: float64

# **QUESTION 1.e**