# **Homework 2: Determinants of Capital Structure**

# Applied Corporate Finance – FINA60223A.H2026

# Prof. Jakub Hajda

# Team: Nguyen-Bao Michael Hoang, Philippe Thériault et Nguyen Quoc-Long Tran

# **Importation of packages**

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
from collections import defaultdict

# **Data Importation** 

In [78]:
#Load the dataset
data = pd.read_csv("Data preparation.csv")

#Display the first rows
data.head()

# Dataset dimensions
data.shape
print(f"Total number of observations: {data.shape[0]}")

# List of columns
data.columns

Total number of observations: 619527


Index(['costat', 'curcd', 'datafmt', 'indfmt', 'consol', 'tic', 'datadate',
       'gvkey', 'sic', 'fyear', 'at', 'dlc', 'dltt', 'intan', 'ppent', 'pstkl',
       'txditc', 'dvc', 'oibdp', 'csho', 'prcc_f'],
      dtype='object')

#### There is 14 variable that we taked from compustat dated from january 1965 to february 2026.

# **1. Exploratory data analysis**

## 1.1. Duplicates, U.S. firms, and sample size

### **Interpretation:**

## 1.2. Data Cleaning and Filters

## 1.3. Cash Flow Volatility

### **Interpretation:**

## 1.4. Summary Statistics: Table I

### *1.4.1. All Firms*

### *1.4.2. Subsample: 1965–2003*

### *1.4.3. Subsample: 2004–Most Recent*

### **Interpretation:**

## 1.5. Persistence and Spurious Correlation (Figure 1) and Lag Structure

### **Interpretation:**

# **2. Leverage Models**

## 2.1 Replication of Table II – Panel A

In [79]:


#  DATA FILTERING 

#  only US firms (curcd = USD) and active companies
data = data[data['curcd'] == 'USD'].copy()

# Remove financial firms (SIC codes 6000-6999) and utilities (4900-4999)
data = data[~((data['sic'] >= 6000) & (data['sic'] <= 6999))].copy()
data = data[~((data['sic'] >= 4900) & (data['sic'] <= 4999))].copy()

# Remove observations with missing key variables
data = data.dropna(subset=['at', 'dlc', 'dltt', 'fyear', 'gvkey', 'sic'])

# Remove observations with non-positive total assets
data = data[data['at'] > 0].copy()


# STEP 2: VARIABLE CONSTRUCTION

# Total Debt = short-term debt (dlc) + long-term debt (dltt)
data['total_debt'] = data['dlc'].fillna(0) + data['dltt'].fillna(0)

# Book Leverage = total debt / total assets
data['book_leverage'] = data['total_debt'] / data['at']

# Market Equity = stock price * shares outstanding
data['market_equity'] = data['prcc_f'] * data['csho']

# Market Leverage = total debt / (total debt + market equity)
data['market_leverage'] = data['total_debt'] / (data['total_debt'] + data['market_equity'])
data.loc[data['market_equity'].isna() | (data['market_equity'] <= 0), 'market_leverage'] = np.nan

# Firm Size = log(book assets) - using total assets as proxy for size
data['log_assets'] = np.log(data['at'])

# Profitability = operating income before depreciation / book assets
data['profitability'] = data['oibdp'] / data['at']

# Tangibility = net PPE / book assets
data['tangibility'] = data['ppent'] / data['at']

# Market-to-Book = (market equity + total debt + preferred stock liquidating value - deferred taxes) / book assets
data['market_to_book'] = (data['market_equity'] + data['total_debt'] +   data['pstkl'].fillna(0) - data['txditc'].fillna(0)) / data['at']

# Dividend Payer dummy (1 if firm pays dividends, 0 otherwise)
data['dividend_payer'] = (data['dvc'].fillna(0) > 0).astype(int)

# Intangible assets ratio
data['intangible_assets'] = data['intan'].fillna(0) / data['at']

# Industry code (4-digit SIC)
data['sic4'] = data['sic'].astype(int)

# Calculate Industry Median Book Leverage
data['ind_med_book_lev'] = data.groupby(['fyear', 'sic4'])['book_leverage'].transform('median')


# STEP 3: TRIMMING OUTLIERS 


# Trim extreme values (1st and 99th percentile)
def trim_variable(df, var_name, lower=0.01, upper=0.99):
    q_low = df[var_name].quantile(lower)
    q_high = df[var_name].quantile(upper)
    df = df[(df[var_name] >= q_low) & (df[var_name] <= q_high)]
    return df

# Apply trimming to key variables
for var in ['book_leverage', 'market_leverage', 'profitability', 'tangibility', 'market_to_book', 'log_assets']:
    if var in data.columns:
        data = trim_variable(data, var)

# Keep leverage between 0 and 1
data = data[(data['book_leverage'] >= 0) & (data['book_leverage'] <= 1)]
# Fixed: Proper parentheses for market leverage filter
data = data[((data['market_leverage'] >= 0) & (data['market_leverage'] <= 1)) | data['market_leverage'].isna()]


#  CREATE LAG VARIABLES (t-1 values for explanatory variables)


# Sort data
data = data.sort_values(['gvkey', 'fyear'])

# Create lagged variables
lag_vars = ['log_assets', 'market_to_book', 'profitability', 'tangibility', 
            'dividend_payer', 'ind_med_book_lev', 'intangible_assets']

for var in lag_vars:
    data[f'{var}_lag'] = data.groupby('gvkey')[var].shift(1)

# Drop observations with missing lagged values
data = data.dropna(subset=[f'{var}_lag' for var in lag_vars])


# INITIAL LEVERAGE CALCULATION

# For each firm, identify the first year in the sample
first_year = data.groupby('gvkey')['fyear'].transform('min')
data['initial_leverage'] = np.where(data['fyear'] == first_year, data['book_leverage'], np.nan)
data['initial_leverage'] = data.groupby('gvkey')['initial_leverage'].transform('first')


# CREATE INDUSTRY-YEAR INTERACTION


data['ind_year'] = data['sic4'].astype(str) + '_' + data['fyear'].astype(str)

# Set panel index
data['firm_id'] = pd.Categorical(data['gvkey']).codes
data['year_id'] = pd.Categorical(data['fyear']).codes

print(f"Final sample size: {len(data):,} observations")
print(f"Number of unique firms: {data['gvkey'].nunique():,}")
print(f"Year range: {data['fyear'].min()} - {data['fyear'].max()}")
print()


# TABLE II PANEL A: Effect of Initial Leverage on Future Leverage

# Compute standardized coefficients (scaled by standard deviation)

# Variables for regression (similar to Table II)
X_vars = ['log_assets_lag', 'market_to_book_lag', 'profitability_lag', 
          'tangibility_lag', 'ind_med_book_lev_lag', 'dividend_payer_lag', 'initial_leverage']

# Store results
table_2_results = {}

for lev_type in ['book_leverage', 'market_leverage']:
    # Prepare regression data
    reg_data = data[['gvkey', 'fyear', lev_type] + X_vars].dropna()
    
    # Calculate standard deviations for standardization
    std_dict = {}
    for var in X_vars + [lev_type]:
        std_dict[var] = reg_data[var].std()
    
    # Standardize variables
    reg_data_std = reg_data.copy()
    for var in X_vars + [lev_type]:
        reg_data_std[var] = reg_data[var] / std_dict[var]
    
    # Run OLS with standardized variables
    X = sm.add_constant(reg_data_std[X_vars])
    y = reg_data_std[lev_type]
    
    model = sm.OLS(y, X).fit()
    
    # Store results
    table_2_results[lev_type] = {
        'params': model.params[1:],  # Exclude constant
        'std_err': model.bse[1:],
        't_stat': model.tvalues[1:],
        'r_squared': model.rsquared,
        'n_obs': model.nobs
    }

# Create Table II Panel A

print("TABLE II - PANEL A: Effect of Initial Leverage on Future Leverage")
print("(Coefficients scaled by standard deviation)")
print()

# Create formatted table
var_names_display = {
    'log_assets_lag': 'Log(Assets)',
    'market_to_book_lag': 'Market-to-Book',
    'profitability_lag': 'Profitability',
    'tangibility_lag': 'Tangibility',
    'ind_med_book_lev_lag': 'Industry Median Leverage',
    'dividend_payer_lag': 'Dividend Payer',
    'initial_leverage': 'Initial Leverage'
}

print(f"{'Variable':<30} {'Book Leverage':>20} {'Market Leverage':>20}")
print("-" * 70)

for var in X_vars:
    book_coef = table_2_results['book_leverage']['params'][var]
    book_tstat = table_2_results['book_leverage']['t_stat'][var]
    mkt_coef = table_2_results['market_leverage']['params'][var]
    mkt_tstat = table_2_results['market_leverage']['t_stat'][var]
    
    display_name = var_names_display.get(var, var)
    print(f"{display_name:<30} {book_coef:>10.4f} ({book_tstat:>6.2f})   {mkt_coef:>10.4f} ({mkt_tstat:>6.2f})")

print("-" * 70)
print(f"{'R-squared':<30} {table_2_results['book_leverage']['r_squared']:>20.4f} {table_2_results['market_leverage']['r_squared']:>20.4f}")
print(f"{'N observations':<30} {int(table_2_results['book_leverage']['n_obs']):>20,} {int(table_2_results['market_leverage']['n_obs']):>20,}")
print()
print("Note: t-statistics in parentheses")


Final sample size: 219,679 observations
Number of unique firms: 20,425
Year range: 1965 - 2025

TABLE II - PANEL A: Effect of Initial Leverage on Future Leverage
(Coefficients scaled by standard deviation)

Variable                              Book Leverage      Market Leverage
----------------------------------------------------------------------
Log(Assets)                        0.1225 ( 64.00)       0.0577 ( 30.24)
Market-to-Book                    -0.0476 (-25.64)      -0.2732 (-147.52)
Profitability                     -0.0712 (-36.55)      -0.0539 (-27.76)
Tangibility                        0.1198 ( 63.35)       0.1109 ( 58.87)
Industry Median Leverage           0.1791 ( 94.51)       0.1741 ( 92.15)
Dividend Payer                    -0.0925 (-47.83)      -0.0729 (-37.82)
Initial Leverage                   0.4347 (235.19)       0.3476 (188.66)
----------------------------------------------------------------------
R-squared                                    0.3455               

## 2.2 Baseline Leverage Regressions
### $$\text{Leverage}_{it} = \alpha + \beta X_{i,t-1} + \varepsilon_{it}$$

### *2.2.1. (a) Pooled OLS*

In [80]:


# Variables for regression (
X_vars_baseline = ['log_assets_lag', 'market_to_book_lag', 'profitability_lag', 
                   'tangibility_lag', 'ind_med_book_lev_lag', 'dividend_payer_lag']

# Store all regression results for comparison
all_results = defaultdict(dict)

for lev_type in ['book_leverage', 'market_leverage']:
    # Prepare regression data
    reg_data = data[['gvkey', 'fyear', 'firm_id', lev_type] + X_vars_baseline].dropna()
    
    # Pooled OLS
    X = sm.add_constant(reg_data[X_vars_baseline])
    y = reg_data[lev_type]
    
    model_pooled = sm.OLS(y, X).fit()
    
    # Store results
    all_results[lev_type]['pooled_ols'] = {
        'model': model_pooled,
        'params': model_pooled.params,
        'std_err': model_pooled.bse,
        't_stat': model_pooled.tvalues,
        'r_squared': model_pooled.rsquared,
        'r_squared_adj': model_pooled.rsquared_adj,
        'n_obs': model_pooled.nobs
    }

# Display results for Pooled OLS


for lev_type in ['book_leverage', 'market_leverage']:
    lev_name = 'Book Leverage' if lev_type == 'book_leverage' else 'Market Leverage'
    results = all_results[lev_type]['pooled_ols']
    
    print(f"\nDependent Variable: {lev_name}")
    print("-" * 60)
    print(f"{'Variable':<30} {'Coefficient':>15} {'t-stat':>12}")
    print("-" * 60)
    
    for var in ['const'] + X_vars_baseline:
        display_name = var_names_display.get(var, var) if var != 'const' else 'Constant'
        coef = results['params'][var]
        tstat = results['t_stat'][var]
        print(f"{display_name:<30} {coef:>15.6f} {tstat:>12.2f}")
    
    print("-" * 60)
    print(f"R-squared: {results['r_squared']:.4f}")
    print(f"Adj. R-squared: {results['r_squared_adj']:.4f}")
    print(f"N observations: {int(results['n_obs']):,}")
    print()



Dependent Variable: Book Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Constant                              0.085527        68.69
Log(Assets)                           0.012439        64.97
Market-to-Book                       -0.008981       -39.39
Profitability                        -0.046558       -29.26
Tangibility                           0.173261        90.90
Industry Median Leverage              0.367367       137.89
Dividend Payer                       -0.051091       -55.96
------------------------------------------------------------
R-squared: 0.1807
Adj. R-squared: 0.1806
N observations: 219,679


Dependent Variable: Market Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Constant         

### *2.2.2. (b) Firm Fixed Effects*


In [81]:


for lev_type in ['book_leverage', 'market_leverage']:
    # Prepare regression data
    reg_data = data[['gvkey', 'fyear', 'firm_id', lev_type] + X_vars_baseline].dropna().copy()
    
    # Create year dummies with dtype=int to ensures consistent types
    
    #  dtype=int ensures consistent types.
    year_dummies = pd.get_dummies(reg_data['fyear'].astype(int), prefix='year', drop_first=True, dtype=int)
    
    # Combine X variables with year dummies - reset indices to ensure alignment
    X = pd.concat([reg_data[X_vars_baseline].reset_index(drop=True),  year_dummies.reset_index(drop=True)], axis=1)
    X = sm.add_constant(X)
    y = reg_data[lev_type].reset_index(drop=True)
    
    model_year_fe = sm.OLS(y, X).fit()
    
    # Store results
    all_results[lev_type]['year_fe'] = {
        'model': model_year_fe,
        'params': model_year_fe.params,
        'std_err': model_year_fe.bse,
        't_stat': model_year_fe.tvalues,
        'r_squared': model_year_fe.rsquared,
        'r_squared_adj': model_year_fe.rsquared_adj,
        'n_obs': model_year_fe.nobs
    }

# Display Year FE results

for lev_type in ['book_leverage', 'market_leverage']:
    lev_name = 'Book Leverage' if lev_type == 'book_leverage' else 'Market Leverage'
    results = all_results[lev_type]['year_fe']
    
    print(f"\nDependent Variable: {lev_name}")
    print("-" * 60)
    print(f"{'Variable':<30} {'Coefficient':>15} {'t-stat':>12}")
    print("-" * 60)
    
    for var in X_vars_baseline:
        display_name = var_names_display.get(var, var)
        coef = results['params'][var]
        tstat = results['t_stat'][var]
        print(f"{display_name:<30} {coef:>15.6f} {tstat:>12.2f}")
    
    print("-" * 60)
    print(f"Year Fixed Effects: Yes")
    print(f"R-squared: {results['r_squared']:.4f}")
    print(f"Adj. R-squared: {results['r_squared_adj']:.4f}")
    print(f"N observations: {int(results['n_obs']):,}")
    print()



Dependent Variable: Book Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Log(Assets)                           0.018324        79.81
Market-to-Book                       -0.008377       -36.42
Profitability                        -0.065376       -39.61
Tangibility                           0.160600        83.64
Industry Median Leverage              0.349229       130.34
Dividend Payer                       -0.069282       -70.59
------------------------------------------------------------
Year Fixed Effects: Yes
R-squared: 0.1954
Adj. R-squared: 0.1951
N observations: 219,679


Dependent Variable: Market Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Log(Assets)                           0.020514       

### *2.2.3. (c) Firm and Year Fixed Effects*

In [82]:

for lev_type in ['book_leverage', 'market_leverage']:
    
    # Prepare panel data
    reg_data = data[['gvkey', 'fyear', lev_type] + X_vars_baseline].dropna().copy()
    
    #  multi-index for panel data
    reg_data = reg_data.set_index(['gvkey', 'fyear'])
    
    # Prepare dependent and independent variables
    y = reg_data[lev_type]
    X = reg_data[X_vars_baseline]
    
    # Estimate panel model with firm and time effects (two-way fixed effects)
    try:
        model_firm_year_fe = PanelOLS(y, X, entity_effects=True, time_effects=True).fit()
        
        # Store results
        all_results[lev_type]['firm_year_fe'] = {
            'model': model_firm_year_fe,
            'params': model_firm_year_fe.params,
            'std_err': model_firm_year_fe.std_errors,
            't_stat': model_firm_year_fe.tstats,
            'r_squared': model_firm_year_fe.rsquared,
            'r_squared_within': model_firm_year_fe.rsquared_within,
            'n_obs': model_firm_year_fe.nobs
        }
    except Exception as e:
        print(f"Error fitting firm+year FE for {lev_type}: {e}")
        continue

# Display Firm + Year FE results

for lev_type in ['book_leverage', 'market_leverage']:
    if 'firm_year_fe' not in all_results[lev_type]:
        continue
    
    lev_name = 'Book Leverage' if lev_type == 'book_leverage' else 'Market Leverage'
    results = all_results[lev_type]['firm_year_fe']
    
    print(f"\nDependent Variable: {lev_name}")
    print("-" * 60)
    print(f"{'Variable':<30} {'Coefficient':>15} {'t-stat':>12}")
    print("-" * 60)
    
    for var in X_vars_baseline:
        display_name = var_names_display.get(var, var)
        coef = results['params'][var]
        tstat = results['t_stat'][var]
        print(f"{display_name:<30} {coef:>15.6f} {tstat:>12.2f}")
    
    print("-" * 60)
    print(f"Firm Fixed Effects: Yes")
    print(f"Year Fixed Effects: Yes")
    print(f"R-squared (within): {results['r_squared_within']:.4f}")
    print(f"R-squared (overall): {results['r_squared']:.4f}")
    print(f"N observations: {int(results['n_obs']):,}")
    print()


Dependent Variable: Book Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Log(Assets)                           0.027922        59.85
Market-to-Book                       -0.003900       -17.50
Profitability                        -0.080205       -45.15
Tangibility                           0.174886        55.30
Industry Median Leverage              0.121675        43.33
Dividend Payer                       -0.031407       -28.26
------------------------------------------------------------
Firm Fixed Effects: Yes
Year Fixed Effects: Yes
R-squared (within): 0.0588
R-squared (overall): 0.0554
N observations: 219,679


Dependent Variable: Market Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Log(Assets)    

### *2.2.4. (d) Firm and Industry × Year Fixed Effects*

In [83]:


# Helper function to safely get parameter values
def get_param_value(params, var_name):
    
    if params is None:
        return np.nan
    if isinstance(params, dict):
        return params.get(var_name, np.nan)
    if hasattr(params, 'get'):
        return params.get(var_name, np.nan)
    try:
        return params[var_name]
    except (KeyError, IndexError, TypeError):
        return np.nan

for lev_type in ['book_leverage', 'market_leverage']:
    # Prepare panel data - keep ind_year before setting index
    reg_data = data[['gvkey', 'fyear', 'ind_year', lev_type] + X_vars_baseline].dropna().copy()
    
    # Store ind_year before setting panel index
    ind_year_series = reg_data['ind_year'].copy()
    
    # Set multi-index for panel data
    reg_data = reg_data.set_index(['gvkey', 'fyear'])
    
    # Prepare dependent and independent variables
    y = reg_data[lev_type]
    X = reg_data[X_vars_baseline]
    
    # Align ind_year with panel index
    ind_year_aligned = pd.Series(ind_year_series.values, index=reg_data.index)
    
    
        # Firm FE with Industry×Year effects as other effects
    model_firm_indyear_fe = PanelOLS(y, X, entity_effects=True, time_effects=False, other_effects=ind_year_aligned).fit(low_memory=True, use_lsmr=True)
    
    all_results[lev_type]['firm_indyear_fe'] = {
        'model': model_firm_indyear_fe,
        'params': model_firm_indyear_fe.params,
        'std_err': model_firm_indyear_fe.std_errors,
        't_stat': model_firm_indyear_fe.tstats,
        'r_squared': model_firm_indyear_fe.rsquared,
        'r_squared_within': model_firm_indyear_fe.rsquared_within,
        'n_obs': model_firm_indyear_fe.nobs
    }

# Display Firm + Industry×Year FE results


for lev_type in ['book_leverage', 'market_leverage']:
    if 'firm_indyear_fe' not in all_results[lev_type] or all_results[lev_type]['firm_indyear_fe'] is None:
        continue
    
    lev_name = 'Book Leverage' if lev_type == 'book_leverage' else 'Market Leverage'
    results = all_results[lev_type]['firm_indyear_fe']
    
    print(f"\nDependent Variable: {lev_name}")
    print("-" * 60)
    print(f"{'Variable':<30} {'Coefficient':>15} {'t-stat':>12}")
    print("-" * 60)
    
    for var in X_vars_baseline:
        display_name = var_names_display.get(var, var)
        coef = get_param_value(results['params'], var)
        tstat = get_param_value(results['t_stat'], var)
        print(f"{display_name:<30} {coef:>15.6f} {tstat:>12.2f}")
    
    print("-" * 60)
    print(f"Firm Fixed Effects: Yes")
    print(f"Industry×Year Fixed Effects: Yes")
    print(f"R-squared (within): {results['r_squared_within']:.4f}")
    print(f"R-squared (overall): {results['r_squared']:.4f}")
    print(f"N observations: {int(results['n_obs']):,}")
    print()




Dependent Variable: Book Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Log(Assets)                           0.028443        55.22
Market-to-Book                       -0.003349       -14.27
Profitability                        -0.076694       -41.74
Tangibility                           0.173980        50.76
Industry Median Leverage             -0.058368        -2.91
Dividend Payer                       -0.031021       -25.97
------------------------------------------------------------
Firm Fixed Effects: Yes
Industry×Year Fixed Effects: Yes
R-squared (within): 0.0336
R-squared (overall): 0.0422
N observations: 219,679


Dependent Variable: Market Leverage
------------------------------------------------------------
Variable                           Coefficient       t-stat
------------------------------------------------------------
Log(As

### **Interpretation:**

## 2.3 Economic Interpretation of the Results

In [84]:

# 2.3 Economic Interpretation of the Results


# Calculate economic magnitude using summary statistics
summary_stats = data[X_vars_baseline + ['book_leverage', 'market_leverage']].describe()


print("SUMMARY STATISTICS FOR ECONOMIC INTERPRETATION")

print(summary_stats.T[['mean', 'std', '25%', '50%', '75%']].round(4))
print()

# Economic significance: What is the effect of a one-standard deviation change?

print("ECONOMIC SIGNIFICANCE: Effect of 1-SD Change in Each Variable")


for lev_type in ['book_leverage', 'market_leverage']:
    lev_name = 'Book Leverage' if lev_type == 'book_leverage' else 'Market Leverage'
    print(f"\n{lev_name}:")
    print("-" * 60)
    print(f"{'Variable':<30} {'SD':>10} {'Coef (Pooled)':>15} {'Effect':>12}")
    print("-" * 60)
    
    for var in X_vars_baseline:
        display_name = var_names_display.get(var, var)
        sd = summary_stats.loc[var, 'std'] if var in summary_stats.index else data[var].std()
        coef = get_param_value(all_results[lev_type]['pooled_ols']['params'], var)
        effect = sd * coef
        print(f"{display_name:<30} {sd:>10.4f} {coef:>15.6f} {effect:>12.4f}")
    
    mean_lev = data[lev_type].mean()
    print(f"\nMean {lev_name}: {mean_lev:.4f}")
    print()





SUMMARY STATISTICS FOR ECONOMIC INTERPRETATION
                        mean     std     25%     50%     75%
log_assets_lag        4.9316  2.2893  3.2129  4.7206  6.5259
market_to_book_lag    1.7297  1.8615  0.7476  1.1177  1.9189
profitability_lag     0.0409  0.2804  0.0231  0.1089  0.1700
tangibility_lag       0.2810  0.2244  0.0986  0.2253  0.4101
ind_med_book_lev_lag  0.2200  0.1584  0.1241  0.2166  0.2943
dividend_payer_lag    0.3761  0.4844  0.0000  0.0000  1.0000
book_leverage         0.2397  0.2048  0.0562  0.2136  0.3665
market_leverage       0.2499  0.2416  0.0344  0.1826  0.4042

ECONOMIC SIGNIFICANCE: Effect of 1-SD Change in Each Variable

Book Leverage:
------------------------------------------------------------
Variable                               SD   Coef (Pooled)       Effect
------------------------------------------------------------
Log(Assets)                        2.2893        0.012439       0.0285
Market-to-Book                     1.8615       -0.008981    

### **Interpretation:**

## 2.4 Firm vs. Industry Fixed Effects

In [85]:


#  model with Industry FE (instead of Firm FE)
for lev_type in ['book_leverage', 'market_leverage']:
    # Prepare regression data
    reg_data = data[['gvkey', 'fyear', 'sic4', lev_type] + X_vars_baseline].dropna().copy()
    
    # Create industry dummies (convert to string for clarity, then to int dtype)
    industry_dummies = pd.get_dummies(reg_data['sic4'].astype(str), prefix='ind', drop_first=True, dtype=int)
    
    # Create year dummies 
    year_dummies = pd.get_dummies(reg_data['fyear'].astype(int), prefix='year', drop_first=True, dtype=int)
    
    # Combine X variables with industry and year dummies 
    X = pd.concat([reg_data[X_vars_baseline].reset_index(drop=True), industry_dummies.reset_index(drop=True), year_dummies.reset_index(drop=True)], axis=1)
    X = sm.add_constant(X)
    y = reg_data[lev_type].reset_index(drop=True)
    
    model_ind_year_fe = sm.OLS(y, X).fit()
    
    all_results[lev_type]['industry_year_fe'] = {
        'model': model_ind_year_fe,
        'params': model_ind_year_fe.params,
        'std_err': model_ind_year_fe.bse,
        't_stat': model_ind_year_fe.tvalues,
        'r_squared': model_ind_year_fe.rsquared,
        'r_squared_adj': model_ind_year_fe.rsquared_adj,
        'n_obs': model_ind_year_fe.nobs
    }

# Comparison Table: Industry FE vs Firm FE

print("COMPARISON: INDUSTRY FIXED EFFECTS vs FIRM FIXED EFFECTS")


for lev_type in ['book_leverage', 'market_leverage']:
    lev_name = 'Book Leverage' if lev_type == 'book_leverage' else 'Market Leverage'
    print(f"\n{lev_name}:")
    print("-" * 80)
    print(f"{'Variable':<25} {'Industry+Year FE':>20} {'Firm+Year FE':>20} {'Difference':>15}")
    print("-" * 80)
    
    for var in X_vars_baseline:
        display_name = var_names_display.get(var, var)
        ind_fe = get_param_value(all_results[lev_type]['industry_year_fe']['params'], var)
        firm_fe = get_param_value(all_results[lev_type].get('firm_year_fe', {}).get('params'), var)
        
        diff = firm_fe - ind_fe if not (np.isnan(firm_fe) or np.isnan(ind_fe)) else np.nan
        print(f"{display_name:<25} {ind_fe:>20.6f} {firm_fe:>20.6f} {diff:>15.6f}")
    
    print("-" * 80)
    ind_r2 = all_results[lev_type]['industry_year_fe']['r_squared']
    firm_r2 = all_results[lev_type].get('firm_year_fe', {}).get('r_squared', np.nan)
    print(f"{'R-squared':<25} {ind_r2:>20.4f} {firm_r2:>20.4f}")
    print()





COMPARISON: INDUSTRY FIXED EFFECTS vs FIRM FIXED EFFECTS

Book Leverage:
--------------------------------------------------------------------------------
Variable                      Industry+Year FE         Firm+Year FE      Difference
--------------------------------------------------------------------------------
Log(Assets)                           0.018642             0.027922        0.009280
Market-to-Book                       -0.006984            -0.003900        0.003083
Profitability                        -0.076759            -0.080205       -0.003446
Tangibility                           0.191684             0.174886       -0.016797
Industry Median Leverage              0.194774             0.121675       -0.073098
Dividend Payer                       -0.074218            -0.031407        0.042811
--------------------------------------------------------------------------------
R-squared                               0.2296               0.0554


Market Leverage:
---------

### **Interpretation:**

## 2.5 Clustering of Standard Errors

In [86]:

# Book Leverage 
lev_type = 'book_leverage'
reg_data = data[['gvkey', 'fyear', 'sic4', lev_type] + X_vars_baseline].dropna().copy()

# Prepare panel data
reg_data = reg_data.set_index(['gvkey', 'fyear'])
y = reg_data[lev_type]
X = reg_data[X_vars_baseline]

# Model Firm-clustered standard errors
model_firm_cluster = PanelOLS(y, X, entity_effects=True, time_effects=True).fit(cov_type='clustered', cluster_entity=True)

# Model Non-clustered for comparison
model_no_cluster = PanelOLS(y, X, entity_effects=True, time_effects=True).fit()

# Store clustering results
clustering_results = {
    'no_cluster': {
        'params': model_no_cluster.params,
        'std_err': model_no_cluster.std_errors,
        't_stat': model_no_cluster.tstats
    },
    'firm_cluster': {
        'params': model_firm_cluster.params,
        'std_err': model_firm_cluster.std_errors,
        't_stat': model_firm_cluster.tstats
    }
}

# Display results
print("STANDARD ERROR CLUSTERING COMPARISON (Book Leverage with Firm+Year FE)")
print()
print(f"{'Variable':<25} {'No Clustering':>20} {'Firm Cluster':>20}")
print(f"{'':<25} {'Coef (t-stat)':>20} {'Coef (t-stat)':>20}")
print("-" * 100)

for var in X_vars_baseline:
    display_name = var_names_display.get(var, var)
    
    coef_nc = clustering_results['no_cluster']['params'][var]
    tstat_nc = clustering_results['no_cluster']['t_stat'][var]
    coef_fc = clustering_results['firm_cluster']['params'][var]
    tstat_fc = clustering_results['firm_cluster']['t_stat'][var]

    print(f"{display_name:<25} {coef_nc:>9.4f} ({tstat_nc:>6.2f})    {coef_fc:>9.4f} ({tstat_fc:>6.2f})")

print("-" * 100)

# Significance changes
print("SIGNIFICANCE ANALYSIS (p < 0.05)")
print()

for var in X_vars_baseline:
    display_name = var_names_display.get(var, var)
    
    tstat_nc = clustering_results['no_cluster']['t_stat'][var]
    tstat_fc = clustering_results['firm_cluster']['t_stat'][var]
    sig_nc = "***" if abs(tstat_nc) > 2.576 else "**" if abs(tstat_nc) > 1.96 else "*" if abs(tstat_nc) > 1.645 else ""
    sig_fc = "***" if abs(tstat_fc) > 2.576 else "**" if abs(tstat_fc) > 1.96 else "*" if abs(tstat_fc) > 1.645 else ""
    
    change = "SAME" if sig_nc == sig_fc else f"CHANGED ({sig_nc} -> {sig_fc})"
    
    print(f"{display_name:<30}: No cluster: {sig_nc:<5} | Firm cluster: {sig_fc:<5} | {change}")

print("\n" + "=" * 80)


STANDARD ERROR CLUSTERING COMPARISON (Book Leverage with Firm+Year FE)

Variable                         No Clustering         Firm Cluster
                                 Coef (t-stat)        Coef (t-stat)
----------------------------------------------------------------------------------------------------
Log(Assets)                  0.0279 ( 59.85)       0.0279 ( 20.00)
Market-to-Book              -0.0039 (-17.50)      -0.0039 ( -8.90)
Profitability               -0.0802 (-45.15)      -0.0802 (-21.28)
Tangibility                  0.1749 ( 55.30)       0.1749 ( 19.41)
Industry Median Leverage     0.1217 ( 43.33)       0.1217 (  2.08)
Dividend Payer              -0.0314 (-28.26)      -0.0314 (-13.00)
----------------------------------------------------------------------------------------------------
SIGNIFICANCE ANALYSIS (p < 0.05)

Log(Assets)                   : No cluster: ***   | Firm cluster: ***   | SAME
Market-to-Book                : No cluster: ***   | Firm cluster: ***   | S

### **Interpretation:**

## 2.6 Robustness Test

In [87]:



print()

# Robustness Test 1: Alternative sample periods
print("Test 1: SUBSAMPLE ANALYSIS BY TIME PERIOD")
print("-" * 60)

# Split data into pre-2008 and post-2008 (financial crisis)
data_pre2008 = data[data['fyear'] < 2008].copy()
data_post2008 = data[data['fyear'] >= 2008].copy()

for period_name, period_data in [('Pre-2008', data_pre2008), ('Post-2008', data_post2008)]:
    reg_data = period_data[['gvkey', 'fyear', 'book_leverage'] + X_vars_baseline].dropna()
    reg_data = reg_data.set_index(['gvkey', 'fyear'])
    
    y = reg_data['book_leverage']
    X = reg_data[X_vars_baseline]
    
    
    model = PanelOLS(y, X, entity_effects=True, time_effects=True).fit(cov_type='clustered', cluster_entity=True)
    
    print(f"\n{period_name} ({len(reg_data):,} obs, {reg_data.index.get_level_values(0).nunique():,} firms):")
    for var in X_vars_baseline:
        display_name = var_names_display.get(var, var)
        coef = model.params[var]
        tstat = model.tstats[var]
        sig = "***" if abs(tstat) > 2.576 else "**" if abs(tstat) > 1.96 else "*" if abs(tstat) > 1.645 else ""
        print(f"  {display_name:<25}: {coef:>10.4f} ({tstat:>6.2f}){sig}")
    

# Robustness Test 2: Winsorization vs Trimming

print("Test 2: WINSORIZATION vs TRIMMING COMPARISON")


# Re-prepare data with winsorization instead of trimming
data_winsorize = pd.read_csv("Data preparation.csv")
data_winsorize = data_winsorize[data_winsorize['curcd'] == 'USD'].copy()
data_winsorize = data_winsorize[~((data_winsorize['sic'] >= 6000) & (data_winsorize['sic'] <= 6999))].copy()
data_winsorize = data_winsorize[~((data_winsorize['sic'] >= 4900) & (data_winsorize['sic'] <= 4999))].copy()
data_winsorize = data_winsorize.dropna(subset=['at', 'dlc', 'dltt', 'fyear', 'gvkey', 'sic'])
data_winsorize = data_winsorize[data_winsorize['at'] > 0].copy()

# Recalculate variables
data_winsorize['total_debt'] = data_winsorize['dlc'].fillna(0) + data_winsorize['dltt'].fillna(0)
data_winsorize['book_leverage'] = data_winsorize['total_debt'] / data_winsorize['at']
data_winsorize['market_equity'] = data_winsorize['prcc_f'] * data_winsorize['csho']
data_winsorize['log_assets'] = np.log(data_winsorize['at'])
data_winsorize['profitability'] = data_winsorize['oibdp'] / data_winsorize['at']
data_winsorize['tangibility'] = data_winsorize['ppent'] / data_winsorize['at']
data_winsorize['market_to_book'] = (data_winsorize['market_equity'] + data_winsorize['total_debt'] + 
                          data_winsorize['pstkl'].fillna(0) - data_winsorize['txditc'].fillna(0)) / data_winsorize['at']
data_winsorize['dividend_payer'] = (data_winsorize['dvc'].fillna(0) > 0).astype(int)
data_winsorize['sic4'] = data_winsorize['sic'].astype(int)
data_winsorize['ind_med_book_lev'] = data_winsorize.groupby(['fyear', 'sic4'])['book_leverage'].transform('median')

# Winsorize instead of trim
def winsorize_variable(df, var_name, lower=0.01, upper=0.99):
    q_low = df[var_name].quantile(lower)
    q_high = df[var_name].quantile(upper)
    df[var_name] = df[var_name].clip(lower=q_low, upper=q_high)
    return df

for var in ['book_leverage', 'profitability', 'tangibility', 'market_to_book', 'log_assets']:
    if var in data_winsorize.columns:
        data_winsorize = winsorize_variable(data_winsorize, var)

# Clip leverage to [0, 1]
data_winsorize['book_leverage'] = data_winsorize['book_leverage'].clip(0, 1)

# Create lagged variables
data_winsorize = data_winsorize.sort_values(['gvkey', 'fyear'])
for var in ['log_assets', 'market_to_book', 'profitability', 'tangibility', 
            'dividend_payer', 'ind_med_book_lev']:
    data_winsorize[f'{var}_lag'] = data_winsorize.groupby('gvkey')[var].shift(1)

data_winsorize = data_winsorize.dropna(subset=[f'{var}_lag' for var in ['log_assets', 'market_to_book', 'profitability', 'tangibility', 
            'dividend_payer', 'ind_med_book_lev']])

# Run regression
reg_data_wins = data_winsorize[['gvkey', 'fyear', 'book_leverage'] + X_vars_baseline].dropna()
reg_data_wins = reg_data_wins.set_index(['gvkey', 'fyear'])


model_wins = PanelOLS(reg_data_wins['book_leverage'], reg_data_wins[X_vars_baseline], 
                        entity_effects=True, time_effects=True).fit(cov_type='clustered', cluster_entity=True)

print("\nWinsorized Sample Results:")
for var in X_vars_baseline:
    display_name = var_names_display.get(var, var)
    coef = model_wins.params[var]
    tstat = model_wins.tstats[var]
    sig = "***" if abs(tstat) > 2.576 else "**" if abs(tstat) > 1.96 else "*" if abs(tstat) > 1.645 else ""
    print(f"  {display_name:<25}: {coef:>10.4f} ({tstat:>6.2f}){sig}")


# Robustness Test 3: Alternative Leverage Measure
print("\n" + "-" * 60)
print("Test 3: ALTERNATIVE LEVERAGE DEFINITION (Long-term debt only)")
print("-" * 60)

# Use only long-term debt in leverage calculation
data['lt_book_leverage'] = data['dltt'].fillna(0) / data['at']
data['lt_book_leverage'] = data['lt_book_leverage'].clip(0, 1)

reg_data_lt = data[['gvkey', 'fyear', 'lt_book_leverage'] + X_vars_baseline].dropna()
reg_data_lt = reg_data_lt.set_index(['gvkey', 'fyear'])


model_lt = PanelOLS(reg_data_lt['lt_book_leverage'], reg_data_lt[X_vars_baseline], 
                    entity_effects=True, time_effects=True).fit(cov_type='clustered', cluster_entity=True)

print("\nLong-term Leverage Only:")
for var in X_vars_baseline:
    display_name = var_names_display.get(var, var)
    coef = model_lt.params[var]
    tstat = model_lt.tstats[var]
    sig = "***" if abs(tstat) > 2.576 else "**" if abs(tstat) > 1.96 else "*" if abs(tstat) > 1.645 else ""
    print(f"  {display_name:<25}: {coef:>10.4f} ({tstat:>6.2f}){sig}")

print("\n" + "=" * 80)




Test 1: SUBSAMPLE ANALYSIS BY TIME PERIOD
------------------------------------------------------------

Pre-2008 (157,177 obs, 15,871 firms):
  Log(Assets)              :     0.0293 ( 18.83)***
  Market-to-Book           :    -0.0049 ( -9.69)***
  Profitability            :    -0.0916 (-17.90)***
  Tangibility              :     0.1682 ( 17.37)***
  Industry Median Leverage :     0.1915 (  3.35)***
  Dividend Payer           :    -0.0319 (-12.48)***

Post-2008 (62,502 obs, 8,579 firms):
  Log(Assets)              :     0.0302 ( 11.52)***
  Market-to-Book           :    -0.0011 ( -1.51)
  Profitability            :    -0.0589 (-10.40)***
  Tangibility              :     0.2144 ( 12.39)***
  Industry Median Leverage :     0.0444 (  1.22)
  Dividend Payer           :    -0.0088 ( -2.34)**
Test 2: WINSORIZATION vs TRIMMING COMPARISON

Winsorized Sample Results:
  Log(Assets)              :     0.0106 (  6.68)***
  Market-to-Book           :    -0.0004 ( -1.68)*
  Profitability            

### **Interpretation:**

## 2.7 Variance Decomposition (Bonus) 
### $$\text{Leverage}_{ijt} = \alpha + \beta X_{ij,t-1} + \eta_i + \nu_{jt} + \varepsilon_{ijt}$$



In [88]:


print("VARIANCE DECOMPOSITION ANALYSIS")
print()

# Prepare data - keep sic4 before setting index
reg_data = data[['gvkey', 'fyear', 'sic4', 'book_leverage'] + X_vars_baseline].dropna().copy()

#  Variance components
total_var = reg_data['book_leverage'].var()
print(f"Total Variance in Book Leverage: {total_var:.6f}")
print()

# Variance explained by firm fixed effects
firm_means = reg_data.groupby('gvkey')['book_leverage'].transform('mean')
var_firm = firm_means.var()
pct_firm = (var_firm / total_var) * 100

 # Variance explained by year fixed effects  
year_means = reg_data.groupby('fyear')['book_leverage'].transform('mean')
var_year = year_means.var()
pct_year = (var_year / total_var) * 100

# Variance explained by industry fixed effects
industry_means = reg_data.groupby('sic4')['book_leverage'].transform('mean')
var_industry = industry_means.var()
pct_industry = (var_industry / total_var) * 100

# Variance explained by industry×year interaction
ind_year_means = reg_data.groupby(['sic4', 'fyear'])['book_leverage'].transform('mean')
var_ind_year = ind_year_means.var()
pct_ind_year = (var_ind_year / total_var) * 100

# Variance explained by X variables
X = sm.add_constant(reg_data[X_vars_baseline])
y = reg_data['book_leverage']
model_ols = sm.OLS(y, X).fit()
pct_x_vars = model_ols.rsquared * 100

# Full model with firm and year FE
sic4_values = reg_data['sic4'].values
fyear_values = reg_data['fyear'].values
reg_data_panel = reg_data.set_index(['gvkey', 'fyear'])
y_panel = reg_data_panel['book_leverage']
X_panel = reg_data_panel[X_vars_baseline]


model_full = PanelOLS(y_panel, X_panel, entity_effects=True, time_effects=True).fit()
pct_full = model_full.rsquared * 100


# Residual variance (only calculate if pct_full is valid)
pct_residual = 100 - pct_full if not np.isnan(pct_full) else np.nan


print("VARIANCE DECOMPOSITION RESULTS")
print(f"\n{'Component':<40} {'% of Total Variance':>20}")
print("-" * 60)
print(f"{'Firm Fixed Effects (η_i)':<40} {pct_firm:>20.2f}%")
print(f"{'Year Fixed Effects':<40} {pct_year:>20.2f}%")
print(f"{'Industry Fixed Effects':<40} {pct_industry:>20.2f}%")
print(f"{'Industry×Year Interaction (ν_jt)':<40} {pct_ind_year:>20.2f}%")
print(f"{'Observable X Variables Only':<40} {pct_x_vars:>20.2f}%")
if not np.isnan(pct_full):
    print(f"{'Full Model (X + Firm FE + Year FE)':<40} {pct_full:>20.2f}%")
    print(f"{'Residual/Idiosyncratic (ε_ijt)':<40} {pct_residual:>20.2f}%")
else:
    print(f"{'Full Model (X + Firm FE + Year FE)':<40} {'N/A':>20}")
    print(f"{'Residual/Idiosyncratic (ε_ijt)':<40} {'N/A':>20}")
print("-" * 60)
print()

# Incremental R-squared analysis

print("INCREMENTAL R-SQUARED ANALYSIS")


# Model : X variables only
model1 = sm.OLS(y, X).fit()
r2_1 = model1.rsquared

# Model : X + Year FE
year_dummies = pd.get_dummies(reg_data['fyear'].astype(int), prefix='yr', drop_first=True, dtype=int)
X_year = pd.concat([X.reset_index(drop=True), year_dummies.reset_index(drop=True)], axis=1)
model2 = sm.OLS(y.reset_index(drop=True), X_year).fit()
r2_2 = model2.rsquared

# Model : X + Firm FE
try:
    model3 = PanelOLS(y_panel, X_panel, entity_effects=True, time_effects=False).fit()
    r2_3 = model3.rsquared
except Exception as e:
    print(f"Warning: Firm FE model error: {e}")
    r2_3 = np.nan

# Model 4 X + Firm + Year FE

model4 = PanelOLS(y_panel, X_panel, entity_effects=True, time_effects=True).fit()
r2_4 = model4.rsquared


print(f"\n{'Model':<50} {'R-squared':>15} {'Incremental':>15}")
print("-" * 80)
print(f"{'(1) X Variables Only':<50} {r2_1:>15.4f} {'-':>15}")
print(f"{'(2) X + Year FE':<50} {r2_2:>15.4f} {(r2_2 - r2_1):>15.4f}")
print(f"{'(3) X + Firm FE':<50} {r2_3:>15.4f} {(r2_3 - r2_1):>15.4f}")
incr = (r2_4 - r2_3) if not np.isnan(r2_3) else np.nan
incr_str = f"{incr:>15.4f}" if not np.isnan(incr) else f"{'N/A':>15}"
print(f"{'(4) X + Firm FE + Year FE':<50} {r2_4:>15.4f} {incr_str}")
print("-" * 80)




VARIANCE DECOMPOSITION ANALYSIS

Total Variance in Book Leverage: 0.041927

VARIANCE DECOMPOSITION RESULTS

Component                                 % of Total Variance
------------------------------------------------------------
Firm Fixed Effects (η_i)                                59.75%
Year Fixed Effects                                       1.68%
Industry Fixed Effects                                  14.29%
Industry×Year Interaction (ν_jt)                        24.62%
Observable X Variables Only                             18.07%
Full Model (X + Firm FE + Year FE)                       5.54%
Residual/Idiosyncratic (ε_ijt)                          94.46%
------------------------------------------------------------

INCREMENTAL R-SQUARED ANALYSIS

Model                                                    R-squared     Incremental
--------------------------------------------------------------------------------
(1) X Variables Only                                        0.1807    

### **Interpretation:**