In [1]:
import pandas_datareader.data as web #to collect data
import datetime as dt #to specify start and end dates

# import yfinance as yf

import eventstudy as es
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scipy.stats as stats
from scipy.spatial.distance import cdist


from sklearn.neighbors import NearestNeighbors

import pandas as pd

from linearmodels.model import PanelOLS
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.rolling import RollingOLS

from patsy import dmatrices
from tqdm.notebook import tqdm
# Register tqdm's pandas integration on this kernel.
tqdm.pandas()

# Widen pandas' display limits (rows, columns, line width) for the rest of the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Firm-level panel used by every cell below, loaded from a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
firmLev = pd.read_pickle('Main_Firm_Fin_Loc_COMPLETE v170325.pkl')

In [3]:
# FirmLev data wrangling if any:

# PSM

## Verifying and removing rows with missing control-variable values

In [4]:
# ln_tobinq_longborrowincl2 = Dependent Variable

 

# Control variables


# percentwomendir
# percentbusydir
# lnboardsize
# percentindep 
# promoters_percent
# nonpromoterinstitutions_percent
# ln_marcap
# debttoequityratio
# ln_firmage
# pb
# hasdualitychairmanmd
# ln_rdtoassets


# Sample constraints ---> govtdummy==0 & findummy==0 & asonyear>2012

In [6]:
# Covariates used in the propensity-score logit; the order here is the order
# in which they enter the regression and the printed balance table.
controlVars = [
    "PercentWomenDir",
    "PercentBusyDir",
    "LnBoardSize",
    "PercentIndep",
    "Promoters_percent",
    "NonpromoterInstitutions_percent",
    "ln_marcap",
    "ln_rdtoassets",
    "Debt to equity ratio",
    "HasDualityChairmanMD",
]

# Outcome variable compared between the matched groups.
dependentVar = "ln_TobinQ_longborrowincl2"

# Both statements below modify firmLev itself (re-running the cell is harmless).
firmLev["HasDualityChairmanMD"] = firmLev["HasDualityChairmanMD"].astype(int)

# First two characters of the NIC code, stored as float; rows without a NIC
# code are left as NaN by the index alignment on assignment.
firmLev["NIC_2digit"] = (
    firmLev["NIC code"].dropna().apply(lambda nicCode: nicCode[0:2]).astype(float)
)

# Sample constraints: AsOnDate from 2013-03-31 onwards, govtdummy == 0, findummy == 0,
# and no missing values in any control variable or in the dependent variable.
sampleMask = (
    (firmLev["AsOnDate"] >= "2013-03-31")
    & (firmLev["govtdummy"] == 0)
    & (firmLev["findummy"] == 0)
)
psmSample = firmLev.loc[sampleMask].dropna(subset=controlVars + [dependentVar]).copy()

# Keep firm-years where exactly one of the two appointment dummies is set.
psmSample["DummySum"] = psmSample[["RookieAppointDummy", "NonRookieAppointDummy"]].sum(axis=1, skipna=False)
psmSample["DummySumIndep"] = psmSample[["RookieIndepAppointDummy", "NonRookieIndepAppointDummy"]].sum(axis=1, skipna=False)

psmSampleAll = psmSample[psmSample["DummySum"] == 1].reset_index(drop=True)

psmSampleIndep = psmSample[psmSample["DummySumIndep"] == 1].reset_index(drop=True)

In [7]:
# Columns removed from the CSV exports below.
listCol = [
    # all directors
    "FirstYearPCodeList", "TwoYearPCodeList", "ThreeYearPCodeList", "PCodeList",
    # independent directors
    "FirstYearIndepPCodeList", "TwoYearIndepPCodeList", "ThreeYearIndepPCodeList", "IndepPCodeList",
    "OtherFirstYearIndepPCode", "OtherTwoYearIndepPCode", "OtherThreeYearIndepPCode", "TotalIndepPCode",
    "OtherFirstYearPCode", "OtherTwoYearPCode", "OtherThreeYearPCode", "TotalPCode",
    "OtherFirstYearPCodeIndepExcl", "OtherTwoYearPCodeIndepExcl", "OtherThreeYearPCodeIndepExcl", "TotalPCodeIndepExcl",
    "OtherFirstYearPCodeExcl", "OtherTwoYearPCodeExcl", "OtherThreeYearPCodeExcl", "TotalPCodeExcl",
]

# drop() returns new frames, so firmLev and psmSampleIndep themselves stay untouched.
firmLev2 = firmLev.drop(columns=listCol)
psmSampleIndep2 = psmSampleIndep.drop(columns=listCol)

# Unfiltered panel, and the filtered independent-director PSM sample.
firmLev2.to_csv("Main_Firm_PSM Ready_no filter v040425.csv")
psmSampleIndep2.to_csv("Main_Firm_PSM Ready_filter-Indep_gov_fin v040425.csv")


# Row counts recorded when this cell was last run:
# psmSampleAll --> 2101 rows
# psmSampleIndep --> 1561 rows

## PSM --> RookieAppoints as Treatment, NonRookieAppoints as Control

In [8]:
def LogitReg(sample, endog_var, exog_var):
    """Fit a logit of ``endog_var`` on ``exog_var`` and return the fitted probabilities.

    Parameters
    ----------
    sample : pandas.DataFrame
        Data containing ``endog_var`` and every column named in ``exog_var``.
    endog_var : str
        Name of the binary outcome column.
    exog_var : list of str
        Names of the regressor columns; a constant is added to them.

    Returns
    -------
    The model's in-sample predictions for the rows of ``sample``
    (used by the callers as propensity scores).

    Notes
    -----
    ``fit()`` prints its optimisation summary to stdout.
    """
    design = sm.add_constant(sample[exog_var])
    outcome = sample[[endog_var]]

    fitted = sm.Logit(outcome, design).fit()

    return fitted.predict(design)

In [9]:
def MeanDiffTtest(sample, endog_var, exog_var, depVar, firmLev):
    """Print treated-vs-control comparisons for a matched sample.

    Three reports are written to stdout; nothing is returned:

    1. A Welch t-test on the level of ``depVar`` between the rows with
       ``endog_var == 1`` (treated) and ``endog_var == 0`` (control).
    2. A balance table: group means of every variable in ``exog_var`` and the
       p-value of a Welch t-test on each.
    3. Mean and median changes in ``depVar`` from year T-1 to years T+1, T+2
       and T+3, restricted to rows for which all three changes are available.

    Parameters
    ----------
    sample : pandas.DataFrame
        Matched observations. Must contain "Symbol", "AsOnYear", "AsOnDate",
        ``endog_var``, ``depVar`` and every column in ``exog_var``. The frame
        passed in is not modified; any index is accepted.
    endog_var : str
        Name of the 0/1 treatment indicator column.
    exog_var : list of str
        Matching variables shown in the balance table.
    depVar : str
        Name of the outcome column (present in both ``sample`` and ``firmLev``).
    firmLev : pandas.DataFrame
        Panel with "Symbol", "AsOnYear" and ``depVar`` columns, used to look up
        the outcome in years T-1 and T+1..T+3. If it holds several rows for the
        same (Symbol, AsOnYear), the first one is used.

    Returns
    -------
    None
    """
    lookupOffsets = (-1, 1, 2, 3)   # years relative to T whose outcome is looked up
    postOffsets = (1, 2, 3)         # post-event years compared against T-1

    def changeCol(offset):
        # Name of the column holding depVar(T+offset) - depVar(T-1).
        return depVar + f"(T+{offset}) - (T-1)"

    # Work on a private copy so the caller's frame never gains helper columns.
    sample = sample.copy()

    # One outcome value per (Symbol, AsOnYear). De-duplicating guarantees that the
    # left merge below returns exactly one row per sample row, in sample order.
    lookup = firmLev[["Symbol", "AsOnYear", depVar]].drop_duplicates(subset=["Symbol", "AsOnYear"])

    for offset in lookupOffsets:
        yearCol = f"AsOnYear_T+{offset}"
        sample[yearCol] = sample["AsOnYear"] + offset

        merged = sample[["Symbol", yearCol]].merge(
            lookup,
            left_on=["Symbol", yearCol],
            right_on=["Symbol", "AsOnYear"],
            how="left",
        )
        # Assign by position: the merge result carries a fresh 0..n-1 index, so
        # label-based alignment would only be correct for a 0..n-1 sample index.
        sample[f"{depVar}T+{offset}"] = merged[depVar].to_numpy()

    for offset in postOffsets:
        sample[changeCol(offset)] = sample[f"{depVar}T+{offset}"] - sample[f"{depVar}T+-1"]

    group1 = sample.loc[sample[endog_var] == 1].copy()   # treated
    group2 = sample.loc[sample[endog_var] == 0].copy()   # control

    # equal_var=False selects Welch's t-test (scipy's default is the pooled-variance test).
    t_stat, p_value = stats.ttest_ind(group1[depVar], group2[depVar], equal_var=False)

    print("\n")
    print(depVar, ":")
    print("\n")
    print("T Statistic:", t_stat, " P Value:", p_value)
    print("Treated Mean:", group1[depVar].mean(), " Control Mean:", group2[depVar].mean(), " Diff:", group1[depVar].mean() - group2[depVar].mean())
    print("Treated N:", len(group1[depVar]), "; Control N:", len(group2[depVar]))
    # Distinct firm-dates per group; with-replacement matching can reuse a control row.
    print("[treated unique = ", len(group1.loc[:, ["Symbol", "AsOnDate"]].drop_duplicates()), "]",
          "[control unique = ", len(group2.loc[:, ["Symbol", "AsOnDate"]].drop_duplicates()), "]"
          )
    print("\n")

    # ---- Balance table on the matching variables ----
    print("━"*120)
    print(f'{"Matching Variable":<40} {"Treatment Firms":<20} {"Control Firms":<20} {"Test of Diff (p value)":<20}')
    print(f'{" ":<40} {"N = " + str(len(group1[depVar])):<20} {"N = " + str(len(group2[depVar])):<20}')
    print("-"*120)

    for var in exog_var:
        treatMean = group1[var].mean()
        controlMean = group2[var].mean()
        p_value = stats.ttest_ind(group1[var], group2[var], equal_var=False)[1]
        print(f'{var:<40} {treatMean:<20.4f} {controlMean:<20.4f} {p_value:<20.4f}')

    print("━"*120, "\n")
    print(depVar, " across years:\n")

    # ---- Changes relative to T-1: keep rows with all three horizons observed ----
    sample = sample.dropna(subset=[changeCol(offset) for offset in postOffsets])

    group1 = sample.loc[sample[endog_var] == 1].copy()
    group2 = sample.loc[sample[endog_var] == 0].copy()

    print("━"*150, "\n")
    print(f'{depVar:<40}{" ":<20}{"Treatment Firms":<20}{"Control Firms":<20}{"Difference":<20}{"Test of Diff":<20}{"Test of Diff"}')
    print(f'{" ":<120}{"(t stat)":<20}{"(p value)":<20}')

    print("─"*150, "\n")

    for i in postOffsets:
        treatedChange = group1[changeCol(i)]
        controlChange = group2[changeCol(i)]

        t_stat2, p_value2 = stats.ttest_ind(treatedChange, controlChange, equal_var=False)  # Welch's t-test

        treatedMean = treatedChange.mean()
        controlMean = controlChange.mean()
        diffMean = treatedMean - controlMean

        treatedMedian = treatedChange.median()
        controlMedian = controlChange.median()
        diffMedian = treatedMedian - controlMedian

        print(f'{"Year_T+" + str(i) +" - Year_T-1":<40}{"<MEAN>":<20}{treatedMean:<20.4f}{controlMean:<20.4f}{diffMean:<20.4f}{t_stat2:<20.4f}{p_value2:<20.4f}')

        label1 = "Treated N: " + str(len(treatedChange))
        # Control count must come from the control group (it previously repeated the treated count).
        label2 = "Control N: " + str(len(controlChange))

        print(f'{label1 + " "*5 + label2:<40}{"<MEDIAN>":<20}{treatedMedian:<20.4f}{controlMedian:<20.4f}{diffMedian:<20.4f}')

        print("-"*150, "\n")

    print("━"*150, "\n")

    return

In [10]:
def PsmReplac(sample, endog_var, exog_var, depVar, firmLev):
    """Nearest-neighbour propensity-score matching with replacement, then report.

    Each treated row (``endog_var == 1``) is paired with the control row
    (``endog_var == 0``) whose propensity score is closest; a control row may be
    chosen for several treated rows. The stacked treated + matched-control frame
    is handed to ``MeanDiffTtest``, which prints the comparison tables.

    Side effect: a "propensityScore" column is written onto the ``sample`` frame
    passed in by the caller.

    Returns
    -------
    None
    """
    # Propensity scores from the logit of treatment on the matching variables.
    sample["propensityScore"] = LogitReg(sample, endog_var, exog_var)

    isTreated = sample[endog_var] == 1
    isControl = sample[endog_var] == 0
    treated = sample.loc[isTreated].copy()
    control = sample.loc[isControl].copy()

    # 1-nearest-neighbour search over the control scores.
    matcher = NearestNeighbors(n_neighbors=1, metric="euclidean")
    matcher.fit(control[["propensityScore"]])
    _, neighbourPos = matcher.kneighbors(treated[["propensityScore"]])

    # neighbourPos holds positions within `control`, one per treated row.
    matchedControl = control.iloc[neighbourPos.flatten()].copy()

    matched = pd.concat([treated, matchedControl], ignore_index=True)

    MeanDiffTtest(matched, endog_var, exog_var, depVar, firmLev)

    return

In [11]:
# Func PSM non replacement
def PsmNonReplac(sample, endog_var, exog_var, depVar, firmLev):
    """Greedy nearest-neighbour propensity-score matching without replacement, then report.

    Treated rows (``endog_var == 1``) are processed in their order of appearance.
    Each one takes the not-yet-used control row (``endog_var == 0``) with the
    closest propensity score; ties go to the control that appears first. Matching
    stops with a warning once every control has been used. The matched pairs are
    stacked and handed to ``MeanDiffTtest``, which prints the comparison tables.

    Side effect: a "propensityScore" column is written onto the ``sample`` frame
    passed in by the caller.

    Returns
    -------
    None
    """
    # Logit Regression
    sample["propensityScore"] = LogitReg(sample, endog_var, exog_var)

    # Separate treated and control groups
    treated = sample[sample[endog_var] == 1].copy()
    control = sample[sample[endog_var] == 0].copy()

    # Pairwise distances (absolute difference in propensity scores):
    # one row per treated unit, one column per control unit.
    dist_matrix = cdist(treated[['propensityScore']], control[['propensityScore']], metric='euclidean')

    # Match without replacement
    treated_indices = []
    matched_indices = []
    control_free = np.ones(len(control), dtype=bool)  # True while a control is still unmatched

    for i in range(len(treated)):
        if not control_free.any():  # Stop if no controls left
            print("Warning: Not enough control units to match all treated units.")
            break

        # Hide already-used controls behind an infinite distance, then take the nearest.
        candidates = np.where(control_free, dist_matrix[i], np.inf)
        match_idx = int(np.argmin(candidates))

        if candidates[match_idx] == np.inf:
            # Nothing usable for this treated unit: skip it instead of recording a
            # placeholder that would break the positional lookup below.
            print(f"No available control for treated unit {i}, skipping.")
            continue

        control_free[match_idx] = False
        matched_indices.append(match_idx)
        treated_indices.append(i)

    # Retrieve matched units (positions within `control` / `treated`)
    matched_control = control.iloc[matched_indices].copy()
    matched_treated = treated.iloc[treated_indices].copy()

    # Combine matched treated and control units under a fresh 0..n-1 index
    matched_data = pd.concat([matched_treated, matched_control], ignore_index=True)

    # Mean difference and T Test
    MeanDiffTtest(matched_data, endog_var, exog_var, depVar, firmLev)

    return
    


# Version 1: Check control variables

In [14]:
# Year fixed effects: one 0/1 column per AsOnYear value, first level dropped.
# (The prefix already ends in "_" and get_dummies inserts its own "_" separator.)
yearDummies = pd.get_dummies(
    psmSampleIndep["AsOnYear"],
    prefix="YearDummy_",
    drop_first=True,
    dtype="int",
)
psmSampleIndepWithYearDummies = pd.concat(
    [psmSampleIndep, yearDummies],
    axis=1,
)

# Industry fixed effects on the 2-digit NIC code, built the same way and appended
# to the frame that already carries the year dummies.
industryDummies = pd.get_dummies(
    psmSampleIndepWithYearDummies["NIC_2digit"],
    prefix="IndustryDummy_",
    drop_first=True,
    dtype="int",
)
psmSampleIndepWithYearIndustryDummies = pd.concat(
    [psmSampleIndepWithYearDummies, industryDummies],
    axis=1,
)

## No fixed effects

### PSM with replacement

In [15]:
# Matching on the control variables only (no fixed effects).
PsmReplac(
    sample=psmSampleIndep,
    endog_var="RookieIndepAppointDummy",
    exog_var=controlVars,
    depVar="ln_TobinQ_longborrowincl2",
    firmLev=firmLev,
)

Optimization terminated successfully.
         Current function value: 0.594094
         Iterations 5


ln_TobinQ_longborrowincl2 :


T Statistic: 1.6083067123837371  P Value: 0.10794443721610804
Treated Mean: 0.19787302769106344  Control Mean: 0.12234492597369394  Diff: 0.0755281017173695
Treated N: 897 ; Control N: 897
[treated unique =  897 ] [control unique =  351 ]


━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Matching Variable                        Treatment Firms      Control Firms        Test of Diff (p value)
                                         N = 897              N = 897             
------------------------------------------------------------------------------------------------------------------------
PercentWomenDir                          0.1456               0.1471               0.6960              
PercentBusyDir                           0.3018               0.3025               0.9642 

In [16]:
### PSM without replacement
# PsmNonReplac(psmSampleIndep, "RookieIndepAppointDummy", controlVars, "ln_TobinQ_longborrowincl2", firmLev)

## Using Year fixed effects

### PSM with replacement

In [17]:
# Matching on the control variables plus the year dummies.
PsmReplac(
    sample=psmSampleIndepWithYearDummies,
    endog_var="RookieIndepAppointDummy",
    exog_var=controlVars + yearDummies.columns.to_list(),
    depVar="ln_TobinQ_longborrowincl2",
    firmLev=firmLev,
)

Optimization terminated successfully.
         Current function value: 0.583797
         Iterations 5


ln_TobinQ_longborrowincl2 :


T Statistic: 0.9012271532817028  P Value: 0.367588662893807
Treated Mean: 0.19787302769106344  Control Mean: 0.15464934609801267  Diff: 0.04322368159305076
Treated N: 897 ; Control N: 897
[treated unique =  897 ] [control unique =  336 ]


━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Matching Variable                        Treatment Firms      Control Firms        Test of Diff (p value)
                                         N = 897              N = 897             
------------------------------------------------------------------------------------------------------------------------
PercentWomenDir                          0.1456               0.1496               0.3135              
PercentBusyDir                           0.3018               0.3034               0.9123  

## Using Year + Industry fixed effects

### PSM with replacement

In [18]:
# Matching on the control variables plus year and industry dummies.
# NOTE(review): the recorded output for this cell shows "Iterations: 35" without the
# "Optimization terminated successfully" line printed by the other runs -- the logit
# may have stopped at the optimiser's iteration limit; confirm convergence.
PsmReplac(
    sample=psmSampleIndepWithYearIndustryDummies,
    endog_var="RookieIndepAppointDummy",
    exog_var=controlVars + yearDummies.columns.to_list() + industryDummies.columns.to_list(),
    depVar="ln_TobinQ_longborrowincl2",
    firmLev=firmLev,
)

         Current function value: 0.569062
         Iterations: 35


ln_TobinQ_longborrowincl2 :


T Statistic: 1.566050415232797  P Value: 0.11751352778514781
Treated Mean: 0.19787302769106344  Control Mean: 0.12178324510053559  Diff: 0.07608978259052784
Treated N: 897 ; Control N: 897
[treated unique =  897 ] [control unique =  339 ]


━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Matching Variable                        Treatment Firms      Control Firms        Test of Diff (p value)
                                         N = 897              N = 897             
------------------------------------------------------------------------------------------------------------------------
PercentWomenDir                          0.1456               0.1459               0.9494              
PercentBusyDir                           0.3018               0.3110               0.5208              
LnBoardSize           

