In [24]:
import pandas_datareader.data as web #to collect data
import datetime as dt #to specify start and end dates

# import yfinance as yf

import eventstudy as es
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scipy.stats as stats
from scipy.spatial.distance import cdist


from sklearn.neighbors import NearestNeighbors

import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.rolling import RollingOLS

from patsy import dmatrices
from tqdm.notebook import tqdm
tqdm.pandas()

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [25]:
dirFirm = pd.read_pickle("Director + Firm Level CAR_on_appointment_2 v090425.pkl")

In [26]:
# dirFirm data wrangling if any:
dirFirm["Appointment Year"] = [x.year for x in dirFirm["Appointment Date"]]

dirFirm = dirFirm.drop_duplicates(subset = ["Person Code", "Company", "Appointment Date"]).reset_index(drop = True)
dirFirm["ln_dirage"] = np.log(dirFirm["Age"] + 1).astype("float")
dirFirm["ln_directorships"] = np.log(dirFirm["CompCountOtherPastTotalAB"] + 1).astype("float")

# PSM

## Verifying and removing those rows with no control data points

In [27]:
# ln_tobinq_longborrowincl2 = Dependent Variable

 

# Control variables


# percentwomendir
# percentbusydir
# lnboardsize
# percentindep 
# promoters_percent
# nonpromoterinstitutions_percent
# ln_marcap
# debttoequityratio
# ln_firmage
# pb
# hasdualitychairmanmd
# ln_rdtoassets
# pb


# Sample constraints ---> govtdummy==0 & findummy==0 & asonyear>2012
dirFirm.columns.to_list()

['Symbol',
 'Company',
 'AsOnDate',
 'ISIN',
 'Person Code',
 'Director Salutation',
 'Director First Name',
 'Director Middle Name',
 'Director Surname',
 'Date of Birth',
 'Gender',
 'Nationality',
 'Member of Civil Services',
 'Promoter Director (Yes/No)',
 'Position on Board',
 'Independent (Yes/No)',
 'Education1',
 'Education2',
 'Education3',
 'Education4',
 'Education5',
 'Education6',
 'Education7',
 'Education8',
 'Education9',
 'Education10',
 'Skills/Competencies',
 'Occupation',
 'Cessation Reason',
 'Other Directorship 1',
 'Other Directorship 2',
 'Other Directorship 3',
 'Other Directorship 4',
 'Other Directorship 5',
 'Other Directorship 6',
 'Other Directorship 7',
 'Other Directorship 8',
 'Other Directorship 9',
 'Other Directorship 10',
 'Other Directorship 11',
 'Other Directorship 12',
 'Other Directorship 13',
 'Other Directorship 14',
 'Other Directorship 15',
 'Brief Profile',
 'Tenure Valid till',
 'Indep',
 'Appointment Date',
 'Cessation Date',
 'PrevLastS

In [28]:
controlVars = ["Promoters_percent", "NonpromoterInstitutions_percent",
               "ln_marcap",
               "HasFinanceXP", "HasTechXP", "HasRelatedIndustryXP",  "HasExecXP",
               "ln_dirage", "ln_directorships"]

dependentVar = "TobinQ_longborrowincl2"

dirFirm["HasDualityChairmanMD"] = dirFirm["HasDualityChairmanMD"].astype(int)


dirFirm["NIC_2digit"] = dirFirm["NIC code"].dropna().apply(lambda x: x[0:2])
dirFirm["NIC_2digit"] = dirFirm["NIC_2digit"]

psmSample = dirFirm.loc[ (dirFirm["Appointment Date"] >= "2013-03-31")\
& (dirFirm["govtdummy"] == 0) & (dirFirm["findummy"] == 0) ].copy()
#.dropna(subset = controlVars).dropna(subset = dependentVar).copy()

psmSample["DummySum"] = psmSample["IsRookie"] + psmSample["IsNonRookie"]
psmSample["DummySumIndep"] = psmSample["IsRookieIndep"] + psmSample["IsNonRookieIndep"]

psmSampleAll = psmSample.loc[ psmSample["DummySum"] == 1 ].reset_index(drop = True)

psmSampleIndep = psmSample.loc[ psmSample["DummySumIndep"] == 1 ].reset_index(drop = True)

In [38]:
# dirFirm2 = dirFirm.copy()
# psmSampleIndep2 = psmSampleIndep.copy()

# listCol = [
#     "FirstYearPCodeList", "TwoYearPCodeList", "ThreeYearPCodeList", "PCodeList",
#     "FirstYearIndepPCodeList", "TwoYearIndepPCodeList", "ThreeYearIndepPCodeList", "IndepPCodeList",
#     "OtherFirstYearIndepPCode", "OtherTwoYearIndepPCode", "OtherThreeYearIndepPCode", "TotalIndepPCode",
#     "OtherFirstYearPCode", "OtherTwoYearPCode", "OtherThreeYearPCode", "TotalPCode",
#     "OtherFirstYearPCodeIndepExcl","OtherTwoYearPCodeIndepExcl", "OtherThreeYearPCodeIndepExcl", "TotalPCodeIndepExcl",
#     "OtherFirstYearPCodeExcl", "OtherTwoYearPCodeExcl", "OtherThreeYearPCodeExcl", "TotalPCodeExcl"
# ]

# dirFirm2 = dirFirm2.drop(listCol, axis = 1)
# psmSampleIndep2 = psmSampleIndep2.drop(listCol, axis = 1)


# dirFirm2.to_csv("Main_Firm_PSM Ready_no filter v040425.csv")
# psmSampleIndep2.to_csv("Main_Firm_PSM Ready_filter-Indep_gov_fin v040425.csv")


# # # psmSampleAll --> 2101 rows 
# # psmSampleIndep --> 1561 rows 

## PSM --> RookieAppoints as Treatment, NonRookieAppoints as Control

In [87]:
def LogitReg(sample, endog_var, exog_var):
    
    # Logit Regression
    endog = sample[[endog_var]]
    exog = sample[exog_var]
    exog = sm.add_constant(exog)
    
    log_reg = sm.Logit(endog, exog).fit()

    propensityScores = log_reg.predict(exog)
    
    return propensityScores

In [88]:
def MeanDiffTtest(sample, endog_var, exog_var, car, depVar, dirFirm):

    dirFirm = dirFirm.rename( {depVar:f"{depVar}_2"}, axis = 1)

    colsAdd = []
    for i in range(-1, 4):
        if i != 0:
            colsAdd.append(f"AsOnYear_T+{i}")
            colsAdd.append(f"{depVar}T+{i}")
            if i>0 :
                colsAdd.append(depVar+f"(T+{i}) - (T-1)")

    newFrame= pd.DataFrame(columns = colsAdd, data = 0, index = sample.index, dtype = "int")
    sample = pd.concat([sample, newFrame], axis = 1)
    sample = sample.copy()
    
    for i in range(-1, 4):
        if i != 0:
            sample.loc[:, f"AsOnYear_T+{i}"] = sample["AsOnYear"] + i

    for i in range(-1, 4):
        if i != 0:
            sample.loc[:, f"{depVar}T+{i}"] = sample.merge(dirFirm[["Symbol", "AsOnYear", f"{depVar}_2"]].copy(), left_on = ["Symbol", f"AsOnYear_T+{i}"],
                                                          right_on = ["Symbol", "AsOnYear"], how = "left")[f"{depVar}_2"]
    
    for i in range(1, 4):
        if i != 0:
            sample.loc[:, depVar+f"(T+{i}) - (T-1)"] = sample[f"{depVar}T+{i}"] - sample[f"{depVar}T+-1"]

        
    sample = sample.copy()
    
    group1 = sample.loc[ sample[endog_var] == 1].copy()
    group2 = sample.loc[ sample[endog_var] == 0].copy()
    
    t_stat, p_value = stats.ttest_ind(group1[car], group2[car], equal_var=False)  # Welch’s t-test (default)

    print("\n")
    print(car, ":")
    print("\n")
    print("T Statistic:", t_stat, " P Value:",p_value)
    print("Treated Mean:", group1[car].mean(), " Control Mean:", group2[car].mean(), " Diff:", group1[car].mean() - group2[car].mean())
    print("Treated N:", len(group1[car]), "; Control N:", len(group2[car]))
    print("[treated unique = ", len(group1.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]",\
          "[control unique = ", len(group2.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]"
         )
    print("\n")

    print("━"*120)
    print(f'{"Matching Variable":<40} {"Treatment Firms":<20} {"Control Firms":<20} {"Test of Diff (p value)":<20}')
    print(f'{" ":<40} {"N = " + str(len(group1[car])):<20} {"N = " + str(len(group2[car])):<20}')
    print("-"*120)

    for var in exog_var:
        treatMean = group1[var].mean()
        controlMean = group2[var].mean()
        p_value = stats.ttest_ind(group1[var], group2[var], equal_var=False)[1]
        print(f'{var:<40} {treatMean:<20.4f} {controlMean:<20.4f} {p_value:<20.4f}')

    print("━"*120, "\n")
    print(depVar, " across years:\n")
    for i in range(1,4):
        sample = sample.dropna(subset = [depVar+f'(T+{i}) - (T-1)'])

    group1 = sample.loc[ sample[endog_var] == 1].copy()
    group2 = sample.loc[ sample[endog_var] == 0].copy()

    print("━"*150, "\n")
    # print(f'{depVar:<40}{" ":<20}{"Treatment Firms":<20}{"Control Firms":<20}{"Difference":<20}{"Test of Diff":<20}{"Test of Diff"}')
    # print(f'{" ":<120}{"(t stat)":<20}{"(p value)":<20}')

    # print("─"*150, "\n")

    # for i in range(1,4):
    #     t_stat2, p_value2 = stats.ttest_ind(group1[depVar+f'(T+{i}) - (T-1)'], group2[depVar+f'(T+{i}) - (T-1)'], equal_var=False)  # Welch’s t-test (default)
        
    #     treatedMean = group1[depVar+f'(T+{i}) - (T-1)'].mean()
    #     controlMean = group2[depVar+f'(T+{i}) - (T-1)'].mean()
    #     diffMean = treatedMean - controlMean

    #     treatedMedian = group1[depVar+f'(T+{i}) - (T-1)'].median()
    #     controlMedian = group2[depVar+f'(T+{i}) - (T-1)'].median()
    #     diffMedian = treatedMedian - controlMedian

    #     print(f'{"Year_T+" + str(i) +" - Year_T-1":<40}{"<MEAN>":<20}{treatedMean:<20.4f}{controlMean:<20.4f}{diffMean:<20.4f}{t_stat2:<20.4f}{p_value2:<20.10f}')

    #     label1 = "Treated N: " + str(len(group1[depVar+f'(T+{i}) - (T-1)']))
    #     label2 = "Control N: " + str(len(group1[depVar+f'(T+{i}) - (T-1)']))
        
    #     print(f'{label1 + " "*5 + label2:<40}{"<MEDIAN>":<20}{treatedMedian:<20.4f}{controlMedian:<20.4f}{diffMedian:<20.4f}')
        
    #     print("-"*150, "\n")
        
    # print("━"*150, "\n")

    return

In [89]:
def PsmReplac(sample, endog_var, exog_var, car, depVar, dirFirm):

    # Logit Regression
    sample.loc[:, "propensityScore"] = LogitReg(sample, endog_var, exog_var)

    treated = sample.loc[ sample[endog_var] == 1].copy()
    control = sample.loc[ sample[endog_var] == 0].copy()

    # Nearest Neighbours
    nn = NearestNeighbors(n_neighbors = 1, metric = "euclidean")
    nn.fit(control[["propensityScore"]])

    distances, indices = nn.kneighbors(treated[["propensityScore"]])
    
    matchedControl = control.iloc[indices.flatten()].copy()
    
    matched = pd.concat([treated, matchedControl])
    matched.reset_index(drop=True, inplace=True)

    MeanDiffTtest(matched, endog_var, exog_var, car, depVar, dirFirm)

    return

In [90]:
# Func PSM non replacement
def PsmNonReplac(sample, endog_var, exog_var, car, depVar, dirFirm):

    # Logit Regression
    sample.loc[:, "propensityScore"] = LogitReg(sample, endog_var, exog_var)

    # Separate treated and control groups
    treated = sample[sample[endog_var] == 1].copy()
    control = sample[sample[endog_var] == 0].copy()
    
    # Compute pairwise distances (absolute difference in propensity scores)
    dist_matrix = cdist(treated[['propensityScore']], control[['propensityScore']], metric='euclidean')
    
    # Match without replacement
    treated_indices = []
    matched_indices = []
    used_control_indices = set()
    
    for i in range(len(treated)):
        if len(used_control_indices) >= len(control):  # Stop if no controls left
            print("Warning: Not enough control units to match all treated units.")
            break
        
        # Get nearest control unit index that hasn't been used
        match_idx = np.argmin(dist_matrix[i])
        
        while match_idx in used_control_indices:  # Ensure it's not already matched
            dist_matrix[i, match_idx] = np.inf  # Temporarily set distance to infinity

            if np.all(dist_matrix[i] == np.inf):  # If all controls are exhausted
                print(f"No available control for treated unit {i}, skipping.")
                match_idx = None
                break
            
            match_idx = np.argmin(dist_matrix[i])
        
        used_control_indices.add(match_idx)
        matched_indices.append(match_idx)
        treated_indices.append(i)
    
    # Retrieve matched units
    matched_control = control.iloc[matched_indices].copy()
    matched_treated = treated.iloc[treated_indices].copy()
    
    # Combine matched treated and control units
    matched_data = pd.concat([matched_treated.reset_index(drop=True), matched_control.reset_index(drop=True)])
    
    # Reset index
    matched_data.reset_index(drop=True, inplace=True)


    # Mean difference and T Test
    MeanDiffTtest(matched_data, endog_var, exog_var, car, depVar, dirFirm)

    return
    


# Univariate T Tests

In [33]:
car = "120CAR3"
group1 = psmSampleIndep.loc[ psmSampleIndep["IsRookieIndep"] == 1].dropna(subset = car)
group2 = psmSampleIndep.loc[ psmSampleIndep["IsNonRookieIndep"] == 1].dropna(subset = car)


t_stat, p_value = stats.ttest_ind(group1[car], group2[car], equal_var=False)  # Welch’s t-test (default)

print("\n")
print(car, ":")
print("\n")
print("T Statistic:", t_stat, " P Value:",p_value)
print("Treated Mean:", group1[car].mean(), " Control Mean:", group2[car].mean(), " Diff:", group1[car].mean() - group2[car].mean())
print("Treated N:", len(group1[car]), "; Control N:", len(group2[car]))
print("[treated unique = ", len(group1.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]",\
      "[control unique = ", len(group2.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]")



120CAR3 :


T Statistic: 0.4200211802324427  P Value: 0.6744924556002441
Treated Mean: -0.0029838146886241737  Control Mean: -0.004445425869539107  Diff: 0.001461611180914933
Treated N: 5361 ; Control N: 2720
[treated unique =  5361 ] [control unique =  2720 ]


In [36]:
car = "120CAR3"
group1 = psmSampleIndep.loc[ psmSampleIndep["IsRookieIndep"] == 1].dropna(subset = car)


t_stat, p_value = stats.ttest_1samp(group1[car], 0)  # Welch’s t-test (default)

print("\n")
print(car, ":")
print("\n")
print("T Statistic:", t_stat, " P Value:",p_value)
print("Treated Mean:", group1[car].median(), " Control Mean:", group2[car].mean(), " Diff:", group1[car].mean() - group2[car].mean())
print("Treated N:", len(group1[car]), "; Control N:", len(group2[car]))
print("[treated unique = ", len(group1.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]",\
      "[control unique = ", len(group2.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]")



120CAR3 :


T Statistic: -1.977270987861567  P Value: 0.048062163964304655
Treated Mean: -0.0043988985285501  Control Mean: -0.004445425869539107  Diff: 0.001461611180914933
Treated N: 5361 ; Control N: 2720
[treated unique =  5361 ] [control unique =  2720 ]


In [37]:
car = "120CAR3"
group1 = psmSampleIndep.loc[ psmSampleIndep["IsNonRookieIndep"] == 1].dropna(subset = car)


t_stat, p_value = stats.ttest_1samp(group1[car], 0)  # Welch’s t-test (default)

print("\n")
print(car, ":")
print("\n")
print("T Statistic:", t_stat, " P Value:",p_value)
print("Treated Mean:", group1[car].median(), " Control Mean:", group2[car].mean(), " Diff:", group1[car].mean() - group2[car].mean())
print("Treated N:", len(group1[car]), "; Control N:", len(group2[car]))
print("[treated unique = ", len(group1.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]",\
      "[control unique = ", len(group2.loc[ :, ["Person Code", "Symbol", "AsOnDate"]].drop_duplicates()), "]")



120CAR3 :


T Statistic: -1.4177184475626528  P Value: 0.1563875674088713
Treated Mean: -0.00449132284210875  Control Mean: -0.004445425869539107  Diff: 0.0
Treated N: 2720 ; Control N: 2720
[treated unique =  2720 ] [control unique =  2720 ]
