In [6]:
# ========= 1) Load & light clean =========
import re
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Input dataset and destination of the tidy coefficient table (absolute local paths).
data_path   = r"C:/Users/ZITIAO/Desktop/merged_for_regression.csv"
export_path = r"C:/Users/ZITIAO/Desktop/industry_regression_summary.csv"

df = pd.read_csv(data_path)

# Collapse runs of whitespace inside each header label and trim its ends,
# then apply the cleaned labels to the frame.
norm = {}
for raw_label in df.columns:
    norm[raw_label] = re.sub(r'\s+', ' ', str(raw_label)).strip()
df = df.rename(columns=norm)

def find_col(cands, columns=None):
    """Return the first column whose normalised name matches a candidate.

    Names are compared after collapsing whitespace runs, trimming and
    lower-casing. Candidates are tried in the order given, so the list acts
    as a preference order: the first candidate that matches any column wins.

    Parameters
    ----------
    cands : iterable of str
        Acceptable column names, most preferred first.
    columns : iterable, optional
        Column labels to search. Defaults to the columns of the notebook's
        global ``df``.

    Returns
    -------
    The original (un-normalised) column label, or ``None`` if no candidate
    matches.
    """
    if columns is None:
        columns = df.columns

    def _canon(label):
        return re.sub(r'\s+', ' ', str(label)).strip().lower()

    # If two columns normalise to the same key, keep the left-most one.
    lookup = {}
    for col in columns:
        lookup.setdefault(_canon(col), col)

    for cand in cands:
        key = _canon(cand)
        if key in lookup:
            return lookup[key]
    return None

# Locate the columns used by the regressions (find_col returns None when absent).
col_emis  = find_col(['total emission (kg)'])
col_emp   = find_col(['employee'])
col_turnM = find_col(['turnover_million'])
col_turn  = find_col(['turnover'])

col_sector    = find_col(['sector','regulated industry sector'])
col_subsector = find_col(['subsector','regulated industry sub sector'])

# Either turnover column is acceptable; stop if neither was found.
if not (col_turnM or col_turn):
    raise KeyError("cannot find turnover_million")

# Emission and employee columns are mandatory.
need = {'Total Emission (kg)':col_emis, 'Employee':col_emp}
for label in need:
    if need[label] is None:
        raise KeyError(f"missing column：{label}")

# Prefer turnover_million when it exists, otherwise fall back to turnover.
col_turn_used = col_turnM or col_turn

# Optional grouping variables are stored as categoricals under fixed names.
for target, source in (('Sector', col_sector), ('SubSector', col_subsector)):
    if source:
        df[target] = df[source].astype('category')

def to_log_pos(s):
    """Natural log of the strictly positive entries of ``s``.

    Values are first coerced to numbers (unparseable entries become NaN);
    anything that is not greater than zero is turned into NaN before the
    logarithm is taken.
    """
    values = pd.to_numeric(s, errors='coerce')
    positive_only = values.where(values > 0)
    return np.log(positive_only)

# Add log-transformed versions of emission, employee and turnover to df.
for log_name, raw_name in [('log_emission', col_emis),
                           ('log_employee', col_emp),
                           ('log_turnover', col_turn_used)]:
    df[log_name] = to_log_pos(df[raw_name])

# Variables that must be non-missing for a row to enter the estimation sample.
needed_vars = ['log_emission','log_turnover','log_employee']
needed_vars += [name for name in ('Sector', 'SubSector') if name in df.columns]

# Treat infinities as missing, then drop rows missing any needed variable.
clean = df.replace([np.inf,-np.inf], np.nan).dropna(subset=needed_vars)

print(len(clean))
print(clean[needed_vars].head(3))


260
   log_emission  log_turnover  log_employee
0      0.139762      7.077498      8.396606
1      7.422015      7.077498      8.396606
2      0.463734      7.077498      8.396606


In [8]:
# Candidate specifications, keyed by a short label.
formulas = {
    'Base'      : 'log_emission ~ log_turnover + log_employee',
    'Sector'    : 'log_emission ~ log_turnover + log_employee + C(Sector)',
    'SubSector' : 'log_emission ~ log_turnover + log_employee + C(SubSector)',
    'Both'      : 'log_emission ~ log_turnover + log_employee + C(Sector) + C(SubSector)'
}

# A specification is usable only if every categorical term it mentions
# has a matching column in `clean`.
factor_columns = {'C(Sector)': 'Sector', 'C(SubSector)': 'SubSector'}
usable = {
    label: spec
    for label, spec in formulas.items()
    if all(column in clean.columns
           for token, column in factor_columns.items()
           if token in spec)
}

print("Estimate model：", list(usable.keys()))

# Fit every usable specification and gather one record per coefficient.
rows = []
for name, fml in usable.items():
    print("\n" + "="*10, f"{name} Regression", "="*10)
    model = smf.ols(fml, data=clean).fit()
    print(model.summary())

    coefs, ses, tvals, pvals = model.params, model.bse, model.tvalues, model.pvalues
    # The four series share the same term index, so they can be walked in step.
    rows.extend(
        {
            'model': name,
            'term': term,
            'coef': coef,
            'std_err': se,
            't': t_stat,
            'p': p_val,
            'nobs': model.nobs,
            'r2': model.rsquared,
        }
        for term, coef, se, t_stat, p_val in zip(coefs.index, coefs, ses, tvals, pvals)
    )

# Tidy table: one row per (model, term); written to export_path and previewed.
summary_df = pd.DataFrame(rows)
summary_df.to_csv(export_path, index=False)
summary_df.head(10)


Estimate model： ['Base']

                            OLS Regression Results                            
Dep. Variable:           log_emission   R-squared:                       0.328
Model:                            OLS   Adj. R-squared:                  0.323
Method:                 Least Squares   F-statistic:                     62.67
Date:                Thu, 28 Aug 2025   Prob (F-statistic):           6.78e-23
Time:                        21:53:59   Log-Likelihood:                -635.84
No. Observations:                 260   AIC:                             1278.
Df Residuals:                     257   BIC:                             1288.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -5.270

Unnamed: 0,model,term,coef,std_err,t,p,nobs,r2
0,Base,Intercept,-5.270144,0.978155,-5.387841,1.611866e-07,260.0,0.327831
1,Base,log_turnover,0.804695,0.203511,3.954066,9.939493e-05,260.0,0.327831
2,Base,log_employee,0.376692,0.250532,1.50357,0.1339199,260.0,0.327831


In [10]:
# Side-by-side view of the two slope coefficients: terms as rows, models as columns.
slope_terms = ['log_turnover','log_employee']
slopes_only = summary_df.loc[summary_df['term'].isin(slope_terms)]
pivot = slopes_only.pivot(index='term', columns='model', values='coef')
print(pivot.round(4))


model           Base
term                
log_employee  0.3767
log_turnover  0.8047


In [12]:
# Keep only the intercept and the two slope terms, then write them to a separate CSV.
headline_terms = ['Intercept','log_turnover','log_employee']
brief = summary_df[summary_df['term'].isin(headline_terms)].copy()
brief.to_csv(r"C:/Users/ZITIAO/Desktop/industry_regression_brief.csv", index=False)
print("save：industry_regression_brief.csv")


save：industry_regression_brief.csv


In [14]:
import re
import numpy as np
import pandas as pd
from linearmodels.panel import PanelOLS

# Work on a copy of df and give it whitespace-normalised header labels.
tidy_labels = []
for label in df.columns:
    tidy_labels.append(re.sub(r'\s+', ' ', str(label)).strip())
src = df.copy()
src.columns = tidy_labels

def find_col(name_options, columns=None):
    """Return the column matching the first usable entry of ``name_options``.

    Comparison ignores case, leading/trailing whitespace and repeated
    internal whitespace. Options are tried in order, so earlier entries take
    precedence.

    Parameters
    ----------
    name_options : iterable of str
        Acceptable column names, most preferred first.
    columns : iterable, optional
        Column labels to search. Defaults to the columns of the notebook's
        global ``src`` frame, which keeps existing calls unchanged.

    Returns
    -------
    The matching original column label, or ``None`` when nothing matches.
    """
    if columns is None:
        columns = src.columns
    # Normalised name -> original label (a later column overwrites an earlier
    # one if both normalise to the same key).
    canon = {re.sub(r'\s+', ' ', str(c)).strip().lower(): c for c in columns}
    for opt in name_options:
        key = re.sub(r'\s+', ' ', opt).strip().lower()
        if key in canon:
            return canon[key]
    return None

# Locate the panel identifiers, the outcome, the regressors and the optional
# sector columns. Each list is a set of accepted spellings.
col_firm   = find_col(["OPERATOR NAME","operator name","company","company name"])
col_year   = find_col(["Year","year"])
col_emis   = find_col(["Total Emission (kg)","total emission (kg)","emission"])
col_emp    = find_col(["Employee","Employees"])
col_turn_m = find_col(["Turnover_million","turnover_million"])
col_turn   = find_col(["Turnover","turnover"])
col_sector = find_col(["REGULATED INDUSTRY SECTOR","Sector","sector"])
col_subsec = find_col(["REGULATED INDUSTRY SUB SECTOR","SubSector","subsector"])

# Firm, year, emission, employee and at least one turnover column are mandatory.
# Report exactly which ones are absent instead of raising an empty KeyError.
required = {
    "firm / operator name": col_firm,
    "year": col_year,
    "total emission": col_emis,
    "employee": col_emp,
    "turnover (turnover_million or turnover)": col_turn_m or col_turn,
}
missing = [label for label, col in required.items() if not col]
if missing:
    raise KeyError("missing required column(s): " + ", ".join(missing))

# Prefer turnover_million when present, otherwise use turnover.
turn_used = col_turn_m if col_turn_m else col_turn

def first_valid(s):
    """Return the first non-missing value of ``s``, or NaN if there is none."""
    non_missing = s.dropna()
    if len(non_missing) == 0:
        return np.nan
    return non_missing.iloc[0]

# Collapse to one row per (firm, year): emissions are summed, while employee,
# turnover and the optional sector columns take their first non-missing value.
agg_spec = {
    col_emis: 'sum',
    col_emp: first_valid,
    turn_used: first_valid,
}
if col_sector:
    agg_spec[col_sector] = first_valid
if col_subsec:
    agg_spec[col_subsec] = first_valid

agg = src.groupby([col_firm, col_year], as_index=False).agg(agg_spec)

def safe_log(x):
    """Natural log of ``x`` with non-numeric and non-positive entries set to NaN."""
    numeric = pd.to_numeric(x, errors="coerce")
    strictly_positive = numeric.where(numeric > 0)
    return np.log(strictly_positive)

# Give the aggregated columns fixed names, whatever they were called in the file.
rename_map = {
    col_firm: 'firm',
    col_year: 'year',
    col_emis: 'emission_kg',
    col_emp: 'employee',
    turn_used: 'turnover_used',
}
if col_sector:
    rename_map[col_sector] = 'Sector'
if col_subsec:
    rename_map[col_subsec] = 'SubSector'
agg = agg.rename(columns=rename_map)

# Year becomes a nullable integer; unparseable values turn into missing.
with np.errstate(all='ignore'):
    year_numeric = pd.to_numeric(agg['year'], errors='coerce')
    agg['year'] = year_numeric.astype('Int64')

# Log-transform the outcome and the two regressors.
for log_col, level_col in (('log_emission', 'emission_kg'),
                           ('log_employee', 'employee'),
                           ('log_turnover', 'turnover_used')):
    agg[log_col] = safe_log(agg[level_col])

core_cols = ['firm','year','log_emission','log_employee','log_turnover']
keep_cols = core_cols + [c for c in ('Sector', 'SubSector') if c in agg.columns]

# Infinities count as missing; rows lacking any core variable are dropped.
panel = agg[keep_cols].replace([np.inf, -np.inf], np.nan)
panel = panel.dropna(subset=core_cols)

for cat_col in ('Sector', 'SubSector'):
    if cat_col in panel.columns:
        panel[cat_col] = panel[cat_col].astype('category')

# Index by (firm, year) and sort by that index.
panel = panel.set_index(['firm','year']).sort_index()

n_firms = panel.index.levels[0].size
n_years = panel.index.levels[1].size
print(f"Panel shape: {panel.shape}, firms={n_firms}, years≈{n_years}")
print(panel[['log_emission','log_turnover','log_employee']].head())

# Model A: entity and time effects, fitted with a clustered covariance
# (cluster_entity and cluster_time both enabled).
formula_A = "log_emission ~ 1 + log_turnover + log_employee + EntityEffects + TimeEffects"
mod_A = PanelOLS.from_formula(formula_A, data=panel)
res_A = mod_A.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

banner_A = "=" * 80
print("\n" + banner_A)
print("Model A — Two-way FE (firm + year)")
print(banner_A)
print(res_A.summary)

# Model B: Sector dummies with time effects (no entity effects).
# res_B stays None when the panel has no Sector column.
res_B = None
if 'Sector' in panel.columns:
    # get_dummies encodes a missing Sector as an all-zero row, which would
    # silently pool those observations with the dropped reference category.
    # Exclude them explicitly so the baseline only contains firms that really
    # belong to it.
    panel_B = panel[panel['Sector'].notna()]
    # dtype=float keeps the design matrix numeric regardless of the pandas
    # default dummy dtype.
    d_sec = pd.get_dummies(panel_B['Sector'], prefix='Sector', drop_first=True, dtype=float)
    X_B = pd.concat([panel_B[['log_turnover','log_employee']], d_sec], axis=1)
    data_B = pd.concat([panel_B[['log_emission']], X_B], axis=1).dropna()
    mod_B = PanelOLS(
        dependent=data_B['log_emission'],
        exog=data_B.drop(columns=['log_emission']),
        time_effects=True
    )
    res_B = mod_B.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)
    print("\n" + "="*80)
    print("Model B — Sector dummies + Year FE (no firm FE)")
    print("="*80)
    print(res_B.summary)


# Model C: SubSector dummies with time effects (no entity effects).
# res_C stays None when the panel has no SubSector column.
res_C = None
if 'SubSector' in panel.columns:
    # get_dummies encodes a missing SubSector as an all-zero row, which would
    # silently pool those observations with the dropped reference category.
    # Exclude them explicitly before building the dummies.
    panel_C = panel[panel['SubSector'].notna()]
    # dtype=float keeps the design matrix numeric regardless of the pandas
    # default dummy dtype.
    d_sub = pd.get_dummies(panel_C['SubSector'], prefix='SubSector', drop_first=True, dtype=float)
    X_C = pd.concat([panel_C[['log_turnover','log_employee']], d_sub], axis=1)
    data_C = pd.concat([panel_C[['log_emission']], X_C], axis=1).dropna()
    mod_C = PanelOLS(
        dependent=data_C['log_emission'],
        exog=data_C.drop(columns=['log_emission']),
        time_effects=True
    )
    res_C = mod_C.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)
    print("\n" + "="*80)
    print("Model C — SubSector dummies + Year FE (no firm FE)")
    print("="*80)
    print(res_C.summary)

# Accumulator for the tidy panel-model summary; collect() appends to it.
rows = []
def collect(res, name):
    """Append one record per estimated coefficient of ``res`` to ``rows``.

    Parameters
    ----------
    res : fitted panel result or None
        Must expose ``params``, ``pvalues``, ``tstats`` and ``nobs``.
        ``std_errors``, ``rsquared_within`` and ``rsquared_overall`` are used
        when available and recorded as NaN otherwise. ``None`` (model not
        estimated) is ignored.
    name : str
        Label stored in the ``model`` field of every record.
    """
    if res is None:
        return
    params = res.params
    pvals  = res.pvalues
    tvals  = res.tstats
    ses    = getattr(res, 'std_errors', None)
    # The within/overall R² are attributes of the result object itself;
    # `res.rsquared` is a plain number and has no `.within` / `.overall`,
    # so looking them up there always produced NaN.
    rsq_w  = getattr(res, 'rsquared_within',  np.nan)
    rsq_o  = getattr(res, 'rsquared_overall', np.nan)
    for k in params.index:
        rows.append({
            'model': name, 'term': k,
            'coef': params[k],
            't': tvals.get(k, np.nan),
            'p': pvals.get(k, np.nan),
            'R2_within': rsq_w, 'R2_overall': rsq_o,
            'nobs': res.nobs,
            # Appended last so the existing fields keep their positions.
            'std_err': ses.get(k, np.nan) if ses is not None else np.nan,
        })

# Gather coefficients from each fitted model (collect skips results that are None).
for fitted, label in ((res_A, "Two-way FE"),
                      (res_B, "Sector + Year FE"),
                      (res_C, "SubSector + Year FE")):
    collect(fitted, label)

# One row per (model, term); preview the table and write it to disk.
summary = pd.DataFrame(rows)
print("\nTidy summary (head):")
print(summary.head(10))

summary.to_csv(r"C:/Users/ZITIAO/Desktop/panel_ols_summary.csv", index=False)


Panel shape: (125, 3), firms=26, years≈8
                    log_emission  log_turnover  log_employee
firm          year                                          
anglian water 2016      7.427275      7.077498      8.396606
              2017      7.172348      7.112327      8.403352
              2018      6.940997      7.130018      8.433812
              2019      6.945388      7.211557      8.468843
              2020      7.097780      7.258412      8.513988

Model A — Two-way FE (firm + year)
                          PanelOLS Estimation Summary                           
Dep. Variable:           log_emission   R-squared:                        0.0375
Estimator:                   PanelOLS   R-squared (Between):              0.5402
No. Observations:                 125   R-squared (Within):               0.0133
Date:                Thu, Aug 28 2025   R-squared (Overall):              0.5676
Time:                        21:56:52   Log-likelihood                   -119.43
Cov. Estim