In [1]:
import pandas as pd
import re
import numpy as np

#### Reading .txts

In [2]:
ruta = "lob_lobbying.txt"

# Forcing reading by ignoring quotation marks to avoid parsing errors
df_raw = pd.read_csv(
    ruta,
    sep="|",
    engine="python",
    header=None,
    quoting=3,               
    encoding="latin1",        
    on_bad_lines='skip'
)

# Cleaning
df_clean = df_raw.dropna(axis=1, how="all")
df_clean = df_clean.loc[:, ~df_clean.columns.duplicated()]

print("Columns:", df_clean.shape[1])
print(df_clean.head(3).T)

Columns: 32
                                       0  \
0                                    NaN   
1   82c5f661-a637-45ad-a3a6-b5ba18cf8962   
2                                      ,   
3         ASTRAZENECA PHARMACEUTICALS LP   
4                                      ,   
5            AstraZeneca Pharmaceuticals   
6                                     ,,   
7         ASTRAZENECA PHARMACEUTICALS LP   
8                                      ,   
9            AstraZeneca Pharmaceuticals   
10                                     ,   
11                       AstraZeneca PLC   
12                           ,1370000.0,   
13                                 H4300   
14                                     ,   
15                                 pac     
16                                     ,   
17                                     x   
18                                     ,   
19                                         
20                                     ,   
21                  

In [3]:
ruta = "lob_lobbying.txt"

# Reading documents
df_raw = pd.read_csv(
    ruta,
    sep="|",
    engine="python",
    header=None,
    quoting=3,
    encoding="latin1",
    on_bad_lines='skip'
)

# Extracting just needed columns
col_client = 3
col_amount = 12
col_year = 25
col_quarter = 27

df = df_raw[[col_client, col_amount, col_year, col_quarter]].copy()
df.columns = ["client_name", "amount_raw", "year", "quarter"]
df["client_name"] = df["client_name"].astype(str).str.strip()

# Cleagning usd, year and Q columns
df["amount_usd"] = df["amount_raw"].astype(str).str.replace(",", "").str.strip()
df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce")
df["year"] = pd.to_numeric(df["year"].astype(str).str.extract(r"(\d{4})")[0], errors="coerce")
df["quarter"] = df["quarter"].astype(str).str.lower().str.extract(r"q([1-4])", expand=False)
df["quarter"] = pd.to_numeric(df["quarter"], errors="coerce")

# Cleaning NAs
df = df.dropna(subset=["client_name", "amount_usd", "year", "quarter"])

# year and Q as int
df["year"] = df["year"].astype(int)
df["quarter"] = df["quarter"].astype(int)

print(df.head())

#df.to_csv("lobbying_limpio.csv", index=False)
#print("Saved as lobbying_limpio.csv")
df["client_name"].nunique()

                                         client_name   amount_raw  year  \
0                     ASTRAZENECA PHARMACEUTICALS LP  ,1370000.0,  2021   
1               WTA -- ADVOCATES FOR RURAL BROADBAND    ,75000.0,  2021   
2                 FINANCIAL EXECUTIVES INTERNATIONAL    ,21650.0,  2021   
3                              WATEREUSE ASSOCIATION    ,30000.0,  2021   
4  ACADEMY OF NUTRITION AND DIETETICS (FORMERLY A...        ,0.0,  2021   

   quarter  amount_usd  
0        4   1370000.0  
1        4     75000.0  
2        4     21650.0  
3        4     30000.0  
4        4         0.0  


8891

In [8]:
# Uploading Keys

keys = pd.read_csv("all_firms_keys.csv", dtype=str)
name_col = "company_name" if "company_name" in keys.columns else "conm"
keys["company_name_clean"] = keys[name_col].str.upper()

# Cleaning Client Name from lobby file
def clean_name(s):
    if pd.isna(s): return ""
    s = s.upper()
    s = re.sub(r'\b(CORP(ORATION)?|INC(ORPORATED)?|LTD|LLC|CO|S\.A\.|LP|PLC|LIMITED|THE)\b', '', s)
    s = re.sub(r'[^A-Z0-9& ]+', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

df["client_clean"] = df["client_name"].map(clean_name)
keys["company_name_clean"] = keys["company_name_clean"].map(clean_name)

# Merge by excat name (cleaned name)
df_matched = df.merge(keys[["gvkey", "company_name_clean"]], how="left",
                      left_on="client_clean", right_on="company_name_clean")

print("With match:", df_matched["gvkey"].notna().sum())
print("No match:", df_matched["gvkey"].isna().sum())

# Saving no matched for manual analysis 
no_match = df_matched[df_matched["gvkey"].isna()][["client_name", "client_clean"]].drop_duplicates()
no_match.to_csv("lobbying_clients_unmatched.csv", index=False)
print("Saved lobbying_clients_unmatched.csv")

# ---------------------------
# Final Lobby File with GVKEYS before Fuzzymatch
# ---------------------------
df_final = (df_matched.dropna(subset=["gvkey"])
                     .groupby(["gvkey", "year", "quarter"], as_index=False)["amount_usd"]
                     .sum()
                     .rename(columns={"amount_usd": "lobby_usd"}))
final_csv = df_final
final_csv.to_csv("Lobby_final_gvkeys.csv", index = False)

With match: 23309
No match: 155486
Saved lobbying_clients_unmatched.csv


#### Fuzzymatching: done in local Jupyter due to limitations in Durham hub

!pip install fuzzywuzzy[speedup]
!pip install python-Levenshtein
!pip install pyarrow

import sys
!{sys.executable} -m pip install --user rapidfuzz

import pandas as pd
from rapidfuzz import process, fuzz
from rapidfuzz.fuzz import token_sort_ratio
import time
import re

##### Fuzzy matching
df = pd.read_csv("lobbying_limpio.csv", dtype=str)
keys = pd.read_csv("all_firms_keys.csv", dtype=str)

def clean_name(s):
    if pd.isna(s): return ""
    s = s.upper()
    s = re.sub(r'\b(CORP(ORATION)?|INC(ORPORATED)?|LTD|LLC|CO|S\.A\.|LP|PLC|LIMITED|THE|COMPANY)\b', '', s)
    s = re.sub(r'[^A-Z0-9& ]+', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

df["client_clean"] = df["client_name"].map(clean_name)
keys["company_name_clean"] = keys["company_name"].map(clean_name)

unique_clients = df["client_clean"].dropna().unique()
company_names = keys["company_name_clean"].dropna().unique()

matches = []
start = time.time()

for i, c in enumerate(unique_clients):
    result = process.extract(c, company_names, scorer=token_sort_ratio, limit=2)
    best_match, score = result[0][0], result[0][1]
    second_score = result[1][1] if len(result) > 1 else 0
    confidence_gap = score - second_score

    matches.append({
        "client_clean": c,
        "best_match": best_match,
        "score": score,
        "confidence_gap": confidence_gap
    })

    if i % 100 == 0:
        print(f"Completed {i} of {len(unique_clients)}")

print("Time:", round(time.time() - start, 2), "secs")

match_df = pd.DataFrame(matches)

##### Merge with gvkey
match_df = match_df.merge(keys[["company_name_clean", "gvkey"]],
                          left_on="best_match", right_on="company_name_clean",
                          how="left")

match_df.to_csv("match_fuzzy_all_clients.csv", index=False)
print("File saved as match_fuzzy_all_clients.csv")


#### Matching k > 90 score

In [10]:
# Loading Lobby file
df = pd.read_csv("lobbying_limpio.csv", dtype=str)

# Client clean
def clean_name(s):
    if pd.isna(s): return ""
    s = s.upper()
    s = re.sub(r'\b(CORP(ORATION)?|INC(ORPORATED)?|LTD|LLC|CO|S\.A\.|LP|PLC|LIMITED|THE)\b', '', s)
    s = re.sub(r'[^A-Z0-9& ]+', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

df["client_clean"] = df["client_name"].map(clean_name).str.strip()

# Numeric columns
df["amount_usd"] = pd.to_numeric(df["amount_usd"])
df["year"] = pd.to_numeric(df["year"])
df["quarter"] = pd.to_numeric(df["quarter"])

# Loading Fuzzy matching file
matches = pd.read_csv("match_fuzzy_all_clients.csv", dtype=str)
matches["score"] = pd.to_numeric(matches["score"])
matches["client_clean"] = matches["client_clean"].str.strip()

# Filtering matches above 50% confidence
matches_confident = matches[matches["score"] >= 51].copy()

# Merge
df_matched = df.merge(matches_confident[["client_clean", "gvkey"]], on="client_clean", how="left")
df_matched = df_matched.dropna(subset=["gvkey"])
df_matched["gvkey"] = df_matched["gvkey"].astype(str)

df_final = (
    df_matched.groupby(["gvkey", "year", "quarter"], as_index=False)["amount_usd"] # Adding lobby GVKEY and Q
    .sum()
    .rename(columns={"amount_usd": "lobby_usd"})
)

# log, winsor, z-score
df_final["log1p_lobby"] = np.log1p(df_final["lobby_usd"])

from scipy.stats.mstats import winsorize
df_final["log1p_lobby_w"] = winsorize(df_final["log1p_lobby"], limits=[0.01, 0.01])

df_final["log1p_lobby_std_firm"] = (
    df_final.groupby("gvkey")["log1p_lobby_w"]
    .transform(lambda x: (x - x.mean()) / x.std(ddof=0))
)

df_final.to_csv("Lobby_final_gvkeys_fuzzy.csv", index=False)
print("File saved: Lobby_final_gvkeys_fuzzy.csv")

print (df_final["gvkey"].nunique())

File saved: Lobby_final_gvkeys_fuzzy.csv
3068


In [11]:
# Checking firms with Lobby and in Final Panel
lobby = pd.read_csv("Lobby_final_gvkeys_fuzzy.csv")
lobby["gvkey"] = lobby["gvkey"].astype(str)

panel = pd.read_csv("panel_volatility_with_assets_2002_2025.csv")
panel["gvkey"] = panel["gvkey"].astype(str)

# Intersection
column_intersection = set(lobby["gvkey"]).intersection(set(panel["gvkey"]))
print("Firms with lobbying and in final panel:", len(column_intersection))


Firms with lobbying and in final panel: 1494


  panel = pd.read_csv("panel_volatility_with_assets_2002_2025.csv")


#### Merge final Panel Volatility + Lobbying

In [13]:
panel = pd.read_csv("panel_volatility_with_assets_2002_2025.csv", low_memory=False)

# Formatting columns in Panel and Lobby
panel["year"] = panel["quarter"].str.extract(r"(\d{4})").astype("Int64")
panel["quarter_num"] = panel["quarter"].str.extract(r"Q([1-4])").astype("Int64")

panel = panel.drop(columns=["quarter"])

panel = panel.rename(columns={"quarter_num": "quarter"})

lobby = pd.read_csv("Lobby_final_gvkeys_fuzzy.csv")
panel["gvkey"] = panel["gvkey"].astype(str)
lobby["gvkey"] = lobby["gvkey"].astype(str)

panel["year"] = panel["year"].astype("Int64")
panel["quarter"] = panel["quarter"].astype("Int64")
lobby["year"] = lobby["year"].astype("Int64")
lobby["quarter"] = lobby["quarter"].astype("Int64")

# Merge
panel_merged = panel.merge(lobby, on=["gvkey", "year", "quarter"], how="left")

# Creating variables with lobbying
panel_merged["lobby_usd"] = pd.to_numeric(panel_merged["lobby_usd"], errors="coerce")
panel_merged["log_lobby_usd"] = np.log1p(panel_merged["lobby_usd"])
panel_merged["has_lobbying"] = panel_merged["lobby_usd"].notna().astype(int)

# Result
panel_merged.to_csv("panel_with_lobbying.csv", index=False)
print("Obs with lobbying:", panel_merged["has_lobbying"].sum())
print("Unique firms with lobbying:", panel_merged.loc[panel_merged["has_lobbying"]==1, "gvkey"].nunique())


Obs with lobbying: 25432
Unique firms with lobbying: 1108
