In [5]:

# TASK 3: Transaction Management

import re
import duckdb
import pandas as pd
from IPython.display import display
try:
    from google.colab import files
    IN_COLAB = True
except:
    IN_COLAB = False

def clean_col(c: str) -> str:
    c = c.strip()
    c = c.replace('%', '_PCT')
    c = re.sub(r'[^0-9A-Za-z_]', '_', c)
    c = re.sub(r'_+', '_', c)
    return c.upper()

def load_barra_rsk(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, skiprows=1, sep=",", quotechar='"', low_memory=False)
    df.columns = [clean_col(c) for c in df.columns]
    if "BETA" not in df.columns and "HBTA" in df.columns:
        df["BETA"] = df["HBTA"]
    for col in ["BETA", "HBTA", "PRICE"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    if "IND1" in df.columns:
        df["IND1"] = pd.to_numeric(df["IND1"], errors="coerce").astype("Int64")
    return df

def safe_contains(d: dict, key: str) -> bool:
    return any(k.lower() == key.lower() for k in d.keys())

if IN_COLAB:
    print("⬆️ Please upload: USE3L0712.RSK, USE3L0812.RSK (and optionally Industry_to_Sector_Mapping_CLEAN.csv)")
    uploaded = files.upload()
    have_map = safe_contains(uploaded, "Industry_to_Sector_Mapping_CLEAN.csv")
else:
    uploaded = {}
    have_map = False

def get_name(possible):
    for k in uploaded.keys():
        if k.lower() == possible.lower():
            return k
    return possible

f2007 = get_name("USE3L0712.RSK")
f2008 = get_name("USE3L0812.RSK")
map_csv = get_name("Industry_to_Sector_Mapping_CLEAN.csv")

df2007 = load_barra_rsk(f2007)
df2008 = load_barra_rsk(f2008)

needed_2007 = {"BARRID", "NAME", "IND1", "BETA"}
missing_2007 = [c for c in needed_2007 if c not in df2007.columns]
if missing_2007:
    raise RuntimeError(f"2007 file missing required columns: {missing_2007}. Found: {list(df2007.columns)[:25]} ...")

print("2007 rows:", len(df2007), " | columns:", len(df2007.columns))
print("2008 rows:", len(df2008), " | columns:", len(df2008.columns))

df_map = None
if have_map:
    df_map = pd.read_csv(map_csv)
    df_map.columns = [clean_col(c) for c in df_map.columns]
    if "US_E3_INDUSTRY_CODE" in df_map.columns:
        df_map["US_E3_INDUSTRY_CODE"] = pd.to_numeric(df_map["US_E3_INDUSTRY_CODE"], errors="coerce").astype("Int64")
    else:
        print(" Mapping file missing US_E3_INDUSTRY_CODE; sector names will be skipped.")
        df_map = None

con = duckdb.connect(database=":memory:")
con.register("df2007", df2007)
con.register("df2008", df2008)
con.execute("CREATE OR REPLACE TABLE barra2007 AS SELECT * FROM df2007;")
con.execute("CREATE OR REPLACE TABLE barra2008 AS SELECT * FROM df2008;")
if df_map is not None:
    con.register("df_map", df_map)
    con.execute("CREATE OR REPLACE TABLE sector_map AS SELECT * FROM df_map;")

print("\nDatabase connected & tables created.")
display(con.execute("PRAGMA table_info('barra2007')").df().head(20))

targets = con.execute("""
    SELECT BARRID, NAME, BETA, IND1
    FROM barra2007
    WHERE NAME ILIKE '%BEAR%' AND NAME ILIKE '%STEARNS%'
       OR NAME ILIKE '%LEHMAN%'
""").df()

print("\n== 3A: Identified target rows ==")
display(targets)
targets.to_csv("task3_targets.csv", index=False)

if targets.empty:
    print("No matching rows for Bear Stearns / Lehman found in 2007 file. The demo will still run but changes will be zero.")

con.execute("""
    CREATE OR REPLACE TABLE backup_bears AS
    SELECT * FROM barra2007
    WHERE NAME ILIKE '%BEAR%' AND NAME ILIKE '%STEARNS%'
       OR NAME ILIKE '%LEHMAN%';
""")
backup_rows = con.execute("SELECT BARRID, NAME, BETA, IND1 FROM backup_bears;").df()
print("\n== 3B: Backup rows saved ==")
display(backup_rows)
backup_rows.to_csv("task3_backup_rows.csv", index=False)

baseline = con.execute("SELECT COUNT(*) AS n, ROUND(AVG(BETA),6) AS avg_beta FROM barra2007;").df()
print("\n== 3C: Baseline avg beta (pre-update):", float(baseline['avg_beta'].iloc[0]))

con.execute("""
    CREATE OR REPLACE TABLE baseline_sector AS
    SELECT IND1, AVG(BETA) AS avg_beta_2007
    FROM barra2007
    GROUP BY IND1;
""")
baseline_sector = con.execute("SELECT * FROM baseline_sector ORDER BY IND1;").df()
print("\nSaved baseline sector averages (by IND1).")
display(baseline_sector.head(10))
baseline_sector.to_csv("task3_baseline_sector.csv", index=False)

print("\n=== 3D: TRANSACTION DEMO (ROLLBACK) ===")
con.execute("BEGIN TRANSACTION;")
con.execute("""
    UPDATE barra2007
    SET BETA = 5.0
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""")

post_rows = con.execute("""
    SELECT BARRID, NAME, BETA, IND1
    FROM barra2007
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""").df()
print("\n-- Post-update rows:")
display(post_rows)
post_rows.to_csv("task3_post_update_rows.csv", index=False)

post_avg = con.execute("SELECT ROUND(AVG(BETA),6) AS avg_beta_post FROM barra2007;").df()
print("Post-update avg beta:", float(post_avg['avg_beta_post'].iloc[0]))

con.execute("ROLLBACK;")
print("\nTransaction rolled back.")

restored = con.execute("""
    SELECT BARRID, NAME, BETA, IND1
    FROM barra2007
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""").df()
print("\n== 3E: Restored rows after rollback ==")
display(restored)
restored.to_csv("task3_restored_rows.csv", index=False)

print("\n=== 3F: Sector-level delta demo ===")
con.execute("CREATE OR REPLACE TABLE barra2007_work AS SELECT * FROM barra2007;")
con.execute("""
    UPDATE barra2007_work
    SET BETA = 5.0
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""")

sector_after = con.execute("""
    SELECT IND1, AVG(BETA) AS avg_beta_after
    FROM barra2007_work
    GROUP BY IND1
    ORDER BY IND1;
""").df()

sector_compare = baseline_sector.merge(sector_after, on="IND1", how="outer").fillna(0)
sector_compare["delta"] = (sector_compare["avg_beta_after"] - sector_compare["avg_beta_2007"]).round(6)

if df_map is not None and "US_E3_INDUSTRY_CODE" in df_map.columns and "SECTOR" in df_map.columns:
    sector_compare = sector_compare.merge(
        df_map[["US_E3_INDUSTRY_CODE","SECTOR"]].rename(columns={"US_E3_INDUSTRY_CODE":"IND1"}),
        on="IND1", how="left"
    )
    cols = ["SECTOR","IND1","avg_beta_2007","avg_beta_after","delta"]
    sector_compare = sector_compare[cols]

print("\n== Sector-level comparison (after simulated shock) ==")
display(sector_compare.head(15))
sector_compare.to_csv("task3_sector_delta.csv", index=False)

print("\n=== BONUS: Partial rollback demo (manual) ===")

con.execute("CREATE OR REPLACE TABLE barra2007_work AS SELECT * FROM barra2007;")

con.execute("""
    UPDATE barra2007_work
    SET BETA = 5.0
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""")

con.execute("""
    UPDATE barra2007_work
    SET BETA = 10.0
    WHERE NAME ILIKE '%LEHMAN%';
""")

print("\n-- After Lehman escalation:")
display(con.execute("""
    SELECT BARRID, NAME, BETA
    FROM barra2007_work
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""").df())

con.execute("""
    UPDATE barra2007_work w
    SET BETA = b.BETA
    FROM backup_bears b
    WHERE w.BARRID = b.BARRID AND w.NAME ILIKE '%LEHMAN%';
""")

print("\n-- After manual rollback of Lehman (Bear Stearns stays at 5.0):")
display(con.execute("""
    SELECT BARRID, NAME, BETA
    FROM barra2007_work
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""").df())


print("\n=== BONUS: COMMIT demo (safe copy) ===")
con.execute("CREATE OR REPLACE TABLE barra2007_work2 AS SELECT * FROM barra2007;")
con.execute("BEGIN TRANSACTION;")
con.execute("""
    UPDATE barra2007_work2
    SET BETA = 5.0
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""")
con.execute("COMMIT;")

committed = con.execute("""
    SELECT BARRID, NAME, BETA
    FROM barra2007_work2
    WHERE BARRID IN (SELECT BARRID FROM backup_bears);
""").df()
print("\nCommitted changes in barra2007_work2 (original unchanged).")
display(committed)

con.execute("""
    CREATE OR REPLACE TABLE task3_summary AS
    SELECT 'baseline' AS phase, AVG(BETA) AS avg_beta FROM barra2007
    UNION ALL
    SELECT 'work_committed' AS phase, AVG(BETA) FROM barra2007_work2;
""")
summary_df = con.execute("SELECT * FROM task3_summary;").df()
print("\n== BONUS: Summary table ==")
display(summary_df)
con.execute("COPY (SELECT * FROM task3_summary) TO 'task3_summary.csv' (HEADER, DELIMITER ',');")
print("Created task3_summary.csv (baseline vs committed averages).")


⬆️ Please upload: USE3L0712.RSK, USE3L0812.RSK (and optionally Industry_to_Sector_Mapping_CLEAN.csv)


Saving USE3L0712.RSK to USE3L0712.RSK
Saving USE3L0812.RSK to USE3L0812.RSK
2007 rows: 10214  | columns: 49
2008 rows: 9625  | columns: 49

Database connected & tables created.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,BARRID,VARCHAR,False,,False
1,1,TICKER,VARCHAR,False,,False
2,2,CUSIP,VARCHAR,False,,False
3,3,NAME,VARCHAR,False,,False
4,4,HBTA,DOUBLE,False,,False
5,5,BETA,DOUBLE,False,,False
6,6,SRISK_PCT,DOUBLE,False,,False
7,7,TRISK_PCT,DOUBLE,False,,False
8,8,VOLTILTY,DOUBLE,False,,False
9,9,MOMENTUM,DOUBLE,False,,False



== 3A: Identified target rows ==


Unnamed: 0,BARRID,NAME,BETA,IND1
0,USABM91,BEAR STEARNS COS INC,2.019,51
1,USAI521,LEHMAN T H &CO INC,0.619,45
2,USARHS2,LEHMAN BROS HLDGS INC,1.89,51



== 3B: Backup rows saved ==


Unnamed: 0,BARRID,NAME,BETA,IND1
0,USABM91,BEAR STEARNS COS INC,2.019,51
1,USAI521,LEHMAN T H &CO INC,0.619,45
2,USARHS2,LEHMAN BROS HLDGS INC,1.89,51



== 3C: Baseline avg beta (pre-update): 1.192051

Saved baseline sector averages (by IND1).


Unnamed: 0,IND1,avg_beta_2007
0,1,1.607389
1,2,0.845117
2,3,1.327789
3,4,1.256273
4,5,1.20995
5,6,0.924562
6,7,0.970636
7,8,0.824806
8,9,0.878366
9,10,0.69475



=== 3D: TRANSACTION DEMO (ROLLBACK) ===

-- Post-update rows:


Unnamed: 0,BARRID,NAME,BETA,IND1
0,USABM91,BEAR STEARNS COS INC,5.0,51
1,USAI521,LEHMAN T H &CO INC,5.0,45
2,USARHS2,LEHMAN BROS HLDGS INC,5.0,51


Post-update avg beta: 1.193077

Transaction rolled back.

== 3E: Restored rows after rollback ==


Unnamed: 0,BARRID,NAME,BETA,IND1
0,USABM91,BEAR STEARNS COS INC,2.019,51
1,USAI521,LEHMAN T H &CO INC,0.619,45
2,USARHS2,LEHMAN BROS HLDGS INC,1.89,51



=== 3F: Sector-level delta demo ===

== Sector-level comparison (after simulated shock) ==


Unnamed: 0,IND1,avg_beta_2007,avg_beta_after,delta
0,1,1.607389,1.607389,0.0
1,2,0.845117,0.845117,0.0
2,3,1.327789,1.327789,0.0
3,4,1.256273,1.256273,0.0
4,5,1.20995,1.20995,0.0
5,6,0.924562,0.924562,0.0
6,7,0.970636,0.970636,0.0
7,8,0.824806,0.824806,0.0
8,9,0.878366,0.878366,0.0
9,10,0.69475,0.69475,0.0



=== BONUS: Partial rollback demo (manual) ===

-- After Lehman escalation:


Unnamed: 0,BARRID,NAME,BETA
0,USABM91,BEAR STEARNS COS INC,5.0
1,USAI521,LEHMAN T H &CO INC,10.0
2,USARHS2,LEHMAN BROS HLDGS INC,10.0



-- After manual rollback of Lehman (Bear Stearns stays at 5.0):


Unnamed: 0,BARRID,NAME,BETA
0,USABM91,BEAR STEARNS COS INC,5.0
1,USAI521,LEHMAN T H &CO INC,0.619
2,USARHS2,LEHMAN BROS HLDGS INC,1.89



=== BONUS: COMMIT demo (safe copy) ===

Committed changes in barra2007_work2 (original unchanged).


Unnamed: 0,BARRID,NAME,BETA
0,USABM91,BEAR STEARNS COS INC,5.0
1,USAI521,LEHMAN T H &CO INC,5.0
2,USARHS2,LEHMAN BROS HLDGS INC,5.0



== BONUS: Summary table ==


Unnamed: 0,phase,avg_beta
0,baseline,1.192051
1,work_committed,1.193077


Created task3_summary.csv (baseline vs committed averages).
