# 00 – Generate synthetic corporate rating panel

This notebook creates a **synthetic panel dataset** of large corporate
obligors with:

- quarterly snapshots over several years
- internal ratings (1–10) and rating bands (Investment_Grade, Sub_IG, Watchlist, Default)
- financial / behavioural early-warning features (leverage, interest coverage, days past due)
- default and watchlist flags
- rating migration over time

Output: `../data/corp_rating_panel.csv`

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

np.random.seed(42)
pd.set_option("display.max_columns", None)


In [2]:
N_GROUPS = 80          # corporate groups
N_OBLIGORS = 300       # legal entities
YEARS = [2020, 2021, 2022, 2023]
QUARTERS = [3, 6, 9, 12]  # month numbers for Q1/Q2/Q3/Q4

sectors = [
    "Manufacturing",
    "TMT",
    "Retail",
    "Real Estate",
    "Transport",
    "Utilities",
    "Energy",
]

countries = ["JP", "CN", "BR", "US", "DE", "FR", "UK", "IT", "MX", "ES"]


In [3]:
def rating_band_from_numeric(r):
    """
    Internal rating: 1–10 (1 = best, 10 = worst).
    Map to rating band.
    """
    if r <= 4:
        return "Investment_Grade"
    elif r <= 7:
        return "Sub_IG"
    elif r <= 9:
        return "Watchlist"
    else:
        return "Default"


In [4]:
# Create corporate groups
group_ids = np.arange(1, N_GROUPS + 1)
group_names = [f"Group_{g:03d}" for g in group_ids]

groups_df = pd.DataFrame(
    {
        "group_id": group_ids,
        "group_name": group_names,
        "sector": np.random.choice(sectors, size=N_GROUPS),
        "country": np.random.choice(countries, size=N_GROUPS),
    }
)

groups_df.head()


Unnamed: 0,group_id,group_name,sector,country
0,1,Group_001,Energy,IT
1,2,Group_002,Real Estate,US
2,3,Group_003,Transport,CN
3,4,Group_004,Energy,FR
4,5,Group_005,Retail,FR


In [5]:
# Create corporate groups
group_ids = np.arange(1, N_GROUPS + 1)
group_names = [f"Group_{g:03d}" for g in group_ids]

groups_df = pd.DataFrame(
    {
        "group_id": group_ids,
        "group_name": group_names,
        "sector": np.random.choice(sectors, size=N_GROUPS),
        "country": np.random.choice(countries, size=N_GROUPS),
    }
)

groups_df.head()


Unnamed: 0,group_id,group_name,sector,country
0,1,Group_001,Real Estate,UK
1,2,Group_002,TMT,IT
2,3,Group_003,TMT,JP
3,4,Group_004,Transport,FR
4,5,Group_005,Utilities,IT


In [6]:
# Create obligors and assign each to a group
obligor_ids = np.arange(1, N_OBLIGORS + 1)
obligor_names = [f"Obligor_{i:04d}" for i in obligor_ids]

obligors_df = pd.DataFrame(
    {
        "obligor_id": obligor_ids,
        "obligor_name": obligor_names,
        "group_id": np.random.choice(group_ids, size=N_OBLIGORS),
    }
)

# Join sector / country from groups
obligors_df = obligors_df.merge(groups_df[["group_id", "group_name", "sector", "country"]],
                                on="group_id", how="left")

obligors_df.head()


Unnamed: 0,obligor_id,obligor_name,group_id,group_name,sector,country
0,1,Obligor_0001,8,Group_008,Utilities,CN
1,2,Obligor_0002,7,Group_007,Transport,US
2,3,Obligor_0003,67,Group_067,Utilities,BR
3,4,Obligor_0004,17,Group_017,TMT,BR
4,5,Obligor_0005,33,Group_033,Energy,CN


In [9]:
snapshots = []
for year in YEARS:
    for month in QUARTERS:
        snapshots.append(pd.Timestamp(year=year, month=month, day=30))

snapshots = sorted(snapshots)
len(snapshots), snapshots[:4]


(16,
 [Timestamp('2020-03-30 00:00:00'),
  Timestamp('2020-06-30 00:00:00'),
  Timestamp('2020-09-30 00:00:00'),
  Timestamp('2020-12-30 00:00:00')])

In [10]:
panel = (
    pd.MultiIndex.from_product(
        [obligors_df["obligor_id"], snapshots],
        names=["obligor_id", "as_of_date"]
    )
    .to_frame(index=False)
)

panel.shape, panel.head()


((4800, 2),
    obligor_id as_of_date
 0           1 2020-03-30
 1           1 2020-06-30
 2           1 2020-09-30
 3           1 2020-12-30
 4           1 2021-03-30)

In [11]:
df = panel.merge(obligors_df, on="obligor_id", how="left")
df.head()


Unnamed: 0,obligor_id,as_of_date,obligor_name,group_id,group_name,sector,country
0,1,2020-03-30,Obligor_0001,8,Group_008,Utilities,CN
1,1,2020-06-30,Obligor_0001,8,Group_008,Utilities,CN
2,1,2020-09-30,Obligor_0001,8,Group_008,Utilities,CN
3,1,2020-12-30,Obligor_0001,8,Group_008,Utilities,CN
4,1,2021-03-30,Obligor_0001,8,Group_008,Utilities,CN


In [12]:
first_date = snapshots[0]

# Base rating distribution (more in 3–7 range)
base_ratings = np.random.choice(
    [2, 3, 4, 5, 6, 7, 8, 9],
    size=N_OBLIGORS,
    p=[0.05, 0.12, 0.18, 0.20, 0.18, 0.14, 0.08, 0.05]
)

base_rating_df = pd.DataFrame(
    {
        "obligor_id": obligors_df["obligor_id"],
        "rating_numeric": base_ratings,
    }
)

base_rating_df["rating_band"] = base_rating_df["rating_numeric"].apply(rating_band_from_numeric)

base_rating_df.head()


Unnamed: 0,obligor_id,rating_numeric,rating_band
0,1,5,Sub_IG
1,2,7,Sub_IG
2,3,9,Watchlist
3,4,6,Sub_IG
4,5,4,Investment_Grade


In [13]:
def simulate_rating_path(base_rating, n_periods, downgrade_bias=0.05):
    """
    Simulate a path of length n_periods for one obligor.
    downgrade_bias > 0 makes downgrades slightly more likely than upgrades.
    """
    ratings = [base_rating]
    for t in range(1, n_periods):
        current = ratings[-1]

        if current == 10:
            # Once defaulted, stay in default
            ratings.append(10)
            continue

        # base probabilities for change: -1, 0, +1
        # more probability mass on 0
        moves = np.array([-1, 0, 1])
        probs = np.array([0.15, 0.70, 0.15])

        # add small downgrade bias
        probs = probs + np.array([-downgrade_bias, 0, downgrade_bias])
        probs = np.clip(probs, 0.01, 0.98)
        probs = probs / probs.sum()

        move = np.random.choice(moves, p=probs)

        new_rating = current + move
        new_rating = min(max(new_rating, 1), 10)

        # small chance of jump to default from very weak names (8–9)
        if current >= 8 and np.random.rand() < 0.02:
            new_rating = 10

        ratings.append(new_rating)

    return ratings


In [14]:
n_periods = len(snapshots)

rating_records = []

for _, row in obligors_df.iterrows():
    oid = row["obligor_id"]
    base_rating = base_rating_df.loc[base_rating_df["obligor_id"] == oid, "rating_numeric"].iloc[0]

    path = simulate_rating_path(base_rating, n_periods=n_periods, downgrade_bias=0.05)

    for t, as_of in enumerate(snapshots):
        rating_records.append(
            {"obligor_id": oid, "as_of_date": as_of, "rating_numeric": path[t]}
        )

rating_df = pd.DataFrame(rating_records)
rating_df["rating_band"] = rating_df["rating_numeric"].apply(rating_band_from_numeric)

rating_df.head()


Unnamed: 0,obligor_id,as_of_date,rating_numeric,rating_band
0,1,2020-03-30,5,Sub_IG
1,1,2020-06-30,5,Sub_IG
2,1,2020-09-30,5,Sub_IG
3,1,2020-12-30,5,Sub_IG
4,1,2021-03-30,5,Sub_IG


In [15]:
df = df.merge(rating_df, on=["obligor_id", "as_of_date"], how="left")

# Normalize rating to risk score (higher = riskier)
df["risk_score"] = df["rating_numeric"] / 10.0

# Leverage: higher for riskier names
base_leverage = np.random.normal(loc=3.0, scale=0.7, size=len(df))  # debt / EBITDA
df["leverage"] = base_leverage + 2.0 * df["risk_score"]
df["leverage"] = df["leverage"].clip(lower=0.5)

# Interest coverage: lower for riskier names
base_icr = np.random.normal(loc=5.0, scale=1.0, size=len(df))
df["interest_coverage"] = base_icr - 3.0 * df["risk_score"]
df["interest_coverage"] = df["interest_coverage"].clip(lower=0.1)

# EBITDA margin: lower for riskier names
base_margin = np.random.normal(loc=0.18, scale=0.05, size=len(df))
df["ebitda_margin"] = base_margin - 0.08 * df["risk_score"]
df["ebitda_margin"] = df["ebitda_margin"].clip(lower=-0.05, upper=0.35)

# Days past due: mostly 0, but higher for weak ratings
base_dpd = np.random.poisson(lam=1.0, size=len(df))
extra_dpd = (df["rating_numeric"] >= 7).astype(int) * np.random.poisson(lam=5.0, size=len(df))
df["days_past_due"] = base_dpd + extra_dpd


In [16]:
# exposure_at_default proxy ~ outstanding amount
base_ead = np.random.lognormal(mean=13.0, sigma=0.7, size=len(df))  # in currency units
df["exposure_at_default"] = base_ead / 1e6  # store in millions for readability

# default flag: rating 10 or high risk_score with dpd
df["is_default"] = ((df["rating_numeric"] == 10) |
                    ((df["rating_numeric"] >= 8) & (df["days_past_due"] > 30))).astype(int)

# watchlist flag: rating 8–9 but not default
df["is_watchlist"] = (
    (df["rating_band"] == "Watchlist") & (df["is_default"] == 0)
).astype(int)


In [17]:
df = df.sort_values(["obligor_id", "as_of_date"])

df["prev_rating"] = df.groupby("obligor_id")["rating_numeric"].shift(1)
df["rating_change"] = df["rating_numeric"] - df["prev_rating"]

df["is_downgrade"] = (df["rating_change"] > 0).astype(int)
df["is_upgrade"] = (df["rating_change"] < 0).astype(int)


In [18]:
# Reorder / select columns
cols = [
    "obligor_id",
    "obligor_name",
    "group_id",
    "group_name",
    "country",
    "sector",
    "as_of_date",
    "rating_numeric",
    "rating_band",
    "prev_rating",
    "rating_change",
    "is_upgrade",
    "is_downgrade",
    "risk_score",
    "leverage",
    "interest_coverage",
    "ebitda_margin",
    "days_past_due",
    "exposure_at_default",
    "is_watchlist",
    "is_default",
]

df_final = df[cols].copy()

data_dir = Path("../data")
data_dir.mkdir(exist_ok=True)

output_path = data_dir / "corp_rating_panel.csv"
df_final.to_csv(output_path, index=False)

output_path, df_final.shape


(WindowsPath('../data/corp_rating_panel.csv'), (4800, 21))

## Output

This notebook generates a synthetic **corporate rating panel** and saves it as:

- `../data/corp_rating_panel.csv`

Each row is an **obligor–quarter** with:

- internal rating (1–10) and rating_band (Investment_Grade / Sub_IG / Watchlist / Default)
- early-warning features (leverage, interest coverage, EBITDA margin, days past due)
- exposure_at_default (in millions)
- default and watchlist flags
- rating_change, is_upgrade, is_downgrade versus previous quarter

This dataset will be used in:

- `01_eda_rating_migration.ipynb` – rating distributions, migration matrices, and
  default rates by rating band.
- `02_early_warning_model.ipynb` – baseline model to flag high-risk obligors
  based on ratings and early-warning features.
