# Project 1 – Synthetic corporate exposure data generator

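This notebook creates a synthetic `corp_exposure_snapshot.csv` file that mimics a
wholesale / corporate loan book: multiple facilities per obligor, obligors grouped
into corporate groups, with countries, sectors, and internal ratings. It also writes
a companion `corp_limits.csv` with single-name, sector, and country limits derived
from the generated exposures.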


In [1]:
# 00_generate_corp_exposure_data.ipynb

import numpy as np
import pandas as pd
from pathlib import Path

# Seed NumPy's legacy global RNG so every run regenerates the same dataset.
np.random.seed(42)

# -------------------------------------------------------------------
# 1. Parameters & lookup tables
# -------------------------------------------------------------------
# Portfolio size: corporate groups, individual obligors, facilities.
n_groups, n_obligors, n_facilities = 80, 200, 1000

# Categorical vocabularies the sections below sample from.
regions = [
    "North America",
    "EMEA",
    "APAC",
    "LATAM",
]
countries = [
    "US", "DE", "FR", "UK", "IT", "ES",
    "CN", "JP", "BR", "MX", "KZ",
]
sectors = [
    "Manufacturing",
    "Energy",
    "Real_Estate",
    "TMT",
    "Utilities",
    "Retail",
    "Transport",
]
product_types = [
    "Term_Loan",
    "Revolver",
    "Guarantee",
    "Derivatives",
]
currencies = [
    "USD",
    "EUR",
    "GBP",
]

# Output folder for the generated CSV files, relative to the working directory.
# Created together with any missing parents; an existing folder is not an error.
data_dir = Path("..") / "data"
data_dir.mkdir(exist_ok=True, parents=True)

# -------------------------------------------------------------------
# 2. Create corporate groups
# -------------------------------------------------------------------
# One row per corporate group: sequential id, display name, domicile and sector.
group_ids = np.arange(1, n_groups + 1)
group_names = [f"Group_{i:03d}" for i in group_ids]
group_country = np.random.choice(countries, size=n_groups)
group_sector = np.random.choice(sectors, size=n_groups)

# Region is derived from the country rather than drawn independently: two
# independent draws produced impossible pairs such as country "US" in region
# "APAC". Dropping the region draw shifts the random stream, so regenerated
# values differ from files written by the earlier version of this section.
# NOTE(review): MX -> LATAM and KZ -> EMEA follow one common convention;
# adjust the mapping if a different regional taxonomy is required.
country_to_region = {
    "US": "North America",
    "DE": "EMEA",
    "FR": "EMEA",
    "UK": "EMEA",
    "IT": "EMEA",
    "ES": "EMEA",
    "KZ": "EMEA",
    "CN": "APAC",
    "JP": "APAC",
    "BR": "LATAM",
    "MX": "LATAM",
}

# Fail loudly if the lookup tables and the mapping drift apart.
unmapped_countries = sorted(set(countries) - set(country_to_region))
if unmapped_countries:
    raise ValueError(f"No region mapping for countries: {unmapped_countries}")
unknown_regions = sorted(set(country_to_region.values()) - set(regions))
if unknown_regions:
    raise ValueError(f"Mapped regions missing from `regions`: {unknown_regions}")

group_region = np.array([country_to_region[c] for c in group_country])

groups = pd.DataFrame({
    "group_id": group_ids,
    "group_name": group_names,
    "region": group_region,
    "country": group_country,
    "sector": group_sector,
})

# Preview; Jupyter only renders it when this is the last expression of a cell.
groups.head()

# -------------------------------------------------------------------
# 3. Create obligors mapped to groups
# -------------------------------------------------------------------
# Sequential obligor ids, each attached to one randomly chosen corporate group.
obligor_ids = np.arange(1, n_obligors + 1)
obligor_group = np.random.choice(group_ids, size=n_obligors)
obligor_names = [f"Obligor_{i:04d}" for i in obligor_ids]

# Bare obligor table first, then a left join that copies the group
# attributes onto every obligor row.
obligor_base = pd.DataFrame(
    {
        "obligor_id": obligor_ids,
        "obligor_name": obligor_names,
        "group_id": obligor_group,
    }
)
obligors = pd.merge(obligor_base, groups, how="left", on="group_id")

# Preview; Jupyter only renders it when this is the last expression of a cell.
obligors.head()

# -------------------------------------------------------------------
# 4. Create facilities (limits, utilization, ratings, dates, etc.)
# -------------------------------------------------------------------
# Facility ids plus a randomly assigned owner, product type and currency.
# Currency weights follow the order of `currencies`.
facility_ids = np.arange(1, n_facilities + 1)
facility_obligor = np.random.choice(obligor_ids, size=n_facilities)
facility_product = np.random.choice(product_types, size=n_facilities)
facility_currency = np.random.choice(currencies, size=n_facilities, p=[0.6, 0.3, 0.1])

# Limits: log-normal with median e^13 ≈ 0.44M; roughly 95% of draws fall
# between ≈0.11M and ≈1.75M.
limit_raw = np.random.lognormal(mean=13, sigma=0.7, size=n_facilities)
# Round to the nearest 10k. decimals=-4 yields exact multiples of 10,000,
# whereas rounding limit_raw / 1e6 to two decimals and scaling back by 1e6
# is not guaranteed to (binary floating point) and could leak residue
# into the CSV.
limit_amount = np.round(limit_raw, -4)

# Utilization: drawn share of the limit, N(0.6, 0.2) clipped to [0, 1].
utilization = np.clip(np.random.normal(loc=0.6, scale=0.2, size=n_facilities), 0, 1)
outstanding_amount = np.round(limit_amount * utilization, 0)
undrawn_amount = np.round(limit_amount - outstanding_amount, 0)

# Internal rating: 1=best, 10=worst; the weights favour the better grades.
internal_rating = np.random.choice(
    np.arange(1, 11),
    size=n_facilities,
    p=[0.10, 0.15, 0.20, 0.18, 0.12, 0.10, 0.06, 0.04, 0.03, 0.02]
)

def map_rating_band(r):
    """Translate an internal rating into its rating-band label.

    Ratings up to and including 4 map to "Investment_Grade", ratings up to
    and including 7 map to "Sub_IG", and everything else maps to "Watchlist".
    """
    # Walk the upper bounds in ascending order; the first bound that holds wins.
    for upper_bound, band in ((4, "Investment_Grade"), (7, "Sub_IG")):
        if r <= upper_bound:
            return band
    return "Watchlist"

# Band label for every facility rating.
rating_band = list(map(map_rating_band, internal_rating))

# Dates: origination sampled from a daily calendar, maturity 365 to
# 365*7 - 1 days later (randint's upper bound is exclusive).
dates = pd.date_range(start="2015-01-01", end="2025-12-31", freq="D")
origination_dates = np.random.choice(dates, size=n_facilities)
maturity_offsets = np.random.randint(low=365, high=365 * 7, size=n_facilities)
maturity_dates = origination_dates + pd.to_timedelta(maturity_offsets, unit="D")

# Interest rates: base rate plus 0.4% per rating notch plus small noise,
# held inside [2%, 10%].
base_rate = 0.03
spread = (internal_rating - 1) * 0.004
interest_rate = np.clip(
    base_rate + spread + np.random.normal(0, 0.002, size=n_facilities),
    0.02,
    0.10,
)

# Collateral: coverage ratio N(0.4, 0.25) clipped to [0, 1.2], applied to the limit.
collateral_ratio = np.random.normal(loc=0.4, scale=0.25, size=n_facilities).clip(0, 1.2)
collateral_value = (limit_amount * collateral_ratio).round(0)

# Default flag: probability grows linearly with the rating notch, is clipped
# to [0.001, 0.25], and is compared against one uniform draw per facility.
base_pd = 0.002
pd_per_grade = 0.003
default_prob = np.clip(base_pd + (internal_rating - 1) * pd_per_grade, 0.001, 0.25)
is_defaulted = (np.random.rand(n_facilities) < default_prob).astype(int)

# Column name -> values, in the order the facility table should carry them.
facility_columns = {
    "facility_id": facility_ids,
    "obligor_id": facility_obligor,
    "product_type": facility_product,
    "currency": facility_currency,
    "limit_amount": limit_amount,
    "outstanding_amount": outstanding_amount,
    "undrawn_amount": undrawn_amount,
    "internal_rating": internal_rating,
    "rating_band": rating_band,
    "origination_date": origination_dates,
    "maturity_date": maturity_dates,
    # Rates are stored with four decimals.
    "interest_rate": interest_rate.round(4),
    "collateral_value": collateral_value,
    "is_defaulted": is_defaulted,
}
facilities = pd.DataFrame(facility_columns)

# Preview; Jupyter only renders it when this is the last expression of a cell.
facilities.head()

# -------------------------------------------------------------------
# 5. Merge facilities with obligor & group info → final exposure table
# -------------------------------------------------------------------
# Final column layout: identifiers, group attributes, facility terms,
# amounts, dates, pricing and the default flag.
exposure_columns = [
    "facility_id",
    "obligor_id",
    "obligor_name",
    "group_id",
    "group_name",
    "country",
    "region",
    "sector",
    "product_type",
    "internal_rating",
    "rating_band",
    "currency",
    "limit_amount",
    "outstanding_amount",
    "undrawn_amount",
    "collateral_value",
    "origination_date",
    "maturity_date",
    "interest_rate",
    "is_defaulted",
]

# Left join keeps every facility and attaches its obligor / group attributes;
# the selection then puts the columns into the layout above.
df = pd.merge(facilities, obligors, how="left", on="obligor_id")[exposure_columns]

# Preview; Jupyter only renders it when this is the last expression of a cell.
df.head(), df.shape

# -------------------------------------------------------------------
# 6. Save corp_exposure_snapshot.csv
# -------------------------------------------------------------------
# Write the facility-level snapshot; index=False keeps the row index out of the file.
exposure_path = data_dir.joinpath("corp_exposure_snapshot.csv")
df.to_csv(exposure_path, index=False)
print(f"Saved exposure snapshot to: {exposure_path}")

# -------------------------------------------------------------------
# 7. Optional: simple corp_limits.csv (single-name, sector, country limits)
# -------------------------------------------------------------------
def _build_limits(exposure, dimension, limit_type, multiplier,
                  warning_threshold_pct, responsible_unit, last_review_date):
    """Derive one limit row per distinct value of `dimension`.

    Parameters
    ----------
    exposure : pd.DataFrame
        Frame containing `dimension` and an "outstanding_amount" column.
    dimension : str
        Column to aggregate by (e.g. "group_name", "sector", "country").
    limit_type : str
        Label stored in the "limit_type" column.
    multiplier : float
        Factor applied to the summed outstanding amount to obtain the limit.
    warning_threshold_pct : float
        Value stored in "warning_threshold_pct" for every row.
    responsible_unit : str
        Value stored in "responsible_unit" for every row.
    last_review_date : pd.Timestamp
        Value stored in "last_review_date" for every row.

    Returns
    -------
    pd.DataFrame
        Columns: limit_type, dimension_value, limit_amount,
        warning_threshold_pct, responsible_unit, last_review_date.
    """
    # NOTE(review): amounts are summed as-is across rows; if the frame mixes
    # currencies the totals mix them too — confirm whether FX conversion is wanted.
    totals = exposure.groupby(dimension)["outstanding_amount"].sum().reset_index()
    limits = totals.assign(
        # Limit = multiplier x outstanding, rounded to the nearest 10,000.
        limit_amount=np.round(totals["outstanding_amount"] * multiplier, -4),
        limit_type=limit_type,
        dimension_value=totals[dimension],
        warning_threshold_pct=warning_threshold_pct,
        responsible_unit=responsible_unit,
        last_review_date=last_review_date,
    )
    return limits[[
        "limit_type",
        "dimension_value",
        "limit_amount",
        "warning_threshold_pct",
        "responsible_unit",
        "last_review_date",
    ]]


# All three limit families share the same review date.
limits_review_date = pd.Timestamp("2025-01-01")

# Single-name (group) limits: ~1.5x current outstanding
single_name_limits = _build_limits(
    df, "group_name", "SingleName", 1.5, 0.8, "Large Corporates", limits_review_date
)

# Sector limits: ~2x sector outstanding
sector_limits = _build_limits(
    df, "sector", "Sector", 2.0, 0.9, "Sector Risk", limits_review_date
)

# Country limits: ~2.5x country outstanding
country_limits = _build_limits(
    df, "country", "Country", 2.5, 0.9, "Country Risk", limits_review_date
)

# Stack the three families into one table with a fresh 0..n-1 index.
corp_limits = pd.concat([single_name_limits, sector_limits, country_limits], ignore_index=True)

limits_path = data_dir / "corp_limits.csv"
corp_limits.to_csv(limits_path, index=False)
print(f"Saved limits table to: {limits_path}")


Saved exposure snapshot to: ..\data\corp_exposure_snapshot.csv
Saved limits table to: ..\data\corp_limits.csv
