<a href="https://colab.research.google.com/github/Krixna-Kant/BharatScore/blob/main/BharatScore_DataGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from scipy.optimize import brentq

np.random.seed(42)
N = 5000
TARGET_DEFAULT_RATE = 0.20

In [3]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def calibrate_intercept(target_rate, linear_score):
    """
    Calibrates intercept so that average PD matches target_rate.
    """
    def objective(intercept):
        pd = sigmoid(linear_score + intercept)
        return pd.mean() - target_rate

    intercept = brentq(objective, -10, 10)
    return intercept

# Loan categories
loan_categories = ["education", "farmer", "startup", "personal"]

# User characteristics
user_type = np.random.choice(["smartphone", "feature_phone"], size=N, p=[0.7, 0.3])
region = np.random.choice(["urban", "rural"], size=N, p=[0.6, 0.4])
age_group = np.random.choice(["18-30", "31-50", "51-70"], size=N, p=[0.4, 0.4, 0.2])

In [4]:
sms_count = np.where(
    user_type == "smartphone",
    np.random.poisson(30, size=N),
    np.random.poisson(8, size=N)
)

bill_on_time_ratio = np.clip(np.random.beta(5, 2, N), 0, 1)

recharge_pattern = np.random.choice(
    ["always_on_time", "sometimes_late", "often_late"],
    size=N,
    p=[0.6, 0.3, 0.1]
)
recharge_freq = np.where(
    recharge_pattern == "always_on_time", 1.0,
    np.where(recharge_pattern == "sometimes_late", 0.5, 0.2)
)

sim_tenure = np.random.randint(1, 121, N)

location_stability = np.where(
    region == "urban",
    np.clip(np.random.normal(0.8, 0.1, N), 0, 1),
    np.clip(np.random.normal(0.6, 0.15, N), 0, 1)
)

In [5]:
# SMS count per month (proxy for activity)
sms_count = np.where(
    user_type == "smartphone",
    np.random.poisson(30, size=N),   # higher for smartphone
    np.random.poisson(8, size=N)     # lower for feature phone
)

# Bill on time ratio (last 6 months)
bill_on_time_ratio = np.clip(np.random.beta(5, 2, N), 0, 1)

# Recharge frequency proxy (per month or pattern)
# Instead of raw count → categorical mapped to score
recharge_pattern = np.random.choice(
    ["always_on_time", "sometimes_late", "often_late"],
    size=N,
    p=[0.6, 0.3, 0.1]
)
recharge_freq = np.where(
    recharge_pattern == "always_on_time", 1.0,
    np.where(recharge_pattern == "sometimes_late", 0.5, 0.2)
)

# SIM tenure (months since activation)
sim_tenure = np.random.randint(1, 121, N)   # 1 to 10 years

# Location stability (past 12 months)
location_stability = np.where(
    region == "urban",
    np.clip(np.random.normal(0.8, 0.1, N), 0, 1),
    np.clip(np.random.normal(0.6, 0.15, N), 0, 1)
)

In [6]:
income_signal = np.clip(bill_on_time_ratio + np.random.normal(0, 0.1, N), 0, 1)

coop_score = np.clip(np.random.normal(65, 15, N), 0, 100)

land_verified = np.where(region == "rural", np.random.binomial(1, 0.35, N), np.random.binomial(1, 0.1, N))

In [7]:
psychometric_score = np.clip(np.random.normal(0.6, 0.15, N), 0, 1)

In [8]:
loan_amount_requested = np.random.randint(10000, 500000, N)
loan_category = np.random.choice(loan_categories, size=N)

In [9]:
weights = {
    "sms": -0.6,
    "bill": -2.0,
    "recharge": -1.2,
    "sim": -0.3,
    "loc": -1.5,
    "income": -2.5,
    "coop": -0.01,
    "land": -0.8,
    "psych": -2.8
}

linear_score = (
    weights["sms"] * (sms_count / (sms_count.max() + 1))
    + weights["bill"] * bill_on_time_ratio
    + weights["recharge"] * recharge_freq
    + weights["sim"] * (sim_tenure / 120)
    + weights["loc"] * location_stability
    + weights["income"] * income_signal
    + weights["coop"] * (coop_score / 100)
    + weights["land"] * land_verified
    + weights["psych"] * psychometric_score
)

In [10]:
# Recharge penalty
linear_score += np.where(recharge_pattern == "often_late", -1.5, 0)

# Rural + low coop score interaction
linear_score += -1.0 * ((region == "rural") & (coop_score < 50))

# Income threshold effect
linear_score += np.where(income_signal < 0.3, -2, 0)

In [11]:
# Finding intercept so PD ~ target default rate
calibrated_intercept = calibrate_intercept(TARGET_DEFAULT_RATE, linear_score)
linear_score = linear_score + calibrated_intercept

pd_values = np.clip(sigmoid(linear_score), 0.0001, 0.9999)
default_label = (np.random.rand(N) < pd_values).astype(int)

In [12]:
ids = [f"USR_{i:05d}" for i in range(1, N+1)]

df = pd.DataFrame({
    "id": ids,
    "user_type": user_type,
    "region": region,
    "age_group": age_group,
    "sms_count": sms_count,
    "bill_on_time_ratio": bill_on_time_ratio,
    "recharge_pattern": recharge_pattern,
    "recharge_freq": recharge_freq,
    "sim_tenure": sim_tenure,
    "location_stability": location_stability,
    "income_signal": income_signal,
    "coop_score": coop_score,
    "land_verified": land_verified,
    "psychometric_score": psychometric_score,
    "loan_amount_requested": loan_amount_requested,
    "loan_category": loan_category,
    "pd": pd_values,
    "default": default_label
})

df.to_csv("bharatscore_final_dataset.csv", index=False)

print("Synthetic dataset generated and saved as bharatscore_final_dataset.csv")
print("Shape:", df.shape)
print(f"\nDefault Rate: {df['default'].mean():.2%}")
print("\nSample Rows:")
print(df.head())

Synthetic dataset generated and saved as bharatscore_final_dataset.csv
Shape: (5000, 18)

Default Rate: 19.96%

Sample Rows:
          id      user_type region age_group  sms_count  bill_on_time_ratio  \
0  USR_00001     smartphone  urban     18-30         31            0.443740   
1  USR_00002  feature_phone  urban     18-30          7            0.563497   
2  USR_00003  feature_phone  rural     18-30          3            0.839333   
3  USR_00004     smartphone  urban     31-50         26            0.923190   
4  USR_00005     smartphone  rural     31-50         22            0.304195   

  recharge_pattern  recharge_freq  sim_tenure  location_stability  \
0   sometimes_late            0.5         109            0.865462   
1       often_late            0.2          54            0.840071   
2   always_on_time            1.0          92            0.815628   
3       often_late            0.2          68            0.854602   
4   always_on_time            1.0         103          

In [13]:
df['default'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
default,Unnamed: 1_level_1
0,0.8004
1,0.1996


In [14]:
# Checking default rate
default_rate = df['default'].mean()

print(f"Default Rate: {default_rate:.2%}")

# If imbalanced, regenerate with slight noise adjustment
if default_rate < 0.1 or default_rate > 0.3:
    print("Default rate outside desired range (10–30%). Adjusting noise...")

    # Adjust intercept to shift probabilities
    adjust = -np.log((0.2 / (1-0.2)))  # target ~20%
    linear_score = linear_score + adjust
    pd_values = np.clip(sigmoid(linear_score), 0.0001, 0.9999)
    default_label = (np.random.rand(N) < pd_values).astype(int)

    df['pd'] = pd_values
    df['default'] = default_label

    print(f"Adjusted Default Rate: {df['default'].mean():.2%}")


Default Rate: 19.96%
