# Population Stability Index (PSI)
- skip_exec: true

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from fastai.tabular.all import add_datepart, cont_cat_split, TabularPandas, Categorify, FillMissing

In [None]:
# Load the Blue Book for Bulldozers data, parsing the sale date as a datetime.
df = pd.read_csv(
    '../data/bluebook-for-bulldozers/TrainAndValid.csv',
    low_memory=False,
    parse_dates=["saledate"],
)

In [None]:
# Replace the target with its natural log; the model below is fit on log(SalePrice).
df["SalePrice"] = np.log(df["SalePrice"])

In [None]:
# Derive date-part features from `saledate` (e.g. the `saleYear` / `saleMonth`
# columns used for the split below). drop=False keeps the raw `saledate`
# column, which is dropped explicitly before modelling.
df = add_datepart(df, 'saledate', drop=False)

In [None]:
# Columns whose string levels have a natural order, listed from lowest to highest.
ordered_levels = {
    'ProductSize': ['Compact', 'Mini', 'Small', 'Medium', 'Large / Medium', 'Large'],
    'UsageBand': ['Low', 'Medium', 'High'],
}

# Convert each column to an ordered categorical with the levels above.
for col_name, levels in ordered_levels.items():
    as_category = df[col_name].astype('category')
    df[col_name] = as_category.cat.set_categories(levels, ordered=True)

In [None]:
# Split the feature columns (everything except the raw date and the target)
# into continuous and categorical name lists.
# NOTE(review): max_card=20 presumably is the cardinality threshold below which
# a numeric column is treated as categorical -- confirm against fastai docs.
conts, cats = cont_cat_split(df.drop(columns=["saledate"]), max_card=20, dep_var='SalePrice')

In [None]:
# Preprocessing steps handed to TabularPandas below.
procs = [Categorify, FillMissing]

In [None]:
# Time-based split: rows where `cond` is True go to training, the rest to validation.
# NOTE(review): `saleMonth<10` is True for ANY year, so rows from Jan-Sep of
# years after 2011 (if the data contains any) also land in the training set,
# i.e. later than part of the validation window -- confirm this is intended.
cond = (df.saleYear<2011) | (df.saleMonth<10)
# Positional row indices for each side of the split.
train_idx = np.where( cond)[0]
valid_idx = np.where(~cond)[0]
# (train indices, valid indices) pair passed to TabularPandas as `splits`.
splits = (list(train_idx),list(valid_idx))

In [None]:
# Build the processed tabular dataset (raw date column removed) with the
# log sale price as target and the train/validation split defined above.
to = TabularPandas(df.drop(columns=["saledate"]), procs, cats, conts, y_names="SalePrice", splits=splits)

In [None]:
# Number of continuous and categorical feature columns.
len(conts), len(cats)

(11, 55)

In [None]:
# Fit a random forest on the training split using all available cores.
# NOTE(review): no random_state is set, so the scores shown below will vary
# slightly from run to run.
m = RandomForestRegressor(n_jobs=-1)
m.fit(to.train.xs, to.train.y)

In [None]:
# R^2 on the training split and on the validation split.
m.score(to.train.xs, to.train.y), m.score(to.valid.xs, to.valid.y)

(0.9880962215993305, 0.8938477733941156)

In [None]:
# Model predictions on the training split; these serve as the "initial"
# score population for the PSI computed below.
m.predict(to.train.xs)

array([11.08561131, 10.93323544,  9.28840471, ...,  9.4279605 ,
        9.27903488,  9.51336108])

In [None]:
def _psi_bin_fractions(values, edges, eps):
    """Return the share of `values` in each right-closed bin ``(edges[i], edges[i+1]]``.

    Bins that receive no values are reported as `eps` instead of 0 so that the
    logarithm in the PSI formula stays finite.
    """
    n_bins = len(edges) - 1
    # side="left" makes each bin right-closed, the same convention as pd.cut's default.
    bin_idx = np.searchsorted(edges, values, side="left") - 1
    # Guard against values that fall outside the edges (only possible when
    # `eps` is too small to widen the outer boundaries).
    in_range = (bin_idx >= 0) & (bin_idx < n_bins)
    counts = np.bincount(bin_idx[in_range], minlength=n_bins)
    fractions = counts / counts.sum()
    return np.where(fractions == 0, eps, fractions)


def psi(score_initial, score_new, num_bins=10, mode="fixed", eps=1e-4):
    """Compute the per-bin Population Stability Index (PSI) contributions.

    Both score populations are bucketed with a common set of bin edges and,
    for every bin, ``(p_initial - p_new) * ln(p_initial / p_new)`` is returned.
    Summing the returned array gives the PSI between the two populations.

    Unlike a groupby-based implementation, every bin is always present in the
    result (empty bins are not dropped), and the inputs are never modified.

    Parameters
    ----------
    score_initial : array-like of float
        Scores of the reference population. NaN values are ignored.
    score_new : array-like of float
        Scores of the population being compared. NaN values are ignored.
    num_bins : int, default 10
        Number of bins requested. Fewer bins are produced when bin edges
        coincide (duplicate quantiles, or constant scores).
    mode : {"fixed", "quantile"}, default "fixed"
        "fixed" uses equal-width bins over the combined range of both
        populations; "quantile" uses the quantiles of `score_initial`.
    eps : float, default 1e-4
        Used twice: to widen the outermost bin edges so the extreme values
        fall inside a bin, and as the stand-in share for empty bins.

    Returns
    -------
    numpy.ndarray
        One PSI contribution per bin, ordered from the lowest bin to the highest.

    Raises
    ------
    ValueError
        If `mode` is not recognised, `num_bins` is smaller than 1, or either
        population has no non-NaN values.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # Work on float arrays without touching the caller's data: no in-place
    # sorting (sorting is not needed for binning at all).
    initial = np.asarray(score_initial, dtype=float).ravel()
    new = np.asarray(score_new, dtype=float).ravel()
    initial = initial[~np.isnan(initial)]
    new = new[~np.isnan(new)]
    if initial.size == 0 or new.size == 0:
        raise ValueError("score_initial and score_new must each contain at least one non-NaN value")

    min_val = min(initial.min(), new.min())
    max_val = max(initial.max(), new.max())

    if mode == "fixed":
        edges = min_val + (max_val - min_val) * np.arange(num_bins + 1) / num_bins
    elif mode == "quantile":
        if initial.min() == initial.max():
            # A constant reference population has no distinct quantiles to cut on.
            edges = np.array([min_val, max_val], dtype=float)
        else:
            edges = np.array(
                pd.qcut(initial, q=num_bins, retbins=True, duplicates="drop")[1],
                dtype=float,
            )
    else:
        raise ValueError(f"Mode '{mode}' not recognized. Your options are 'fixed' and 'quantile'")

    edges[0] = min_val - eps  # Correct the lower boundary
    edges[-1] = max_val + eps  # Correct the higher boundary
    # Collapse coinciding edges (e.g. constant scores) so bins stay well-defined.
    edges = np.unique(edges)

    percent_initial = _psi_bin_fractions(initial, edges, eps)
    percent_new = _psi_bin_fractions(new, edges, eps)
    return (percent_initial - percent_new) * np.log(percent_initial / percent_new)

In [None]:
# Per-bin PSI terms between training and validation predictions, equal-width bins.
psi_values_fixed = psi(m.predict(to.train.xs), m.predict(to.valid.xs), mode="fixed")
# NOTE(review): this averages the per-bin terms; PSI is conventionally reported
# as their sum (np.sum), which is this value times the number of bins -- confirm
# which one is intended before comparing against the usual 0.1 / 0.25 thresholds.
np.mean(psi_values_fixed)

0.005366647875130087

In [None]:
# Per-bin PSI terms between training and validation predictions, quantile bins
# derived from the training predictions.
psi_values_quantile= psi(m.predict(to.train.xs), m.predict(to.valid.xs), mode="quantile")
# NOTE(review): this averages the per-bin terms; PSI is conventionally reported
# as their sum (np.sum) -- confirm which one is intended.
np.mean(psi_values_quantile)

0.00567408744219339