In [1]:
import numpy as np
import pandas as pd

# Define PSI calculation function
def calculate_psi(expected, actual, buckettype='bins', n_bins=10):
    """
    Calculate the Population Stability Index (PSI) between two samples.

    Bin edges are derived from ``expected`` only. The first and last bins are
    open-ended, so values of ``actual`` that fall outside the range of
    ``expected`` are counted in the outermost bins instead of being dropped.
    NaN values are removed from both samples before binning, and inputs are
    flattened to one dimension.

    Parameters:
        expected (array-like): Reference distribution values
        actual (array-like): New distribution values to compare
        buckettype (str): 'bins' for equal-width bins; 'quantiles' for equal-frequency bins
        n_bins (int): Number of bins to use (must be at least 1)

    Returns:
        float: PSI value

    Raises:
        ValueError: If buckettype is not 'bins' or 'quantiles', if n_bins is
            less than 1, or if either sample has no non-NaN values.
    """
    if buckettype not in ('bins', 'quantiles'):
        raise ValueError("buckettype must be 'bins' or 'quantiles'")
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")

    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()

    # NaNs would turn every bin edge into NaN (via min/max/quantile) and would
    # inflate the denominators below, so drop them up front.
    expected = expected[~np.isnan(expected)]
    actual = actual[~np.isnan(actual)]
    if expected.size == 0 or actual.size == 0:
        raise ValueError(
            "expected and actual must each contain at least one non-NaN value"
        )

    if buckettype == 'bins':
        breakpoints = np.linspace(np.min(expected), np.max(expected), n_bins + 1)
    else:
        breakpoints = np.quantile(expected, np.linspace(0, 1, n_bins + 1))

    # np.histogram ignores values outside [first edge, last edge]. Replacing
    # the outer edges with -inf/+inf keeps the binning of `expected` unchanged
    # while making sure out-of-range `actual` values land in the end bins, so
    # both sets of proportions sum to 1.
    breakpoints = np.concatenate(([-np.inf], breakpoints[1:-1], [np.inf]))

    # Proportion of each sample falling in each bin
    exp_counts = np.histogram(expected, bins=breakpoints)[0] / expected.size
    act_counts = np.histogram(actual, bins=breakpoints)[0] / actual.size

    # Avoid division by zero or log of zero: substitute a tiny proportion for
    # empty bins
    eps = 1e-6
    exp_counts = np.where(exp_counts == 0, eps, exp_counts)
    act_counts = np.where(act_counts == 0, eps, act_counts)

    # Calculate PSI components and sum
    psi_vals = (exp_counts - act_counts) * np.log(exp_counts / act_counts)
    return float(np.sum(psi_vals))

In [2]:
# Read the weather CSV, parse the timestamp column, and derive the day number
# that later cells use to split the readings.
# NOTE(review): .dt.day is the day of the month, so rows from different months
# share the same value — confirm the file covers a single month.
df = pd.read_csv('weather_data.csv')
df = df.assign(Date_Time=pd.to_datetime(df['Date_Time']))
df = df.assign(Day=df['Date_Time'].dt.day)

In [3]:
# Pull the temperature readings where Day == 1 and where Day == 2
temp_day1 = df.loc[df['Day'] == 1, 'Temperature_C']
temp_day2 = df.loc[df['Day'] == 2, 'Temperature_C']

In [4]:
# PSI of Day 2 against Day 1 (the reference), once per binning strategy
psi_bins, psi_quant = (
    calculate_psi(temp_day1, temp_day2, buckettype=strategy, n_bins=10)
    for strategy in ('bins', 'quantiles')
)

In [5]:
# Report both PSI values, one per line, rounded to four decimals
print(
    f"PSI (equal-width bins): {psi_bins:.4f}",
    f"PSI (equal-frequency bins): {psi_quant:.4f}",
    sep="\n",
)

PSI (equal-width bins): 0.0006
PSI (equal-frequency bins): 0.0006


In [6]:
# Clamp Day 2 readings to at most 30°C, then redo both PSI comparisons
temp_day2_capped = temp_day2.clip(upper=30)
psi_bins_capped, psi_quant_capped = (
    calculate_psi(temp_day1, temp_day2_capped, buckettype=strategy, n_bins=10)
    for strategy in ('bins', 'quantiles')
)

In [7]:
# Report the PSI values obtained after capping, one per line
print(
    "\nAfter capping Day 2 temperatures > 30°C:",
    f"PSI (equal-width bins): {psi_bins_capped:.4f}",
    f"PSI (equal-frequency bins): {psi_quant_capped:.4f}",
    sep="\n",
)


After capping Day 2 temperatures > 30°C:
PSI (equal-width bins): 1.4184
PSI (equal-frequency bins): 1.2157
