In [None]:
# Cell 1: Setup and Imports

# --- Core Libraries ---
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# --- Database & System Libraries ---
import os
from dotenv import load_dotenv
import logging
from datetime import datetime, timedelta

# --- Import Custom DB Connector ---
from common.database.MQSDBConnector import MQSDBConnector

# --- Notebook Configuration ---
# Pandas display: show every column and print floats with four decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Seaborn: white-grid plot style.
sns.set(style="whitegrid")

# Logging: INFO and above, timestamped, for easier debugging and tracing.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Load environment variables via python-dotenv.
load_dotenv()

print("Libraries imported and MQSDBConnector ready.")

In [None]:
# Cell 2: Data Loading and Preparation

# --- 1. Instantiate DB Connector ---
# Single connector instance for this notebook; the (commented-out) fetch step
# below passes it to get_market_data() as the `db` argument.
# NOTE(review): constructed with no arguments — presumably it picks up its
# connection settings from the environment loaded by load_dotenv(); confirm.
db_connector = MQSDBConnector()

# --- 2. Define Data Fetching Function ---

# Define the SQL query template
# This query must be customized to select your 14 predictive variables (X)
# and your 1 outcome variable (Y) from your database.
#
# The placeholder columns are written with LEADING commas so the template is
# still valid SQL while they are commented out (a trailing comma after
# `timestamp` would leave a dangling `SELECT timestamp, FROM ...`).
# Replace each placeholder with a real column name when you enable it.
# The two %s placeholders are bound to (start_date, end_date) by
# get_market_data().
MARKET_DATA_QUERY = """
    SELECT
        timestamp
        -- , outcome_variable (e.g., future_1q_volatility)
        -- , predictive_variable_1 (e.g., trailing_1m_volatility)
        -- , predictive_variable_2 (e.g., implied_volatility)
        -- ...
        -- , predictive_variable_14 (e.g., debt_to_gdp)
    FROM
        your_aggregated_data_table
    WHERE
        timestamp::date BETWEEN %s AND %s
        -- Add any other conditions (e.g., specific asset class)
    ORDER BY
        timestamp;
"""

def get_market_data(db, lookback_days: int) -> pd.DataFrame:
    """Fetch market data for the RBP model and return it as a clean wide frame.

    The query window ends today (local clock) and starts `lookback_days`
    calendar days earlier; both dates are bound to the two `%s` placeholders
    of MARKET_DATA_QUERY.

    Args:
        db: Object exposing `execute_query(sql, params, fetch=True)`. The call
            is expected to return a mapping with a 'status' entry
            ('success' on success) and a 'data' entry holding the rows.
        lookback_days: Length of the query window in calendar days.

    Returns:
        A new DataFrame indexed by `timestamp` in ascending order, with every
        column converted to numeric and every row that contains a missing or
        non-numeric value removed. An empty DataFrame is returned when the
        query fails, returns nothing, or the connector response is malformed.

    Raises:
        KeyError: If the returned rows have no 'timestamp' column.
    """
    end_time = datetime.now()
    start_time = end_time - timedelta(days=lookback_days)

    # The query is assumed to return data already in wide format
    # (one row per timestamp), as required by the RBP model.
    params = [start_time.date(), end_time.date()]

    logging.info(f"Fetching data from {start_time.date()} to {end_time.date()}...")
    result = db.execute_query(MARKET_DATA_QUERY, params, fetch=True)

    # Use .get() and a None check so that a malformed connector response is
    # reported like any other failed fetch instead of raising KeyError/TypeError.
    if result is None or result.get('status') != 'success' or not result.get('data'):
        logging.error("Failed to fetch data or no data returned.")
        return pd.DataFrame()

    raw = pd.DataFrame(result['data'])
    logging.info(f"Successfully fetched {len(raw)} rows.")

    # --- Data Processing ---
    # Each step builds a new frame (no in-place mutation), so re-running the
    # cell never operates on half-processed data.
    indexed = raw.assign(timestamp=pd.to_datetime(raw['timestamp'])).set_index('timestamp')

    # Non-numeric values become NaN here and are removed together with
    # genuinely missing values in the next step.
    numeric = indexed.apply(pd.to_numeric, errors='coerce')
    clean = numeric.dropna().sort_index()

    dropped = len(numeric) - len(clean)
    if dropped:
        # Make the row loss visible: one fully non-numeric column would
        # otherwise wipe out the whole data set without any trace.
        logging.warning(
            f"Dropped {dropped} of {len(numeric)} rows containing missing or non-numeric values."
        )
    return clean

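# --- 3. Fetch Data ---
# NOTE: the lookback must reach back to `training_start` (1986-03-31) used in the
# split below; a 20-year lookback from today would leave the training window empty.
# model_lookback_days = (datetime.now() - datetime(1986, 3, 31)).days + 1
# all_data = get_market_data(db_connector, model_lookback_days)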

# --- 4. Define Variables and Split Data ---
# 'all_data' should now be your wide-format DataFrame
# with all variables ready for the model.

# outcome_variable = 'your_Y_variable_column_name' 
# predictive_variables = [
#     'var_1_col_name', 'var_2_col_name', 'var_3_col_name',
#     'var_4_col_name', 'var_5_col_name', 'var_6_col_name', 
#     'var_7_col_name', 'var_8_col_name', 'var_9_col_name',
#     'var_10_col_name', 'var_11_col_name', 'var_12_col_name',
#     'var_13_col_name', 'var_14_col_name'
# ]

# --- Split Data ---
# The paper uses a fixed training and testing split [cite: 536-539]
# training_start = '1986-03-31'
# training_end = '2004-12-31'
# testing_start = '2005-03-31'
# testing_end = '2023-12-31'

# X_train = all_data.loc[training_start:training_end, predictive_variables]
# y_train = all_data.loc[training_start:training_end, outcome_variable]
# X_test = all_data.loc[testing_start:testing_end, predictive_variables]
# y_test = all_data.loc[testing_start:testing_end, outcome_variable]

# print(f"Training data shape (X, y): {X_train.shape}, {y_train.shape}")
# print(f"Testing data shape (X, y): {X_test.shape}, {y_test.shape}")
# print(X_train.head())

In [None]:
# Cell 3: Pre-computation on Training Data

# --- Compute Historical Averages and Covariance ---
# These are computed ONCE on the training data.
# These values (x_mean, inv_cov_matrix) are fundamental for all
# Mahalanobis distance calculations.

def compute_training_statistics(X_train_df):
  """
  Computes the mean vector and the inverse covariance matrix of the
  training variables, the two inputs of every Mahalanobis distance.

  Args:
    X_train_df: DataFrame with one row per training observation and one
      column per predictive variable.

  Returns:
    Tuple (x_mean, inv_cov_matrix, columns):
      x_mean         -- 1-D array with the column means (x-bar) [cite: 149]
      inv_cov_matrix -- 2-D array, inverse of the covariance matrix
                        (Omega-inverse) [cite: 149]
      columns        -- list of column names, in the order used by both arrays

  Raises:
    ValueError: If fewer than two observations are supplied (the covariance
      matrix is undefined).
    numpy.linalg.LinAlgError: If the covariance matrix is singular, e.g.
      because two variables are perfectly collinear.
  """
  if len(X_train_df) < 2:
    raise ValueError("At least two training observations are required to estimate a covariance matrix.")

  # 1. Mean vector (x-bar) of the training variables
  x_mean = X_train_df.mean().values

  # 2. Covariance matrix (Omega) of the training variables
  cov_matrix = X_train_df.cov().values

  # 3. Inverse covariance matrix (Omega-inverse): the key component of the
  #    Mahalanobis distance
  inv_cov_matrix = np.linalg.inv(cov_matrix)

  return x_mean, inv_cov_matrix, X_train_df.columns.tolist()

# --- Store Stats ---
# We compute this for the *full* set of variables.
# Inside the grid, we will re-compute this for *subsets* of variables.
# x_mean_full, inv_cov_full, vars_full = compute_training_statistics(X_train)
# full_train_stats = {
#     'x_mean': x_mean_full,
#     'inv_cov': inv_cov_full,
#     'vars': vars_full
# }

# print("Training statistics (mean, inv_cov) computed.")

In [None]:
# Cell 4: Core Helper Functions (Relevance)

# These functions implement the core math from the paper.

def calculate_mahalanobis_distance(vec1, vec2, inv_cov_matrix):
  """
  Calculates the squared Mahalanobis distance between two vectors.

  Args:
    vec1, vec2: 1-D array-likes of equal length (one value per variable).
    inv_cov_matrix: 2-D array-like, inverse covariance matrix of the variables.

  Returns:
    The scalar (vec1 - vec2) @ inv_cov_matrix @ (vec1 - vec2) as a float.
  """
  # Flatten to plain 1-D float arrays so lists, arrays and pandas Series are
  # all handled the same way (values are matched by position).
  diff = np.asarray(vec1, dtype=float).ravel() - np.asarray(vec2, dtype=float).ravel()
  return float(diff @ np.asarray(inv_cov_matrix, dtype=float) @ diff)

def calculate_relevance(x_i, x_t, x_mean, inv_cov_matrix):
  """
  Calculates the relevance of a past observation (x_i) to a
  current prediction task (x_t), based on Equation 1 [cite: 141].

  Args:
    x_i: Past observation (1-D array-like).
    x_t: Current prediction task (1-D array-like).
    x_mean: Mean vector of the training variables.
    inv_cov_matrix: Inverse covariance matrix of the training variables.

  Returns:
    The relevance r_it as a float.
  """
  # 1. Similarity (sim) component, Equation 2 [cite: 142]
  sim_component = calculate_mahalanobis_distance(x_i, x_t, inv_cov_matrix)

  # 2. Informativeness of past observation (info(x_i)), Equation 3 [cite: 143]
  info_i = calculate_mahalanobis_distance(x_i, x_mean, inv_cov_matrix)

  # 3. Informativeness of current task (info(x_t)), Equation 4 [cite: 144]
  info_t = calculate_mahalanobis_distance(x_t, x_mean, inv_cov_matrix)

  # 4. Total Relevance (r_it), Equation 1 [cite: 141]
  return (-0.5 * sim_component) + 0.5 * (info_i + info_t)

print("Relevance helper functions defined.")

In [None]:
# Cell 5: Core Helper Functions (Prediction & Fit)

def get_relevance_for_task(x_t, X_train, y_train, x_mean, inv_cov):
  """
  Calculates relevance scores for ALL past observations (x_i)
  against a SINGLE current task (x_t).

  The score of each row is
    r_it = -0.5 * d(x_i, x_t) + 0.5 * (d(x_i, x_mean) + d(x_t, x_mean))
  where d is the squared Mahalanobis distance under `inv_cov`. All rows are
  scored in one vectorised pass instead of a Python loop.

  Args:
    x_t: Current prediction task, 1-D array-like with one value per column
      of X_train (values are matched to the columns by position).
    X_train: DataFrame (or 2-D array) of past observations, one row each.
    y_train: Unused; kept so the signature matches the rest of the notebook.
    x_mean: Mean vector of the training variables.
    inv_cov: Inverse covariance matrix of the training variables.

  Returns:
    pandas Series of relevance scores, indexed by X_train.index when
    X_train has an index.
  """
  past = np.atleast_2d(np.asarray(X_train, dtype=float))
  task = np.asarray(x_t, dtype=float).ravel()
  center = np.asarray(x_mean, dtype=float).ravel()
  omega_inv = np.asarray(inv_cov, dtype=float)

  def _squared_distances(diffs):
    # Row-wise quadratic form diff @ omega_inv @ diff.
    return np.einsum('ij,jk,ik->i', diffs, omega_inv, diffs)

  similarity = _squared_distances(past - task)
  info_past = _squared_distances(past - center)
  info_task = _squared_distances((task - center)[np.newaxis, :])[0]

  scores = -0.5 * similarity + 0.5 * (info_past + info_task)
  return pd.Series(scores, index=getattr(X_train, 'index', None), name='relevance')

def calculate_prediction_weights(relevance_scores, r_threshold_quantile):
  """
  Calculates observation weights (w_it) based on relevance
  and a censoring threshold, per Equations 7-9 [cite: 169-173].

  NOT YET IMPLEMENTED: this is a template. Calling it raises
  NotImplementedError instead of silently returning None, so an unfinished
  pipeline fails at the real cause rather than further downstream.

  Args:
    relevance_scores: pandas Series of relevance scores, one per past
      observation.
    r_threshold_quantile: Quantile of the relevance scores used as the
      censoring threshold r*.

  Raises:
    NotImplementedError: Always, until the outline below is implemented.
  """
  # Implementation outline:
  # 1. Determine the relevance threshold value (r*) [cite: 169]
  #   r_star = relevance_scores.quantile(r_threshold_quantile)
  #
  # 2. Identify retained (delta=1) and censored (delta=0) observations [cite: 169]
  #   retained_mask = relevance_scores >= r_star
  #
  # 3. Calculate n, N, phi, r_sub_avg, etc. [cite: 175-176]
  #
  # 4. Calculate the scaling factor (lambda^2) [cite: 173]
  #
  # 5. Calculate final weights (w_it_retained) for all i [cite: 169]
  #    (Censored observations will have a different weight calculation)
  # 6. Return the final vector of weights (w_it)
  #
  # NOTE(review): the sketch in process_grid_for_one_task unpacks
  # `weights, mask = calculate_prediction_weights(...)`, so the
  # implementation should return the retained mask together with the weights.
  raise NotImplementedError(
      "calculate_prediction_weights is a template stub; implement Equations 7-9 before calling it."
  )

def calculate_fit(weights, outcomes):
  """
  Calculates the Fit for a prediction task, which is the
  squared correlation of relevance weights and outcomes, per Eq. 11 [cite: 198].

  Args:
    weights: 1-D array-like of observation weights.
    outcomes: 1-D array-like of outcomes, same length as `weights`.

  Returns:
    rho**2 as a float, where rho is the Pearson correlation of the two
    inputs. Returns 0.0 when the correlation is undefined (fewer than two
    observations, or constant weights or outcomes).
  """
  w = np.asarray(weights, dtype=float).ravel()
  y = np.asarray(outcomes, dtype=float).ravel()

  # A constant series has zero variance: np.corrcoef would emit a warning
  # and return NaN. Such a series carries no information, so report zero fit.
  if w.size < 2 or w.max() == w.min() or y.max() == y.min():
    return 0.0

  # 1. Pearson correlation of weights and outcomes
  rho = np.corrcoef(w, y)[0, 1]
  # 2. Fit is the squared correlation
  return float(rho ** 2)

def calculate_asymmetry(weights, outcomes, retained_mask):
  """
  Calculates asymmetry per Equation 13 [cite: 213].

  NOT YET IMPLEMENTED: this is a template. Calling it raises
  NotImplementedError instead of silently returning None.

  Args:
    weights: Observation weights for the prediction task.
    outcomes: Outcomes of the past observations.
    retained_mask: Boolean mask marking the retained observations.

  Raises:
    NotImplementedError: Always, until the outline below is implemented.
  """
  # Implementation outline:
  # 1. Create weights for retained subsample (w_t_plus) [cite: 210]
  # 2. Create weights for censored subsample (w_t_minus) [cite: 210]
  # 3. Calculate rho(w_t_plus, y) and rho(w_t_minus, y)
  # 4. Return 0.5 * (rho_plus - rho_minus)**2
  raise NotImplementedError(
      "calculate_asymmetry is a template stub; implement Equation 13 before calling it."
  )

def calculate_adjusted_fit(fit, asymmetry, K):
  """
  Calculates adjusted fit per Equation 14 [cite: 219].

  Args:
    fit: Fit of the prediction task.
    asymmetry: Asymmetry of the prediction task.
    K: Number of predictive variables in this calibration.

  Returns:
    K * (fit + asymmetry)
  """
  return K * (fit + asymmetry)

print("Prediction and Fit helper functions defined.")

In [None]:
# Cell 6: The Grid Prediction Loop (Single Task)

# --- This is the core of RBP ---
# We process ONE prediction task (x_t) at a time.
# We build a grid of predictions and adjusted fits.

def process_grid_for_one_task(x_t, X_train, y_train):
  """
  Builds the entire prediction grid (Exhibit 1) for a
  single prediction task (x_t) [cite: 221-222].

  NOT YET IMPLEMENTED: this is a template. Calling it raises
  NotImplementedError instead of silently returning None. The commented
  outline below sketches the intended algorithm, which ends by returning a
  DataFrame with the columns ['params', 'prediction', 'adj_fit'].

  Args:
    x_t: Predictive-variable values of the task being predicted.
    X_train: Training observations of the predictive variables.
    y_train: Training outcomes.

  Raises:
    NotImplementedError: Always, until the outline below is implemented.
  """

  # --- 1. Define the Grid ---
  # Columns: All combinations of variables [cite: 221]
  # (e.g., from 1 var up to K vars. This can be 2^K - 1)
  # variable_combinations = get_variable_combinations(X_train.columns)
  # NOTE(review): get_variable_combinations is not defined in the cells shown
  # here; it has to be written (or imported) before this outline can run.
  # To save time, the paper uses a "sparse sampling" method [cite: 544]
  #
  # Rows: Relevance thresholds (e.g., 0, 0.2, 0.5, 0.8) [cite: 221, 541]
  # relevance_thresholds = [0.0, 0.2, 0.5, 0.8]
  #
  # --- 2. Initialize Grid Results ---
  # grid_results = [] # Store (cell_params, prediction, adjusted_fit)
  #
  # --- 3. Iterate through Grid Cells (theta) ---
  # for var_combo in variable_combinations:
  #   K = len(var_combo)
  #
  #   # --- 3a. Setup for this Variable Subset ---
  #   X_train_sub = X_train[var_combo]
  #   x_t_sub = x_t[var_combo]
  #
  #   # Re-compute stats for this *subset*
  #   x_mean_sub, inv_cov_sub, _ = compute_training_statistics(X_train_sub)
  #
  #   # Get relevance scores for this task, using this subset
  #   relevance_scores = get_relevance_for_task(
  #       x_t_sub, X_train_sub, y_train, x_mean_sub, inv_cov_sub
  #   )
  #
  #   for r_thresh in relevance_thresholds:
  #     # --- 3b. Process this Cell (theta) ---
  #     cell_params = {'vars': var_combo, 'thresh': r_thresh}
  #
  #     # 1. Get prediction weights
  #     weights, mask = calculate_prediction_weights(relevance_scores, r_thresh)
  #
  #     # 2. Get cell prediction (y_hat_theta)
  #     y_hat_theta = np.sum(weights * y_train)
  #
  #     # 3. Get cell reliability
  #     fit = calculate_fit(weights, y_train)
  #     asymmetry = calculate_asymmetry(weights, y_train, mask)
  #     adj_fit = calculate_adjusted_fit(fit, asymmetry, K)
  #
  #     # 4. Store cell results [cite: 222]
  #     grid_results.append((cell_params, y_hat_theta, adj_fit))
  #
  # return pd.DataFrame(grid_results, columns=['params', 'prediction', 'adj_fit'])
  raise NotImplementedError(
      "process_grid_for_one_task is a template stub; implement the grid loop before calling it."
  )

# --- Test the function with the first task ---
# x_t_example = X_test.iloc[0]
# grid_for_task_0 = process_grid_for_one_task(x_t_example, X_train, y_train)
#
# print("Grid results for one task:")
# print(grid_for_task_0.head())

In [None]:
# Cell 7: Composite Prediction and RBI (Single Task)

# --- Calculate Final Prediction ---

def calculate_composite_prediction(grid_df):
  """
  Forms the composite grid prediction (y_hat_grid)
  per Equations 15 and 16 [cite: 225-230].

  Args:
    grid_df: DataFrame with one row per grid cell and at least the columns
      'adj_fit' (adjusted fit of the cell) and 'prediction' (cell prediction).

  Returns:
    Tuple (y_hat_grid, psi_weights):
      y_hat_grid  -- reliability-weighted average of the cell predictions
                     (float); NaN for an empty grid.
      psi_weights -- Series of reliability weights aligned with grid_df;
                     non-negative and summing to one for a non-empty grid.
  """
  if len(grid_df) == 0:
    # Nothing to combine.
    return float('nan'), pd.Series(dtype=float)

  # 1. Adjusted fits. Negative or missing fits carry no reliability, so
  #    they are treated as zero; this keeps every weight non-negative.
  adj_fits = grid_df['adj_fit'].astype(float).clip(lower=0).fillna(0.0)

  # 2. Reliability weights (psi_theta), Eq. 15 [cite: 228]
  total_fit = adj_fits.sum()
  if total_fit > 0:
    psi_weights = adj_fits / total_fit
  else:
    # No cell has positive reliability: fall back to a plain average so the
    # weights still sum to one instead of dividing by zero.
    psi_weights = pd.Series(1.0 / len(adj_fits), index=adj_fits.index)

  # 3. Composite prediction (y_hat_grid), Eq. 16 [cite: 230]
  y_hat_grid = float(np.sum(psi_weights * grid_df['prediction']))

  return y_hat_grid, psi_weights

# y_hat_grid_0, psi_weights_0 = calculate_composite_prediction(grid_for_task_0)
# print(f"Composite Prediction for Task 0: {y_hat_grid_0}")

# --- Calculate RBI ---

def calculate_rbi_for_task(grid_df, all_variables):
  """
  Calculates RBI for every variable for a single task,
  using the grid results per Equation 18 [cite: 375].

  NOT YET IMPLEMENTED: this is a template. Calling it raises
  NotImplementedError instead of silently returning None. The outline
  below is only a simplified interpretation of Equation 18.

  Args:
    grid_df: Grid results for one task, with the columns 'params' (a dict
      holding the cell's 'vars') and 'adj_fit'.
    all_variables: Names of all predictive variables.

  Raises:
    NotImplementedError: Always, until Equation 18 is implemented.
  """
  # Implementation outline:
  # rbi_scores = {}
  #
  # for var_k in all_variables:
  #   # 1. Find cells *with* var_k (delta_k(theta) = 1) [cite: 375]
  #   includes_k_mask = grid_df['params'].apply(lambda p: var_k in p['vars'])
  #   adj_fit_with_k = grid_df.loc[includes_k_mask, 'adj_fit']
  #
  #   # 2. Find cells *without* var_k (delta_k(theta) = 0) [cite: 375]
  #   excludes_k_mask = ~includes_k_mask
  #   adj_fit_without_k = grid_df.loc[excludes_k_mask, 'adj_fit']
  #
  #   # 3. Calculate weighted average difference (per Eq. 18) [cite: 375, 382]
  #   # This is a simplified interpretation:
  #   avg_fit_with_k = adj_fit_with_k.mean()
  #   avg_fit_without_k = adj_fit_without_k.mean()
  #   rbi_k = avg_fit_with_k - avg_fit_without_k
  #
  #   # NOTE: Eq. 18 is more complex, involving scaling (alpha_theta)
  #   # and specific denominators [cite: 377, 383].
  #   # A full implementation would need to build those weights.
  #
  #   rbi_scores[var_k] = rbi_k
  #
  # return pd.Series(rbi_scores)
  raise NotImplementedError(
      "calculate_rbi_for_task is a template stub; implement Equation 18 before calling it."
  )

# rbi_scores_0 = calculate_rbi_for_task(grid_for_task_0, predictive_variables)
# print("RBI Scores for Task 0:")
# print(rbi_scores_0.sort_values(ascending=False))

In [None]:
# Cell 8: Batch Processing (All Tasks)

# --- Loop RBP & RBI for all tasks in the test set ---

# results = []
#
# # Iterate over each prediction task in the test set [cite: 539]
# for index, x_t in X_test.iterrows():
#   # 1. Process the grid for this task
#   grid_df = process_grid_for_one_task(x_t, X_train, y_train)
#
#   # 2. Calculate the composite prediction
#   y_hat_grid, _ = calculate_composite_prediction(grid_df)
#
#   # 3. Calculate the RBI scores for this task
#   rbi_scores = calculate_rbi_for_task(grid_df, predictive_variables)
#
#   # 4. Store results
#   results.append({
#       'task_date': index,
#       'y_actual': y_test.loc[index],
#       'y_pred_grid': y_hat_grid,
#       'rbi_scores': rbi_scores
#   })
#
# # --- Format Results ---
# results_df = pd.DataFrame(results)
# rbi_df = pd.DataFrame(
#     [r['rbi_scores'] for r in results], index=[r['task_date'] for r in results]
# )
#
# print("Batch processing complete.")
# print("RBI DataFrame Head:")
# print(rbi_df.head())

In [None]:
# Cell 9: Comparison: Linear Regression (t-statistic)

# --- Run a standard OLS regression on the training data ---
# This is to get the t-statistics for comparison [cite: 417].

def get_ols_t_stats(X_train, y_train):
  """
  Runs a single OLS regression and returns the t-statistics
  for all variables.

  Args:
    X_train: DataFrame of predictive variables (training sample).
    y_train: Series of outcomes aligned with X_train.

  Returns:
    Series of absolute t-statistics, one per predictive variable (the
    intercept is excluded), sorted from largest to smallest.
  """
  # 1. Add a constant (intercept) to the model
  X_with_const = sm.add_constant(X_train)

  # 2. Fit the OLS model
  model = sm.OLS(y_train, X_with_const).fit()

  # 3. Extract t-statistics (and drop the 'const' row) [cite: 419]
  #    errors='ignore' keeps this safe when no 'const' row is present.
  t_stats = model.tvalues.drop('const', errors='ignore')

  # 4. Return the absolute t-stats for ranking [cite: 547]
  return t_stats.abs().sort_values(ascending=False)

# ols_t_stats = get_ols_t_stats(X_train, y_train)
# print("--- OLS t-statistics (Absolute) ---")
# print(ols_t_stats)

In [None]:
# Cell 10: Analysis & Visualization

# --- 1. Aggregate Importance Comparison (like Exhibit 7) [cite: 554] ---
# We need to calculate the informativeness-weighted average RBI
# (the tau-statistic) across all tasks [cite: 437-438].

def calculate_informativeness(X, x_mean, inv_cov):
  """
  Calculates info(x, x-bar) for all rows in X, per Eq. 3/4 [cite: 143-144].

  The informativeness of a row is its squared Mahalanobis distance from
  the mean vector: (x - x_mean) @ inv_cov @ (x - x_mean).

  Args:
    X: DataFrame (or 2-D array) with one observation per row.
    x_mean: Mean vector of the training variables, one value per column of X.
    inv_cov: Inverse covariance matrix of the training variables.

  Returns:
    pandas Series of info scores, indexed by X.index when X has an index.
  """
  deviations = np.atleast_2d(np.asarray(X, dtype=float)) - np.asarray(x_mean, dtype=float).ravel()
  # Row-wise quadratic form, computed for all rows at once.
  info_scores = np.einsum('ij,jk,ik->i', deviations, np.asarray(inv_cov, dtype=float), deviations)
  return pd.Series(info_scores, index=getattr(X, 'index', None), name='informativeness')

# --- Calculate info-weighted RBI ---
# info_scores_test = calculate_informativeness(
#     X_test, full_train_stats['x_mean'], full_train_stats['inv_cov']
# )
# info_weights = info_scores_test / info_scores_test.sum()
# average_rbi = rbi_df.multiply(info_weights, axis=0).sum()

# --- Create comparison table ---
# comparison_df = pd.DataFrame({
#     't_statistic': ols_t_stats,
#     'info_weighted_rbi': average_rbi
# })
# print("--- Aggregate Importance Comparison ---")
# print(comparison_df.sort_values(by='info_weighted_rbi', ascending=False))


# --- 2. Prediction-Specific RBI Heatmap (like Exhibit 8 & 10) [cite: 559, 612] ---
# This is the key visualization showing RBI's power.

# plt.figure(figsize=(20, 8))
# # We must normalize (standardize) the RBI scores *across time* (by row)
# # or *across variables* (by column) to make the colors comparable.
# # Let's standardize by variable (column-wise)
#
# rbi_normalized = (rbi_df - rbi_df.mean()) / rbi_df.std()
#
# sns.heatmap(
#     rbi_normalized.transpose(),
#     cmap='vlag', # 'vlag' is a good red-white-blue colormap [cite: 559]
#     center=0
# )
# plt.title("Prediction-Specific Variable Importance (RBI) Heatmap")
# plt.xlabel("Prediction Task (Date)")
# plt.ylabel("Predictive Variable")
# plt.show()