Comparing several methods for creating out-of-sample predictive intervals

Methods:
- Quantile of residuals
- Quantile regression
- Jackknife+
- Bayesian regression


In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Import python modules

In [10]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from copy import deepcopy as dcp

# Put the parent directory on the import path; the `predictive_intervals`
# imports in the next cell presumably rely on this — confirm against the repo layout.
sys.path.append("..")

### Import predictive interval modules

In [3]:
# Import data simulator
from predictive_intervals.data_simulator import DataSimulator

# Import models
from predictive_intervals.models.predictive_intervals_model import PredictiveIntervalModel
from predictive_intervals.models.linear_regression import LinearModel
from predictive_intervals.models.quantile_regression import QuantileRegression
from predictive_intervals.models.jacknife_plus_regression import JacknifePlus
from predictive_intervals.models.bayesian_regression import BayesianRegression

Define data simulator parameters

In [4]:
# --- Simulation settings -------------------------------------------------
# How many data points to sample, and the split setting forwarded to the
# simulator as `pct_train` (presumably the training fraction — confirm).
n_points = 10000
pct_train = 0.5

# Parameters of the simulated linear model.
# NOTE(review): the simulation cell passes np.sqrt(sigma), so `sigma` here
# appears to be a variance rather than a standard deviation — confirm.
sigma = 3.0
alpha = 1.0
beta = 2.0

Simulate data

In [5]:
# Draw one simulated data set from the parameters defined above.
# The square root of `sigma` is what gets passed as the simulator's `sigma`.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# the simulated data presumably differs between kernel sessions — confirm.
data_sim = DataSimulator.generate_lm_data(
    n_points=n_points,
    pct_train=pct_train,
    alpha=alpha,
    beta=beta,
    sigma=np.sqrt(sigma),
)

In [None]:

# Quick check of the linear-model intervals on the simulated data.
# `lm` is not defined anywhere earlier in the notebook, so the model is built
# here; 0.1 mirrors the interval level used for the experiments further down.
# NOTE(review): assumes `lm` was meant to be a LinearModel instance — confirm.
lm = LinearModel(alpha=0.1)
lm_pred_ints = lm.get_predictive_intervals(data=data_sim)

In [None]:

# Quick check of the quantile-regression intervals on the simulated data.
# `qr` is not defined anywhere earlier in the notebook, so the model is built
# here; 0.1 mirrors the interval level used for the experiments further down.
# NOTE(review): assumes `qr` was meant to be a QuantileRegression instance — confirm.
qr = QuantileRegression(alpha=0.1)
aux = qr.get_predictive_intervals(data=data_sim)

In [None]:

# Quick check of the jackknife+ intervals on the simulated data.
# `jknf` is not defined anywhere earlier in the notebook, so the model is built
# here; 0.1 mirrors the interval level used for the experiments further down.
# NOTE(review): assumes `jknf` was meant to be a JacknifePlus instance — confirm.
jknf = JacknifePlus(alpha=0.1)
aux = jknf.get_predictive_intervals(data=data_sim, pct_sample=0.1)

In [None]:

# Quick check of the Bayesian-regression intervals on the simulated data.
# `br` is not defined anywhere earlier in the notebook, so the model is built
# here; 0.1 mirrors the interval level used for the experiments further down.
# NOTE(review): assumes `br` was meant to be a BayesianRegression instance — confirm.
br = BayesianRegression(alpha=0.1)
# Pass the data by keyword, like every other get_predictive_intervals call in the notebook.
ints = br.get_predictive_intervals(data=data_sim)

# Run Experiments

Run a few repeated experiments, time them, and collect statistics.

In [None]:
# Number of repetitions passed to run_experiment for each method in the loop below.
n_experiments = 30 # or as Andrew Gelman would say, in statistics, 30 = infinity

Set desired level of the predictive interval and instantiate all models

In [6]:
# Level handed to every interval model as its `alpha` argument.
# NOTE(review): the linear-model output further down shows hit_ratio 0.9 with
# alpha = 0.1, so this is presumably the miscoverage level (1 - coverage) — confirm.
# NOTE(review): this rebinds `alpha`, which the simulation cell also passes to
# DataSimulator.generate_lm_data (set to 1. there). Re-running the simulation
# cell after this one would simulate with alpha = 0.1 instead.
alpha = 0.1

In [7]:
# One instance per interval method, each constructed with the same `alpha`.
linear_regression = LinearModel(alpha=alpha)
quantile_regression = QuantileRegression(alpha=alpha)
jacknife_plus_regression = JacknifePlus(alpha=alpha)
bayesian_regression = BayesianRegression(alpha=alpha)

# Gathered in a single list so the experiment loop can iterate over them.
methods = [
    linear_regression,
    quantile_regression,
    jacknife_plus_regression,
    bayesian_regression,
]

In [14]:
def run_experiment(method: "PredictiveIntervalModel", n_experiments: int = 1, data=None):
    """Run one interval method repeatedly and collect timing and quality statistics.

    Parameters
    ----------
    method : PredictiveIntervalModel
        Object whose ``get_predictive_intervals(data=...)`` call is timed.
        Its ``hit_ratio`` and ``avg_length`` attributes are read after each call.
    n_experiments : int, default 1
        Number of repetitions. A value of 0 or less yields an empty frame.
    data : optional
        Data set passed to ``get_predictive_intervals``. When omitted, the
        notebook-level ``data_sim`` is used, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per repetition with columns ``time`` (elapsed seconds),
        ``hit_ratio`` and ``int_length``.
    """
    # Fall back to the notebook's simulated data set only when none is supplied.
    if data is None:
        data = data_sim

    records = []
    for _ in range(n_experiments):
        # perf_counter is a monotonic, high-resolution clock; the calls timed
        # here can finish in well under a millisecond.
        start = time.perf_counter()
        method.get_predictive_intervals(data=data)
        elapsed = time.perf_counter() - start

        records.append((elapsed, method.hit_ratio, method.avg_length))

    return pd.DataFrame(records, columns=['time', 'hit_ratio', 'int_length'])

In [None]:
# Run every method and keep the frame returned by run_experiment, keyed by the
# method's class name, so the statistics can be inspected afterwards.
experiment_results = {}
for method in methods:
    experiment_results[type(method).__name__] = run_experiment(
        method=method, n_experiments=n_experiments
    )

In [15]:
# 10-repetition run of the linear model only; the result is kept in `aux`
# and displayed in the next cell.
aux = run_experiment(method=linear_regression, n_experiments=10)

In [16]:
# Show the per-repetition results for the linear model.
# NOTE(review): every repetition calls the method on the same `data_sim`, so
# for this method hit_ratio and int_length repeat in every row of the output
# below and only `time` varies.
aux

Unnamed: 0,time,hit_ratio,int_length
0,0.000627,0.9,5.833328
1,0.000466,0.9,5.833328
2,0.000425,0.9,5.833328
3,0.000373,0.9,5.833328
4,0.000308,0.9,5.833328
5,0.000256,0.9,5.833328
6,0.000247,0.9,5.833328
7,0.000319,0.9,5.833328
8,0.000332,0.9,5.833328
9,0.000382,0.9,5.833328


In [None]:
    # Compute intervals with the remaining three methods on the same simulated data.
    # NOTE(review): these lines are indented although nothing in this cell
    # encloses them — looks like the leftover body of a loop or function;
    # dedent or remove — confirm.
    qr_pred_ints = quantile_regression.get_predictive_intervals(data=data_sim)
    jr_pred_ints = jacknife_plus_regression.get_predictive_intervals(data=data_sim)
    br_pred_ints = bayesian_regression.get_predictive_intervals(data=data_sim)