# Example Experiment and Results
Here we provide an example experiment that measures the underspecification indexes and the prediction performance on the Bike Sharing dataset for training sets of different lengths.

Information on the Bike Share dataset can be found here: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Due to the size of the dataset, the underspecification index calculations take a considerable amount of time to compute. We therefore recommend treating this notebook as a walkthrough for understanding the method rather than as a notebook to re-run end to end.

In [16]:
import pandas as pd
import numpy  as np

from sklearn.model_selection import train_test_split

from underspecification_index.experiment_control import compare_datasets, check_average_performance

from numba import typed

import altair as alt

## Read and prepare dataset

In [21]:
df = pd.read_csv("Datasets/hour.csv")

# Drop instances with incomplete values.
# This has to happen before the categorical encoding below: `.cat.codes`
# encodes a missing value as -1, so a dropna() run afterwards would never
# see missing entries in the non-numeric columns.
prior_len = len(df)

df = df.dropna()

print(f"Dropped {prior_len - len(df)} incomplete instances from dataset")

# Convert all categorical (non-numeric) features to integer category codes
for col in df.columns:
    if not np.issubdtype(df[col].dtype, np.number):
        df[col] = pd.Categorical(df[col]).codes

df

Dropped 0 incomplete instances from dataset


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,0,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,0,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,0,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,0,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,730,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,730,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,730,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,730,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


## Split dataset

In [23]:
# Create train and test sets.
# A fixed random_state keeps the split, and therefore every training prefix
# taken from it below, identical across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# NOTE(review): the feature matrices keep every column except "cnt",
# including "casual" and "registered"; in the rows displayed above these two
# sum to "cnt" - confirm that keeping them as features is intended.
x_trn = train.drop(columns=["cnt"]).to_numpy()
y_trn = train["cnt"].to_numpy()

x_tst = test.drop(columns=["cnt"]).to_numpy()
y_tst = test["cnt"].to_numpy()

# Take prefixes of the training set at varying lengths; the test set is
# shared by all of them.
dataset_lengths = [10, 20, 50, 100, 500, 1000, 5000, 10000, 13900]

# A slice longer than the array is silently truncated, which would make the
# reported training length wrong, so fail loudly instead.
assert max(dataset_lengths) <= len(x_trn), (
    f"Largest requested length {max(dataset_lengths)} exceeds "
    f"the {len(x_trn)} available training instances"
)

x_trns = [x_trn[:length] for length in dataset_lengths]
y_trns = [y_trn[:length] for length in dataset_lengths]

## Test Average Performance

In [24]:
# One average performance value per training-set length, evaluated on the
# shared test set (printed below as "Average Error").
avg_perfs = check_average_performance(
    x_trns, y_trns,
    x_tst, y_tst,
    model_count=100, tree_count=10, regressor=True,
)

for idx, avg_error in enumerate(avg_perfs):
    print(f"""
            Dataset length: {dataset_lengths[idx]:03}
            Average Error : {avg_error:.2f}"""
         )


            Dataset length: 010
            Average Error : 15758.09

            Dataset length: 020
            Average Error : 7918.47

            Dataset length: 050
            Average Error : 4506.18

            Dataset length: 100
            Average Error : 2905.41

            Dataset length: 500
            Average Error : 282.88

            Dataset length: 1000
            Average Error : 132.06

            Dataset length: 5000
            Average Error : 36.28

            Dataset length: 10000
            Average Error : 19.64

            Dataset length: 13900
            Average Error : 14.26


## Calculate Underspecification Indexes

In [25]:
# Index names handed to compare_datasets, wrapped in a numba typed list.
# Presumably Euclidean distance, cosine similarity, Pearson correlation and
# Kendall's tau, in that order - verify against compare_datasets.
metrics = typed.List(["e_dis", "c_sim", "p_cor", "k_tau"])

# Keyword options for the run, kept together so they are easy to tune
run_options = dict(
    model_count=100,
    tree_count=10,
    save_name="bike",
    regressor=True,
)

# Left as the last expression so the returned per-length indexes are displayed
compare_datasets(x_trns, y_trns, x_tst, y_tst, avg_perfs, metrics, **run_options)

Generating explanation matrix for dataset of length 10 with theta 15758.092743411964
Computing local indexes for dataset of length 10
Generating explanation matrix for dataset of length 20 with theta 7918.471478078255
Computing local indexes for dataset of length 20
Generating explanation matrix for dataset of length 50 with theta 4506.179991858455
Computing local indexes for dataset of length 50
Generating explanation matrix for dataset of length 100 with theta 2905.4082489067905
Computing local indexes for dataset of length 100
Generating explanation matrix for dataset of length 500 with theta 282.8811924913694
Computing local indexes for dataset of length 500
Generating explanation matrix for dataset of length 1000 with theta 132.055317520138
Computing local indexes for dataset of length 1000
Generating explanation matrix for dataset of length 5000 with theta 36.27817557537399
Computing local indexes for dataset of length 5000
Generating explanation matrix for dataset of length 1000

{10: array([59.13875822,  0.67872641,  0.66878052,  0.1661612 ]),
 20: array([38.62908181,  0.88815712,  0.88533446,  0.22534523]),
 50: array([20.98761745,  0.97033128,  0.96994659,  0.22646358]),
 100: array([13.0628638 ,  0.98584461,  0.98592082,  0.28330647]),
 500: array([7.41809813, 0.99233734, 0.99239274, 0.28920584]),
 1000: array([5.21722303, 0.99541034, 0.99540618, 0.28272132]),
 5000: array([2.43360286, 0.99907755, 0.99906377, 0.28600466]),
 10000: array([1.82815793, 0.99912292, 0.9991001 , 0.27343043]),
 13900: array([1.59946899, 0.99925493, 0.99922529, 0.27190138])}

# Visualising Results

## Read and Prepare Results

In [26]:
# Display names for the result columns, listed in the column order of the
# averaged frame built below (they are applied by position, not by name).
readable_columns = [
    "trn_length",
    "Euclidean Distance",
    "Cosine Similarity",
    "Pearson Correlation",
    "Kendall Rank Correlation",
    "Prediction Variance",
    "Prediction Accuracy",
]

# Load the saved indexes and average over strata for each training length
results = (
    pd.read_csv("bike_Indexes.csv")
    .groupby("trn_length")
    .mean()
    .reset_index()
)

# Change column names
results.columns = readable_columns

results

Unnamed: 0,trn_length,Euclidean Distance,Cosine Similarity,Pearson Correlation,Kendall Rank Correlation,Prediction Variance,Prediction Accuracy
0,10,59.138758,0.678726,0.668781,0.166161,1545.702911,12463.370387
1,20,38.629082,0.888157,0.885334,0.225345,612.905745,6723.725159
2,50,20.987617,0.970331,0.969947,0.226464,255.697758,4105.581962
3,100,13.062864,0.985845,0.985921,0.283306,139.519656,2734.179916
4,500,7.418098,0.992337,0.992393,0.289206,63.221843,251.131198
5,1000,5.217223,0.99541,0.995406,0.282721,30.886766,123.538453
6,5000,2.433603,0.999078,0.999064,0.286005,8.623867,33.884791
7,10000,1.828158,0.999123,0.9991,0.27343,4.502162,18.630723
8,13900,1.599469,0.999255,0.999225,0.271901,3.415643,13.552486


## Visualisation Functions

## $U_X$ and Prediction Accuracy Against Training Length

In [27]:
# Accuracy against metric as in PRICAI
def metric_pred_acc(df, metric, direct=False, regressor=False):
    """Layer one index (bars) and the prediction score (line) over training length.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns "trn_length", "Prediction Accuracy" and
        the column named by `metric`.
    metric : str
        Name of the index column drawn as bars; also used to build the
        left axis title "<metric> Based Index".
    direct : bool
        When True, the axis label/title font sizes are configured on the
        returned chart itself. agreement_metrics leaves this False and
        configures the concatenated chart instead.
    regressor : bool
        When True the line's axis is titled "Prediction Mean Squared Error",
        otherwise "Prediction Accuracy". Only the title changes; the plotted
        column is always "Prediction Accuracy".

    Returns
    -------
    A 1200x600 layered Altair chart with independent y scales.
    """
    line_axis_title = (
        "Prediction Mean Squared Error" if regressor else "Prediction Accuracy"
    )

    # Correlation between the score column and the index, shown as "r: ..."
    r_value = df["Prediction Accuracy"].corr(df[metric])

    # Every layer shares the same ordinal training-length x axis
    shared_x = alt.Chart(df).encode(
        x=alt.X("trn_length:O", axis=alt.Axis(title="Training length"))
    )

    index_bars = shared_x.mark_bar(color="blue", opacity=0.4).encode(
        y=alt.Y(
            metric,
            scale=alt.Scale(zero=False),
            axis=alt.Axis(title=f"{metric} Based Index", titleColor="blue"),
        )
    )

    score_line = shared_x.mark_line(color="red", opacity=0.7).encode(
        y=alt.Y(
            "Prediction Accuracy:Q",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(titleColor="red"),
            title=line_axis_title,
        )
    )

    # Fixed pixel position (10, 10) pins the annotation to the top-left corner
    r_label = shared_x.mark_text(align="left", baseline="top", size=25).encode(
        x=alt.value(10),
        y=alt.value(10),
        text=alt.value(f"r: {r_value:.3f}"),
    )

    # Bars and annotation form the inner layer; the line is added on top with
    # its own y scale so both axes stay readable.
    layered = alt.layer(
        alt.layer(index_bars, r_label),
        score_line,
    ).resolve_scale(
        y="independent"
    ).properties(
        width=1200,
        height=600,
    )

    if not direct:
        return layered

    return layered.configure_axis(labelFontSize=20, titleFontSize=20)

In [31]:
# Standalone chart for one index: configure the axes on this chart directly
# and title the line axis as a mean squared error.
metric_pred_acc(results, "Cosine Similarity", direct=True, regressor=True)

## Layered Chart for all Metrics

In [32]:
# Layered graph as above for each metric
def agreement_metrics(df, regressor=False):
    """Stack one metric_pred_acc chart per index into a single vertical figure.

    Parameters
    ----------
    df : DataFrame
        Results frame holding "trn_length", "Prediction Accuracy" and the
        four index columns named below.
    regressor : bool
        Forwarded to metric_pred_acc so the line axis of every panel can be
        titled "Prediction Mean Squared Error" for regression results. The
        default (False) keeps the previous "Prediction Accuracy" title.

    Returns
    -------
    The vertically concatenated Altair chart with axis font sizes configured.
    """
    index_columns = (
        "Euclidean Distance",
        "Cosine Similarity",
        "Pearson Correlation",
        "Kendall Rank Correlation",
    )

    # `direct` stays False for the panels: the axis configuration is applied
    # once, on the concatenated chart below.
    panels = [
        metric_pred_acc(df, column, regressor=regressor)
        for column in index_columns
    ]

    agreement = alt.vconcat(*panels).configure_axis(
            labelFontSize=20,
            titleFontSize=20
    )

    return agreement

In [33]:
# Render the stacked charts for all four indexes.
# NOTE(review): this call does not request the regression axis title, so the
# line axes read "Prediction Accuracy" here - confirm that is intended for
# this regressor=True experiment.
agreement_metrics(results)