# Imports and loading data

In [8]:
import pandas as pd
import sys
import os

# Make the project root importable so that `src.*` modules resolve.
# The root is taken to be the directory that contains `src/`. Before the working
# directory has been moved, that is presumably the parent of the notebook folder
# ('..'). After ensure_parent_dir has switched the working directory, '..' would
# point one level ABOVE the project, so a re-run of this cell must use the cwd.
current_dir = os.getcwd()
if os.path.isdir(os.path.join(current_dir, 'src')):
    project_root = current_dir
else:
    project_root = os.path.abspath('..')

# Guard against piling up duplicate entries when the cell is executed repeatedly
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the utility function for one-time directory change
from src.utilities.path_utilities import ensure_parent_dir

# Change to parent directory (only happens once even if cell is re-executed)
ensure_parent_dir('notebook_setup')

Directory already changed to: /Users/adrianhajdukiewicz/projects/private/2025_data_biz_hackathon


'/Users/adrianhajdukiewicz/projects/private/2025_data_biz_hackathon'

In [9]:
# Read the training-ready CSV into a DataFrame; the relative path is resolved
# against the current working directory.
df = pd.read_csv(filepath_or_buffer="data/y_and_x_ready_for_training.csv")

# Train Quantile Regression Models

We use the `train_and_evaluate_quantile_models` function from our modelling module to train and evaluate quantile regressors for the 10th, 30th, and 70th percentiles.

In [10]:
from src.modelling.quantile_models import train_and_evaluate_quantile_models

# Keyword arguments forwarded unchanged to the trainer, gathered in one place
# so that they are easy to find and tune.
quantile_fit_settings = dict(
    key_column_name='key',
    target_column_name='target',
    test_size=0.1,
    random_state=42,
    quantiles=[0.1, 0.3, 0.7],
    alpha=0.01,
    solver='highs',
    verbose=True,
)

# Run training + evaluation and unpack the three objects the function returns
models, results_df, metrics = train_and_evaluate_quantile_models(df, **quantile_fit_settings)

Training 10.0% percentile model...
Training 30.0% percentile model...
Training 30.0% percentile model...
Training 70.0% percentile model...
Training 70.0% percentile model...
Making predictions on whole dataset...

Results Summary:
Number of samples (whole dataset): 34906
Average prediction interval width (10th-70th): 0.9078
Average prediction interval width (30th-70th): 0.6075

First 10 predictions:
                key  actual_target  prediction_10th_percentile  \
0  -10223.0_-1977.0            7.0                   -0.704581   
1  -10423.0_-2077.0          733.0                   -0.704532   
2    -5611.0_2991.0            1.0                   -0.704483   
3    -5614.0_3013.0            1.0                   -0.704434   
4    -5616.0_2982.0            1.0                   -0.704386   
5    -5619.0_3008.0            1.0                   -0.704337   
6    -5621.0_2973.0            1.0                   -0.704288   
7    -5641.0_3007.0            1.0                   -0.704239   
8 

# Calculate Percentile Scores

Use the `calculate_percentile_score` function to score each record based on where the actual target value falls relative to the predicted percentiles.

In [11]:
from src.modelling.quantile_models import calculate_percentile_score

# Legend printed beneath the scores, one entry per score value
score_legend = (
    "Score 0: If actual_target > prediction_70th_percentile (worse than expected)",
    "Score 1: If prediction_30th_percentile < actual_target <= prediction_70th_percentile",
    "Score 2: If prediction_10th_percentile < actual_target <= prediction_30th_percentile",
    "Score 3: If actual_target <= prediction_10th_percentile (better than expected)",
)

# Score every row of the prediction table by its position among the percentiles
result = calculate_percentile_score(results_df)

print("Test Results:")
print(result)
print("\nExplanation:")
for legend_line in score_legend:
    print(legend_line)

Test Results:
                    key  score_where_worst_is_0
0      -10223.0_-1977.0                       0
1      -10423.0_-2077.0                       0
2        -5611.0_2991.0                       1
3        -5614.0_3013.0                       1
4        -5616.0_2982.0                       1
...                 ...                     ...
34901    -7583.0_3211.0                       2
34902    -7597.0_3208.0                       2
34903    -7609.0_2826.0                       2
34904    -8223.0_-977.0                       0
34905    14177.0_-877.0                       3

[34906 rows x 2 columns]

Explanation:
Score 0: If actual_target > prediction_70th_percentile (worse than expected)
Score 1: If prediction_30th_percentile < actual_target <= prediction_70th_percentile
Score 2: If prediction_10th_percentile < actual_target <= prediction_30th_percentile
Score 3: If actual_target <= prediction_10th_percentile (better than expected)


In [12]:
# Persist the score table as CSV, omitting the DataFrame index column
result.to_csv(path_or_buf="data/predictions_with_scores.csv", index=False)

# Analyze Scores


In [13]:
# Rows whose observed target lies strictly under the predicted 10th percentile
under_p10_mask = results_df['actual_target'].lt(results_df['prediction_10th_percentile'])
results_df[under_p10_mask]

Unnamed: 0,key,actual_target,prediction_10th_percentile,prediction_30th_percentile,prediction_70th_percentile,prediction_interval_width_10_70,prediction_interval_width_30_70
10785,-6212.0_3147.0,0.0,0.057032,0.969024,4.397999,4.340968,3.428975
10935,-6214.0_3152.0,0.0,0.229739,1.869849,3.828069,3.598330,1.958219
11079,-6216.0_3152.0,0.0,0.448602,1.756191,4.565056,4.116454,2.808865
11147,-6217.0_3145.0,0.0,0.013632,0.480733,1.802388,1.788756,1.321655
11213,-6218.0_3145.0,0.0,0.435628,0.976590,3.238166,2.802539,2.261576
...,...,...,...,...,...,...,...
22621,-6568.0_3160.0,0.0,0.400176,0.649309,1.000000,0.599824,0.350691
22622,-6568.0_3166.0,0.0,0.400225,0.650117,1.000000,0.599775,0.349883
22623,-6568.0_3168.0,0.0,0.400273,0.650156,1.000000,0.599727,0.349844
22763,-6576.0_3167.0,0.0,0.407111,0.655603,1.000000,0.592889,0.344397


In [14]:
# Full results dataframe: left as the bare last expression so the notebook
# renders it as this cell's output.
# NOTE(review): in the rendered tail, prediction_30th_percentile is larger than
# prediction_70th_percentile (negative prediction_interval_width_30_70), so the
# fitted quantiles appear to cross there — confirm this is acceptable for the scoring.
results_df

Unnamed: 0,key,actual_target,prediction_10th_percentile,prediction_30th_percentile,prediction_70th_percentile,prediction_interval_width_10_70,prediction_interval_width_30_70
0,-10223.0_-1977.0,7.0,-0.704581,-0.230117,1.0,1.704581e+00,1.230117
1,-10423.0_-2077.0,733.0,-0.704532,-0.230078,1.0,1.704532e+00,1.230078
2,-5611.0_2991.0,1.0,-0.704483,-0.230039,1.0,1.704483e+00,1.230039
3,-5614.0_3013.0,1.0,-0.704434,-0.230000,1.0,1.704434e+00,1.230000
4,-5616.0_2982.0,1.0,-0.704386,-0.229961,1.0,1.704386e+00,1.229961
...,...,...,...,...,...,...,...
34901,-7583.0_3211.0,1.0,0.999902,1.127899,1.0,9.767533e-05,-0.127899
34902,-7597.0_3208.0,1.0,0.999951,1.127938,1.0,4.883766e-05,-0.127938
34903,-7609.0_2826.0,1.0,1.000000,1.127977,1.0,1.110223e-16,-0.127977
34904,-8223.0_-977.0,2.0,1.000049,1.128016,1.0,-4.883766e-05,-0.128016
