# Imports and loading data

In [9]:
import pandas as pd
import sys
import os

# Put the parent of the current working directory on the import path so that the
# `src` package that lives there can be imported below.
# NOTE(review): the path is resolved relative to the working directory at the
# moment the cell runs, so re-running the cell after the directory change below
# appends a different directory -- confirm this is harmless.
sys.path.append(os.path.abspath('..'))

# Project-local helper used for the one-time directory change.
from src.utilities.path_utilities import ensure_parent_dir

# Change to the parent directory. Per the original note this only happens once
# even if the cell is re-executed; the string argument is presumably the marker
# the helper uses to remember that -- TODO confirm against path_utilities.
ensure_parent_dir('notebook_setup')

Directory already changed to: /Users/adrianhajdukiewicz/projects/private/2025_data_biz_hackathon


'/Users/adrianhajdukiewicz/projects/private/2025_data_biz_hackathon'

In [10]:
# Load the prepared modelling table from a path relative to the working directory.
# Later cells read its 'key' and 'target' columns and use every other column as a feature.
df = pd.read_csv("data/y_and_x_ready_for_training.csv")

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit three linear quantile regressors (10th, 30th and 70th percentile) on a 90%
# training split, predict for every row of `df`, and report interval widths,
# error metrics and empirical coverage. Expects `df` to be loaded already.

# Keep the 'key' column aside so predictions can be matched back to their rows.
key_column = df['key'].copy()

# Features are every column except the identifier and the target.
# NOTE(review): in the captured output the predictions drift by a near-constant
# amount from one row to the next, which suggests a row-index-like column may be
# among the features -- confirm the feature list before trusting the models.
df_features = df.drop(['key', 'target'], axis=1)
target = df['target']

# 90/10 split. X_test / y_test stay the held-out 10% so that the "test set only"
# comparison at the end of the cell is computed on rows the models never saw.
X_train, X_test, y_train, y_test = train_test_split(
    df_features, target, test_size=0.1, random_state=42
)

# Create three QuantileRegressor models (alpha multiplies the L1 penalty term).
# Model 1: 10th percentile (quantile=0.1)
model_10th = QuantileRegressor(quantile=0.1, alpha=0.01, solver='highs')

# Model 2: 30th percentile (quantile=0.3)
model_30th = QuantileRegressor(quantile=0.3, alpha=0.01, solver='highs')

# Model 3: 70th percentile (quantile=0.7)
model_70th = QuantileRegressor(quantile=0.7, alpha=0.01, solver='highs')

# Train all three models on the training split only
print("Training 10th percentile model...")
model_10th.fit(X_train, y_train)

print("Training 30th percentile model...")
model_30th.fit(X_train, y_train)

print("Training 70th percentile model...")
model_70th.fit(X_train, y_train)

# Make predictions on the WHOLE dataset (100%); these rows include the training
# data, so the whole-dataset metrics below are mostly in-sample.
print("Making predictions on whole dataset...")
predictions_10th_full = model_10th.predict(df_features)
predictions_30th_full = model_30th.predict(df_features)
predictions_70th_full = model_70th.predict(df_features)

# Create results dataframe with predictions for the whole dataset
results_df = pd.DataFrame({
    'key': key_column,
    'actual_target': target,
    'prediction_10th_percentile': predictions_10th_full,
    'prediction_30th_percentile': predictions_30th_full,
    'prediction_70th_percentile': predictions_70th_full
})

# Calculate prediction interval widths.
# The '_10_80' / '_20_80' suffixes are historical and kept so existing consumers of
# these columns keep working; the values are the 10th-70th and 30th-70th widths.
# NOTE(review): the three models are fitted independently, so predicted quantiles
# can cross (e.g. 70th below 10th) and a width can come out negative.
results_df['prediction_interval_width_10_80'] = (
    results_df['prediction_70th_percentile'] - results_df['prediction_10th_percentile']
)
results_df['prediction_interval_width_20_80'] = (
    results_df['prediction_70th_percentile'] - results_df['prediction_30th_percentile']
)

# Display results
print("\nResults Summary:")
print(f"Number of samples (whole dataset): {len(results_df)}")
print(f"Average prediction interval width (10th-70th): {results_df['prediction_interval_width_10_80'].mean():.4f}")
print(f"Average prediction interval width (30th-70th): {results_df['prediction_interval_width_20_80'].mean():.4f}")
print("\nFirst 10 predictions:")
print(results_df.head(10))

# Optional: Evaluate model performance on whole dataset.
# MAE / RMSE measure distance to the actual target; they do not measure how well
# calibrated a quantile prediction is (the coverage check below does that).
print("\nModel Performance (whole dataset):")
print(f"10th percentile MAE: {mean_absolute_error(target, predictions_10th_full):.4f}")
print(f"30th percentile MAE: {mean_absolute_error(target, predictions_30th_full):.4f}")
print(f"70th percentile MAE: {mean_absolute_error(target, predictions_70th_full):.4f}")
print(f"10th percentile RMSE: {np.sqrt(mean_squared_error(target, predictions_10th_full)):.4f}")
print(f"30th percentile RMSE: {np.sqrt(mean_squared_error(target, predictions_30th_full)):.4f}")
print(f"70th percentile RMSE: {np.sqrt(mean_squared_error(target, predictions_70th_full)):.4f}")

# Check coverage for both intervals on whole dataset
# (variable suffixes are historical, see the width columns above).
within_interval_10_80 = (
    (results_df['actual_target'] >= results_df['prediction_10th_percentile']) &
    (results_df['actual_target'] <= results_df['prediction_70th_percentile'])
)
within_interval_20_80 = (
    (results_df['actual_target'] >= results_df['prediction_30th_percentile']) &
    (results_df['actual_target'] <= results_df['prediction_70th_percentile'])
)

coverage_10_80 = within_interval_10_80.mean()
coverage_20_80 = within_interval_20_80.mean()

# Nominal coverage of a quantile band is the difference of its quantile levels:
# 0.7 - 0.1 = 60% and 0.7 - 0.3 = 40%.
print(f"\nPrediction interval coverage (10th-70th): {coverage_10_80:.2%}")
print("(Expected coverage should be around 60% for 10th-70th percentile interval)")
print(f"Prediction interval coverage (30th-70th): {coverage_20_80:.2%}")
print("(Expected coverage should be around 40% for 30th-70th percentile interval)")

# Additional: Show performance on the held-out test set for comparison
print("\n" + "="*50)
print("COMPARISON: Performance on test set only:")
predictions_10th_test = model_10th.predict(X_test)
predictions_30th_test = model_30th.predict(X_test)
predictions_70th_test = model_70th.predict(X_test)

print(f"Test set size: {len(X_test)}")
print(f"10th percentile MAE (test): {mean_absolute_error(y_test, predictions_10th_test):.4f}")
print(f"30th percentile MAE (test): {mean_absolute_error(y_test, predictions_30th_test):.4f}")
print(f"70th percentile MAE (test): {mean_absolute_error(y_test, predictions_70th_test):.4f}")

Training 10th percentile model...
Training 30th percentile model...
Training 70th percentile model...
Making predictions on whole dataset...

Results Summary:
Number of samples (whole dataset): 34907
Average prediction interval width (10th-70th): 0.9037
Average prediction interval width (30th-70th): 0.6022

First 10 predictions:
                key  actual_target  prediction_10th_percentile  \
0  -10217.0_-1987.0            7.0                   -0.701228   
1  -10417.0_-2087.0          733.0                   -0.701180   
2    -5605.0_2981.0            1.0                   -0.701131   
3    -5608.0_3003.0            1.0                   -0.701082   
4    -5610.0_2972.0            1.0                   -0.701033   
5    -5613.0_2998.0            1.0                   -0.700985   
6    -5615.0_2963.0            1.0                   -0.700936   
7    -5635.0_2997.0            1.0                   -0.700887   
8    -5636.0_2993.0            2.0                   -0.700838   
9    -563

In [16]:
import pandas as pd
import numpy as np

def calculate_percentile_score(df):
    """
    Rank each row by where its actual target falls among the predicted quantiles.

    The thresholds are checked from the highest quantile downward, and the first
    one that the actual value strictly exceeds decides the score:

        actual_target > prediction_70th_percentile  -> 0
        actual_target > prediction_30th_percentile  -> 1
        actual_target > prediction_10th_percentile  -> 2
        none of the above                           -> 3

    Parameters:
    df (pd.DataFrame): Input DataFrame with columns:
        - key
        - actual_target
        - prediction_10th_percentile
        - prediction_30th_percentile
        - prediction_70th_percentile
      The input frame is not modified.

    Returns:
    pd.DataFrame: New DataFrame with columns 'key' and 'score_where_worst_is_0',
        carrying the same index as the input.
    """
    key_frame = df[['key']]
    actual = df['actual_target']

    # One boolean mask per threshold, highest quantile first.
    above_70th = actual > df['prediction_70th_percentile']
    above_30th = actual > df['prediction_30th_percentile']
    above_10th = actual > df['prediction_10th_percentile']

    # Nested selection keeps the "first matching threshold wins" priority, which
    # matters when the predicted quantiles are not in increasing order.
    scores = np.where(
        above_70th,
        0,
        np.where(above_30th, 1, np.where(above_10th, 2, 3)),
    )

    # .assign builds a fresh frame, so the caller's data is left untouched.
    return key_frame.assign(score_where_worst_is_0=scores)




# Score every row of the prediction table. `results_df` is passed directly rather
# than being re-bound to `df`, so the training frame loaded earlier is not
# overwritten and the training cell can still be re-run afterwards.
result = calculate_percentile_score(results_df)
print("Test Results:")
print(result)

# Describe the rule that calculate_percentile_score applies (first match wins).
print("\nExplanation:")
print("actual_target > 70th percentile prediction → score = 0")
print("otherwise, actual_target > 30th percentile prediction → score = 1")
print("otherwise, actual_target > 10th percentile prediction → score = 2")
print("otherwise (at or below the 10th percentile prediction) → score = 3")


Test Results:
                    key  score_where_worst_is_0
0      -10217.0_-1987.0                       0
1      -10417.0_-2087.0                       0
2        -5605.0_2981.0                       1
3        -5608.0_3003.0                       1
4        -5610.0_2972.0                       1
...                 ...                     ...
34902    -7577.0_3201.0                       3
34903    -7591.0_3198.0                       3
34904    -7603.0_2816.0                       3
34905    -8217.0_-987.0                       0
34906    14183.0_-887.0                       3

[34907 rows x 2 columns]

Explanation:
A: 5 <= 10 (10th percentile) → score = 3
B: 10 < 15 <= 20 (between 10th and 30th) → score = 2
C: 20 < 25 <= 30 (between 30th and 70th) → score = 1
D: 35 > 30 (above 70th percentile) → score = 0
E: 45 > 30 (above 70th percentile) → score = 0


In [17]:
# Write the per-key scores to disk; index=False leaves the DataFrame index out of the file.
result.to_csv("data/predictions_with_scores.csv", index=False)

In [18]:
# Rows whose actual target lies strictly below the predicted 10th percentile.
results_df.loc[results_df['actual_target'].lt(results_df['prediction_10th_percentile'])]

Unnamed: 0,key,actual_target,prediction_10th_percentile,prediction_30th_percentile,prediction_70th_percentile,prediction_interval_width_10_80,prediction_interval_width_20_80
10785,-6206.0_3137.0,0.0,0.063112,0.931757,4.285102,4.221989,3.353345
10935,-6208.0_3142.0,0.0,0.253602,2.061796,4.029433,3.775831,1.967636
11079,-6210.0_3142.0,0.0,0.455135,1.480888,4.127348,3.672214,2.646460
11147,-6211.0_3135.0,0.0,0.014406,0.295202,1.473461,1.459055,1.178258
11213,-6212.0_3135.0,0.0,0.433519,0.401342,2.172367,1.738848,1.771025
...,...,...,...,...,...,...,...
34901,-7576.0_3201.0,1.0,1.000097,1.128429,1.000000,-0.000097,-0.128429
34902,-7577.0_3201.0,1.0,1.000146,1.128467,1.000000,-0.000146,-0.128467
34903,-7591.0_3198.0,1.0,1.000195,1.128506,1.000000,-0.000195,-0.128506
34904,-7603.0_2816.0,1.0,1.000244,1.128545,1.000000,-0.000244,-0.128545


In [None]:
# Display the prediction table (key, actual target, the three quantile predictions and both widths).
results_df

Unnamed: 0,key,actual_target,prediction_10th_percentile,prediction_30th_percentile,prediction_70th_percentile,prediction_interval_width_10_80,prediction_interval_width_20_80
24279,-6626.0_3291.0,1.0,0.482305,0.715169,1.000000,0.517695,0.284831
15902,-6294.0_3100.0,0.0,0.065857,0.301405,0.750000,0.684143,0.448595
21307,-6509.0_3091.0,2.0,0.337428,0.599541,1.000000,0.662572,0.400459
15308,-6281.0_3124.0,0.0,0.044994,0.366144,1.000000,0.955006,0.633856
12114,-6225.0_3114.0,2.0,-0.110705,0.340738,1.515245,1.625950,1.174507
...,...,...,...,...,...,...,...
10544,-6203.0_3122.0,1.0,-0.195330,0.092946,0.750000,0.945330,0.657054
25673,-6686.0_3348.0,7.0,0.550258,0.769404,1.000000,0.449742,0.230596
756,-5821.0_3029.0,1.0,-0.664376,-0.200016,1.000000,1.664376,1.200016
7049,-6138.0_3122.0,0.0,-0.378619,-0.178682,0.578555,0.957174,0.757237
