In [16]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import math


In [17]:
# Read the training set into `df`, the working DataFrame used by the cells below.
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../training_data.csv')

In [18]:
def create_numeric_percentile_bins(df, column_name, num_bins=4):
    """
    Bin a column into quantile-based groups coded 1, 2, 3, ... in ascending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to bin. It is not modified.
    column_name : str
        Name of the numeric column to bin.
    num_bins : int, default 4
        Number of quantile bins requested (4 = quartiles).

    Returns
    -------
    tuple
        ``(codes, bin_edges)`` where ``codes`` is an integer Series aligned with
        ``df`` (1 = lowest bin) and ``bin_edges`` is the array of edges that were
        actually used; it has ``number_of_bins + 1`` entries.

    Notes
    -----
    When the column has many tied values, several quantile edges can coincide.
    ``duplicates='drop'`` then merges those bins, so fewer than ``num_bins``
    codes may be produced. Passing a fixed list of ``num_bins`` labels in that
    situation makes ``pd.qcut`` raise a ValueError, which is why positional
    codes are requested instead and shifted to start at 1.

    Missing values in the column cannot be converted to an integer code and
    make the final ``astype(int)`` raise.
    """
    # A single qcut call yields both the 0-based bin positions and the edges.
    bin_positions, bin_edges = pd.qcut(
        df[column_name],
        q=num_bins,
        labels=False,
        retbins=True,
        duplicates='drop',
    )

    # Shift from 0-based positions to the 1-based codes callers expect.
    return (bin_positions + 1).astype(int), bin_edges

variables_to_bin = [f'sensor_{i}' for i in range(8)] + ['goal_direction']

for var in variables_to_bin:
    binned_name = f'{var}_Q'

    # Create numeric bins and attach them to the dataframe as <var>_Q
    binned_col, edges = create_numeric_percentile_bins(df, var, num_bins=4)
    df[binned_name] = binned_col

    # Count each code once. Converting to built-in ints keeps the printed dict
    # free of NumPy scalar reprs such as `np.int64(12065)`.
    code_counts = {
        int(code): int(n_obs)
        for code, n_obs in df[binned_name].value_counts().sort_index().items()
    }

    # Print bin information
    print(f"\n{var}_Q:")
    print(f"  Overall range: {df[var].min():.2f} to {df[var].max():.2f}")
    print("  Quartile boundaries and coding:")

    # Consecutive edge pairs delimit one bin each; codes start at 1.
    for quartile_num, (start_val, end_val) in enumerate(zip(edges[:-1], edges[1:]), start=1):
        count = code_counts.get(quartile_num, 0)
        percentage = count / len(df) * 100

        print(f"    {quartile_num}: {start_val:8.2f} to {end_val:8.2f} | {count:,} obs ({percentage:.1f}%)")

    # Show the numeric distribution
    print(f"  Value counts: {code_counts}")


sensor_0_Q:
  Overall range: 1.00 to 20.00
  Quartile boundaries and coding:
    1:     1.00 to     3.00 | 12,065 obs (34.0%)
    2:     3.00 to     5.00 | 6,287 obs (17.7%)
    3:     5.00 to     9.00 | 9,496 obs (26.8%)
    4:     9.00 to    20.00 | 7,604 obs (21.4%)
  Value counts: {1: np.int64(12065), 2: np.int64(6287), 3: np.int64(9496), 4: np.int64(7604)}

sensor_1_Q:
  Overall range: 1.00 to 20.00
  Quartile boundaries and coding:
    1:     1.00 to     3.00 | 12,854 obs (36.3%)
    2:     3.00 to     5.00 | 7,741 obs (21.8%)
    3:     5.00 to     7.00 | 6,106 obs (17.2%)
    4:     7.00 to    20.00 | 8,751 obs (24.7%)
  Value counts: {1: np.int64(12854), 2: np.int64(7741), 3: np.int64(6106), 4: np.int64(8751)}

sensor_2_Q:
  Overall range: 1.00 to 20.00
  Quartile boundaries and coding:
    1:     1.00 to     3.00 | 10,749 obs (30.3%)
    2:     3.00 to     6.00 | 9,153 obs (25.8%)
    3:     6.00 to     9.00 | 7,065 obs (19.9%)
    4:     9.00 to    20.00 | 8,485 obs (23.9%)

In [23]:
# Binned sensor columns that together define one "pattern".
sensor_q_cols = [f'sensor_{i}_Q' for i in range(8)]

# Without goal_direction: number of distinct actions seen for each sensor pattern
unique_sensor_patterns = df.groupby(sensor_q_cols)['action'].nunique()

# This name used to be bound to a second, identical groupby; it is kept as an
# alias so any cell that still reads it gets the same result without the
# redundant computation.
unique_patterns = unique_sensor_patterns

print("Unique sensor patterns (without goal_direction):", len(unique_sensor_patterns))

# With goal_direction: number of distinct actions per sensor + goal_direction pattern
unique_patterns_with_goal = df.groupby(sensor_q_cols + ['goal_direction_Q'])['action'].nunique()

print("Unique patterns (with goal_direction):", len(unique_patterns_with_goal))

# Compare label consistency: a pattern is "conflicted" when more than one
# distinct action was recorded for it.
conflicted_without_goal = (unique_sensor_patterns > 1).sum()
conflicted_with_goal = (unique_patterns_with_goal > 1).sum()

print("\nLabel conflicts:")
for label, n_conflicted, patterns in (
    ("Without goal_direction", conflicted_without_goal, unique_sensor_patterns),
    ("With goal_direction", conflicted_with_goal, unique_patterns_with_goal),
):
    n_patterns = len(patterns)
    # Guard the ratio so an empty frame reports 0.0% instead of dividing by zero.
    conflict_share = n_conflicted / n_patterns if n_patterns else 0.0
    print(f"{label}: {n_conflicted}/{n_patterns} patterns have multiple actions ({conflict_share:.1%})")



Unique sensor patterns (without goal_direction): 21128
Unique patterns (with goal_direction): 29213

Label conflicts:
Without goal_direction: 6700/21128 patterns have multiple actions (31.7%)
With goal_direction: 2568/29213 patterns have multiple actions (8.8%)
number of unique patterns:  21128
