In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import shap
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import plotly.express as px



from catboost import CatBoostRegressor

import random

# Seed every random source the notebook draws from so a fresh run is reproducible:
#   * `random`    -> used by random.shuffle when splitting files into train/test
#   * `np.random` -> used by the binomial / Poisson draws in the win simulations
random.seed(42)
np.random.seed(42)

In [None]:
# Root folder of the per-race tracking CSVs; passed to pipeline(), fetch_clean_data()
# and get_files() further down.
folder_path = 'data/horse-tracking-data'
# Not referenced by any other cell visible in this notebook.
derby_path = 'data/big-data-derby-2022'
def get_files(folder_path):
    """Recursively collect every ``.csv`` file below ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory to walk. A missing directory simply yields two empty lists.

    Returns
    -------
    file_list : list[str]
        Full paths of the CSV files, sorted so the order does not depend on the
        filesystem's directory-listing order.
    race_list : list[str]
        File names without directory and without the final extension,
        index-aligned with ``file_list``.
    """
    file_list = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(folder_path)
        for name in names
        if name.endswith('.csv')
    )
    # splitext strips only the trailing ".csv", so a name such as "race.v2.csv"
    # keeps its full stem ("race.v2") instead of being cut at the first dot.
    race_list = [os.path.splitext(os.path.basename(path))[0] for path in file_list]
    return file_list, race_list

In [None]:
def train_test_file_split(file_list, train_fraction=0.8, seed=42):
    """Split a list of files into train and test subsets, deterministically.

    The split depends only on the *set* of files and on ``seed``: the input is
    sorted and then shuffled with a private ``random.Random(seed)``. Calling the
    function again (as later cells do) therefore reproduces exactly the split the
    model was trained on, and the caller's list is never reordered.

    Parameters
    ----------
    file_list : list[str]
        Files to split. Not modified.
    train_fraction : float, default 0.8
        Share of files assigned to the training subset; must lie in [0, 1].
    seed : int, default 42
        Seed of the private shuffle.

    Returns
    -------
    (train_files, test_files) : tuple[list[str], list[str]]

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # Sorted copy -> independent of the incoming order and of earlier shuffles.
    ordered = sorted(file_list)
    random.Random(seed).shuffle(ordered)

    split_index = int(train_fraction * len(ordered))
    train_files = ordered[:split_index]
    test_files = ordered[split_index:]

    print(f"Total Files : {len(file_list)}")
    print(f"Number of training files: {len(train_files)}")
    print(f"Number of testing files: {len(test_files)}")

    return train_files, test_files

### Initialize Dataframes for Training and Testing

In [None]:
# Initialize DataFrames for train and test
def _load_race_frames(files):
    """Read every CSV in ``files`` and stack the rows into a single DataFrame."""
    frames = [pd.read_csv(path) for path in tqdm(files)]
    if not frames:
        # pd.concat([]) fails with the opaque "No objects to concatenate".
        raise ValueError("No CSV files to load: the file list is empty.")
    return pd.concat(frames, ignore_index=True)


def get_dataframes(train_files, test_files):
    """Load the train and test CSV files into two combined DataFrames.

    Parameters
    ----------
    train_files, test_files : list[str]
        Paths of the CSV files belonging to each subset.

    Returns
    -------
    (train_df, test_df) : tuple[pandas.DataFrame, pandas.DataFrame]
        All rows of the respective files for which ``is_race_going`` is true.

    Raises
    ------
    ValueError
        If either file list is empty.
    """
    train_df = _load_race_frames(train_files)
    test_df = _load_race_frames(test_files)

    # Keep only the rows flagged by `is_race_going`; this filters rows, not columns.
    train_df = train_df[train_df['is_race_going']]
    test_df = test_df[test_df['is_race_going']]

    print(f"Training data shape: {train_df.shape}")
    print(f"Testing data shape: {test_df.shape}")

    return train_df, test_df

In [None]:
# Full candidate feature set, one name per line and grouped by kind.
# A later cell (just before fetch_clean_data) rebinds `feature_columns` to a
# smaller subset, and that later binding is the one the functions see at call time.
feature_columns = [
    # race state
    'cumulative_distance_travelled',
    'position',
    'distance_to_leader',
    # current kinematics
    'speed_1s',
    'acceleration_1s',
    # speed lags
    'speed_1s_lag1',
    'speed_1s_lag2',
    'speed_1s_lag3',
    # acceleration lags
    'acceleration_1s_lag1',
    'acceleration_1s_lag2',
    'acceleration_1s_lag3',
    # distance-to-leader lags
    'distance_to_leader_lag1',
    'distance_to_leader_lag2',
    'distance_to_leader_lag3',
    # distance left to run
    'remaining_distance',
    'leader_remaining_distance',
]

# Kept as a one-element list so `df[target]` yields a single-column DataFrame.
target = ['target_variable']

In [None]:
def get_X_y_splits(train_df, test_df):
    """Project the train/test frames onto the feature and target columns.

    Uses the module-level ``feature_columns`` and ``target`` lists as they are
    bound at call time.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
        Column selections of the inputs; the y frames are single-column DataFrames.
    """
    X_train, y_train = train_df[feature_columns], train_df[target]
    X_test, y_test = test_df[feature_columns], test_df[target]

    named_parts = (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )
    for label, part in named_parts:
        print(f"{label} shape: {part.shape}")

    return X_train, X_test, y_train, y_test

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()

def _clamp_and_rescale_target(y):
    """Return a copy of ``y`` with ``target_variable`` clamped and multiplied by 100.

    Values below 0 become 0 and values >= 1 become 0.999999; everything else
    (including NaN) is left as is before the x100 rescale.
    """
    y = y.copy()  # never write into the caller's frame (it is a slice of the raw data)
    raw = y['target_variable']
    clamped = raw.mask(raw < 0, 0).mask(raw >= 1, 0.999999)
    # temporary trial: express the target on a 0-100 scale
    y['target_variable'] = clamped * 100
    return y


def scale_data(X_train, X_test, y_train, y_test):
    """Standardise the features and prepare the target for training.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. The scaler is fitted on ``X_train`` only.
    y_train, y_test : pandas.DataFrame
        Frames containing a ``target_variable`` column. They are not modified;
        transformed copies are returned.

    Returns
    -------
    (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
        Scaled feature DataFrames (same columns and row index as the inputs), the
        clamped and x100-rescaled targets, and the fitted ``StandardScaler``.
    """
    scaler = StandardScaler()

    # fit on train only, then apply the same transform to test
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    y_train = _clamp_and_rescale_target(y_train)
    y_test = _clamp_and_rescale_target(y_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

In [None]:
# Reduced feature set; this rebinding replaces the longer list defined earlier.
# Currently disabled candidates: 'cumulative_distance_travelled', the *_lag1..3
# columns of speed_1s / acceleration_1s / distance_to_leader, 'remaining_distance',
# and 'curve' (from the updated column).
feature_columns = [
    'position',
    'distance_to_leader',
    'speed_1s',
    'acceleration_1s',
    'leader_remaining_distance',
]

# One-element list so `df[target]` yields a single-column DataFrame.
target = ['target_variable']
def fetch_clean_data(folder_path):
    """Run the whole data-preparation chain for the CSVs under ``folder_path``.

    Steps, in order: list the files, split them into train/test, load both
    subsets, select feature/target columns, then scale.

    Returns
    -------
    (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
        Exactly what ``scale_data`` returns.
    """
    csv_paths, _ = get_files(folder_path)
    train_paths, test_paths = train_test_file_split(csv_paths)
    train_frame, test_frame = get_dataframes(train_paths, test_paths)
    column_splits = get_X_y_splits(train_frame, test_frame)
    X_train_scaled, X_test_scaled, y_train, y_test, scaler = scale_data(*column_splits)
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler


### Model Training

In [None]:
def train_it(model, X_train_scaled, y_train):
    """Fit ``model`` on the given features/target and hand the same object back.

    The return value of ``model.fit`` is ignored; the caller always gets ``model``.
    """
    model.fit(X_train_scaled, y_train)

    return model

def predict_this(model, X):
    """Predict with ``model`` and clamp the results.

    Returns
    -------
    pandas.Series
        Named ``pred``. Predictions below 0 become 0, predictions >= 100 become
        99.9999, and all other values pass through unchanged.
    """
    def _clamp(value):
        if value < 0:
            return 0
        if value >= 100:
            return 99.9999
        return value

    holder = pd.DataFrame()
    holder['pred'] = model.predict(X)
    return holder['pred'].apply(_clamp)

def get_rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``:
    that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
    raises ``TypeError``. This form works on old and new versions alike.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
def training_arc(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Fit ``model``, score it on both splits and report the RMSEs.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` / ``predict``.
    X_train_scaled, X_test_scaled
        Feature matrices for each split.
    y_train, y_test : pandas.DataFrame
        Must contain a ``target_variable`` column.

    Returns
    -------
    (model, train_rmse, test_rmse)
    """
    fitted = train_it(model, X_train_scaled, y_train)

    # predictions first (train, then test), errors afterwards
    train_pred = predict_this(fitted, X_train_scaled)
    test_pred = predict_this(fitted, X_test_scaled)

    train_rmse = get_rmse(y_train['target_variable'], train_pred)
    test_rmse = get_rmse(y_test['target_variable'], test_pred)

    print("Train RMSE: ", train_rmse)
    print("Test RMSE: ", test_rmse)

    return fitted, train_rmse, test_rmse
    

### Fast Fast

In [None]:
# Get Data
# X_train_scaled, X_test_scaled, y_train, y_test = fetch_clean_data(folder_path)

In [None]:
# Unfitted regressor with the same hyper-parameters pipeline() uses. The name `model`
# is rebound further down to the fitted model returned by pipeline(folder_path).
model = XGBRegressor(n_estimators=100, learning_rate=0.01, random_state=42, max_depth = 5)
# Alternative estimator, currently disabled:
# model = CatBoostRegressor(n_estimators=100, learning_rate=0.1, loss_function='rmse', depth= 10, verbose=True)

In [None]:
# Do bernoulli here



def plotter(win_prob, rank_, remain_, race_name, save_folder):
    """Save a three-panel line chart for one race as ``<race file stem>.png``.

    Parameters
    ----------
    win_prob, rank_, remain_ : pandas.DataFrame
        Wide frames indexed by ``trakus_index`` with one column per program
        number. They are drawn, left to right, as win probability, position and
        distance to leader.
    race_name : str
        Path or name of the race file; used for the figure title and, with
        directory and extension removed, for the output file name.
    save_folder : str
        Directory the PNG is written to (it must already exist).

    Returns
    -------
    None
    """
    # basename/splitext use the platform's path rules instead of splitting on "/".
    base_name = os.path.splitext(os.path.basename(race_name))[0]
    save_path = os.path.join(save_folder, f"{base_name}.png")

    # (wide frame, melted value column, panel title, y-axis label)
    panels = [
        (win_prob, 'win_probability', "Win Probabilities Over Time", "Win Probability"),
        (rank_, 'position', "Positions Over Time", "Position"),
        (remain_, 'distance_to_leader', "Distance to Leader Over Time", "Distance to Leader"),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(24, 8), sharex=True, sharey=False)
    try:
        for ax, (wide_df, value_name, title, y_label) in zip(axes, panels):
            # wide -> long: one row per (trakus_index, program_number)
            long_df = wide_df.reset_index().melt(
                id_vars='trakus_index',
                var_name='program_number',
                value_name=value_name,
            )
            sns.lineplot(
                data=long_df,
                x='trakus_index',
                y=value_name,
                hue='program_number',
                ax=ax,
            )
            ax.set_title(title)
            ax.set_xlabel("Time (Trakus Index)")
            ax.set_ylabel(y_label)
            ax.legend(title="Program Number")

        fig.suptitle(race_name)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure: this runs once per race inside a loop, and
        # a failure while plotting or saving would otherwise leak it.
        plt.close(fig)

def monte_carlo_horse_race_simulation(remain_, pred_, rank_):
    """Estimate per-timestep win probabilities with Poisson rank-gain draws.

    For every (timestep, horse) cell the current rank is reduced by Poisson draws
    with mean ``abs(remain * pred)``; the share of draws that land exactly on
    rank 1 is the raw win score. Scores are then normalised so each timestep
    sums to 1.

    Parameters
    ----------
    remain_, pred_, rank_ : pandas.DataFrame
        Wide frames sharing the same index (timesteps) and columns (horses).
        A negative ``remain_`` value marks the horse as finished (raw score 1).

    Returns
    -------
    pandas.DataFrame
        Float frame shaped like ``pred_``. Cells whose inputs are missing or
        cannot be simulated get a raw score of 0. A timestep whose raw scores
        are all 0 normalises to NaN.
    """
    n_simulations = 100
    size = 100  # draws per simulation round
    # The rounds were independent and only averaged, so draw them all at once.
    total_draws = n_simulations * size

    win_prob = pd.DataFrame(0.0, index=pred_.index, columns=pred_.columns)

    for i, row in pred_.iterrows():
        for fp in pred_.columns:
            remaining = remain_.loc[i, fp]

            # Horse has finished
            if remaining < 0:
                win_prob.loc[i, fp] = 1.0
                continue

            lambda_ = abs(remaining * row[fp])  # expected rank gain
            if not np.isfinite(lambda_):
                continue  # missing data: leave the raw score at 0

            try:
                fp_rank_diff = np.random.poisson(lambda_, total_draws)
            except ValueError:
                # lambda outside the range NumPy accepts; never fall back to draws
                # left over from another horse.
                continue

            # First to reach rank 1 counts as a win
            pred_rank = rank_.loc[i, fp] - fp_rank_diff
            win_prob.loc[i, fp] = np.mean(pred_rank == 1)

    # Normalize probabilities across all horses
    win_prob = win_prob.divide(win_prob.sum(axis=1), axis=0)

    return win_prob

def monte_carlo_wrapper(race_data):
    """Build the per-race pivots and run the Monte Carlo win simulation.

    Parameters
    ----------
    race_data : pandas.DataFrame
        One race, already scored by the model: needs ``trakus_index``,
        ``program_number``, ``position``, ``leader_remaining_distance`` and
        ``target_variable``. The frame is not modified.

    Returns
    -------
    (win_prob, rank_) : tuple[pandas.DataFrame, pandas.DataFrame]
        Simulated win probabilities and the pivoted positions, both indexed by
        ``trakus_index`` with one column per program number.
    """
    # Sort into a new frame rather than in place so the caller's data is untouched.
    race_data = race_data.sort_values('trakus_index').reset_index()

    # Program numbers as stripped strings, so labels are uniform column names.
    race_data['program_number'] = race_data['program_number'].astype(str).str.strip()

    rank_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='position')
    remain_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='leader_remaining_distance')
    pred_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='target_variable')

    win_prob = monte_carlo_horse_race_simulation(remain_, pred_, rank_)
    return win_prob, rank_

def bernoulli_race(race_data):
    """Estimate per-timestep win probabilities for one race with binomial draws.

    For every (timestep, horse) cell, ``size`` binomial samples are drawn with
    ``n = int(leader_remaining_distance)`` and ``p = target_variable``. Each
    sample is subtracted from the horse's current position, and the share of
    samples that land exactly on position 1 is the raw win score. Scores are
    then normalised so each timestep sums to 1.

    Parameters
    ----------
    race_data : pandas.DataFrame
        One race, already scored by the model: needs ``trakus_index``,
        ``program_number``, ``position``, ``leader_remaining_distance`` and
        ``target_variable`` (a probability). The frame is not modified.

    Returns
    -------
    (win_prob, rank_) : tuple[pandas.DataFrame, pandas.DataFrame]
        Both indexed by ``trakus_index`` with one column per (stripped, string)
        program number. Cells whose inputs are missing or invalid get a raw score
        of 0. A timestep whose raw scores are all 0 normalises to NaN.
    """
    # Sort into a new frame rather than in place so the caller's data is untouched.
    race_data = race_data.sort_values('trakus_index').reset_index()

    # Program numbers as stripped strings, so labels are uniform column names.
    race_data['program_number'] = race_data['program_number'].astype(str).str.strip()

    size = 1000  # binomial samples per (timestep, horse)
    rank_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='position')
    remain_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='leader_remaining_distance')
    pred_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='target_variable')

    win_prob = pd.DataFrame(0.0, index=pred_.index, columns=pred_.columns)

    for i, row in pred_.iterrows():
        for fp in pred_.columns:
            remaining = remain_.loc[i, fp]

            # Negative remaining distance: treated as finished.
            if remaining < 0:
                win_prob.loc[i, fp] = 1.0
                continue

            success_prob = row[fp]
            if not (np.isfinite(remaining) and np.isfinite(success_prob)):
                continue  # horse has no data at this timestep: raw score stays 0

            try:
                # n must be an integer count; truncate explicitly.
                fp_rank_diff = np.random.binomial(int(remaining), success_prob, size)
            except ValueError:
                # p outside [0, 1] or similar; never fall back to draws left over
                # from another horse.
                continue

            pred_rank = rank_.loc[i, fp] - fp_rank_diff
            win_prob.loc[i, fp] = np.mean(pred_rank == 1)

    # norm
    win_prob = win_prob.divide(win_prob.sum(axis=1), axis=0)

    return win_prob, rank_



def bernoulli_super(file_list, model, scalar,save_folder):
    """Score, simulate and plot every race file in ``file_list``.

    For each CSV: keep the rows flagged by ``is_race_going``, scale the feature
    columns with ``scalar``, predict the target with ``model``, turn the
    prediction into a probability, run ``bernoulli_race`` and save the plots
    through ``plotter``.

    Parameters
    ----------
    file_list : list[str]
        Paths of the race CSV files.
    model
        Fitted estimator exposing ``predict``; its output is on the 0-100 scale
        that ``scale_data`` gives the training target.
    scalar
        Fitted scaler exposing ``transform``; it must be the scaler that was
        fitted together with ``model``.
    save_folder : str
        Output directory for the PNG files; created if missing.
    """
    os.makedirs(save_folder, exist_ok=True)

    for file in tqdm(file_list):
        race_data = pd.read_csv(file)
        # .copy(): the assignments below must write to an independent frame, not
        # to a slice of the frame read from disk.
        race_data = race_data[race_data['is_race_going']].copy()
        if race_data.empty:
            continue  # nothing to score or plot for this file

        # Normalise the horse labels once, here, so the distance-to-leader pivot
        # below gets the same column labels bernoulli_race produces for
        # win_prob / rank_.
        race_data['program_number'] = race_data['program_number'].astype(str).str.strip()

        race_features = scalar.transform(race_data[feature_columns])
        predicted = np.asarray(model.predict(race_features), dtype=float)
        # Same clamp as predict_this: <0 -> 0, >=100 -> 99.9999; then back to [0, 1).
        predicted = np.where(predicted < 0, 0.0, np.where(predicted >= 100, 99.9999, predicted))
        race_data['target_variable'] = predicted / 100

        # pass race_data to bernoulli race function
        win_prob, rank_ = bernoulli_race(race_data)
        # win_prob, rank_ = monte_carlo_wrapper(race_data)

        remain_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='distance_to_leader')

        # plot probabilities for that race
        plotter(win_prob, rank_, remain_, file, save_folder)


In [None]:
def pipeline(folder_path):
    """Prepare the data under ``folder_path`` and train an XGBoost regressor on it.

    Returns
    -------
    (model, scaler)
        The fitted regressor and the scaler produced by ``fetch_clean_data``.
        The RMSE values from ``training_arc`` are printed there and not returned.
    """
    X_train_scaled, X_test_scaled, y_train, y_test, scaler = fetch_clean_data(folder_path)

    regressor = XGBRegressor(
        n_estimators=100,
        learning_rate=0.01,
        random_state=42,
        max_depth=5,
    )
    fitted, _train_rmse, _test_rmse = training_arc(
        regressor, X_train_scaled, X_test_scaled, y_train, y_test
    )

    return fitted, scaler



In [None]:
model,scalar = pipeline(folder_path)

In [None]:
# Rebuild the prepared splits for inspection only. The scaler from this call is
# discarded on purpose: `scalar` must stay bound to the one returned by pipeline(),
# i.e. the scaler fitted together with `model`, because bernoulli_super() applies
# the two as a pair.
X_train_scaled, X_test_scaled, y_train, y_test, _unused_scaler = fetch_clean_data(folder_path)


# Mean and standard deviation of the prepared target, for train and then test.
y_train.mean(), y_train.std(), y_test.mean(), y_test.std()

# get average value of y_test['target_variable'] and get std dev





In [None]:
file_list, race_list = get_files(folder_path)

In [None]:
# NOTE(review): only the last expression of a cell is displayed, so this bare len()
# produces no output.
len(file_list)
# NOTE(review): train_test_file_split shuffles with the global `random` state, which
# the earlier pipeline()/fetch_clean_data() calls have already advanced. This split is
# therefore not guaranteed to match the one the model was trained on — confirm that
# test_set really is held-out data before trusting the plots produced from it.
train_set, test_set= train_test_file_split(file_list)
len(test_set)

In [None]:
test_set = test_set[:10]

In [None]:
bernoulli_super(test_set, model, scalar,save_folder="output-plots/orig-5-features-xgboost")

In [26]:
# 'weight'-type importance scores reported by the fitted model's booster,
# shown as a one-column table sorted from lowest to highest score.
feature_important = model.get_booster().get_score(importance_type='weight')
keys = list(feature_important)
values = [feature_important[name] for name in keys]
f_imp_df = pd.DataFrame({"score": values}, index=keys).sort_values("score")
f_imp_df

Unnamed: 0,score
acceleration_1s,32.0
distance_to_leader,429.0
speed_1s,540.0
position,873.0
leader_remaining_distance,1197.0


In [1]:
file_list[0]

NameError: name 'file_list' is not defined

In [None]:
df = pd.DataFrame(file_list)
indx = df[df[0]== 'data/horse-tracking-data/AQU_2019-01-20_1.csv'].index
indx

In [None]:
race_data = pd.read_csv(file_list[0])
race_data = race_data[race_data['is_race_going']]

In [None]:
# Rows of the first tracked timestep. The mask is applied to the frame it was built
# from; indexing the *sorted* frame with a mask in the unsorted order makes pandas
# reindex the mask and warn. Sorting is unnecessary here because every selected row
# has the same trakus_index.
race_data[race_data['trakus_index'] == 1]

In [None]:
race_data[feature_columns].info()

In [None]:
# race_features = scalar.transform(race_data[feature_columns])
# race_features

race_data['target_variable'].mean()

In [None]:
# race_data['target_variable'] = model.predict(race_features)
race_data['target_variable'] = race_data['target_variable'].apply(lambda x : 0 if x<0 else 0.999999 if x>=1 else x)


In [None]:
race_data

In [None]:
race_data.sort_values('trakus_index', inplace=True)
race_data = race_data.reset_index()

In [None]:
race_data.columns

In [None]:
race_data['program_number'] = race_data['program_number'].astype(str).str.strip()


In [None]:
# Bernoulli
size = 1000
rank_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='position')
remain_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='leader_remaining_distance')
pred_ = pd.pivot_table(race_data, index='trakus_index', columns='program_number', values='target_variable')


In [None]:
rank_.columns

In [None]:
win_prob = pred_.copy() 

In [None]:
# Script version of the binomial win simulation: per timestep and horse, draw `size`
# samples, subtract them from the current position, and record the share of samples
# that land exactly on position 1.
size = 1000
for i, row in pred_.iterrows():
    row_fractions = {}
    for fp in pred_.columns:
        remaining = remain_.loc[i, fp]
        if remaining < 0:
            # negative remaining distance: every sample counts as position 1
            row_fractions[fp] = 1.0
            continue
        draws = np.random.binomial(remaining, row[fp], size)
        simulated_position = rank_.loc[i, fp] - draws
        row_fractions[fp] = (simulated_position == 1).sum() / size
    # write the row only after all of its draws succeeded
    for fp, fraction in row_fractions.items():
        win_prob.loc[i, fp] = fraction

# norm: make each timestep sum to 1
row_totals = win_prob.sum(axis=1)
win_prob = win_prob.divide(row_totals, axis=0)

In [None]:
win_prob

In [None]:
# Row totals as a standalone Series. Storing them as a `total_prob` column inside
# win_prob would make the plots below draw "total_prob" as if it were a horse, and
# would change the totals every time this cell is re-run.
total_prob = win_prob.sum(axis=1)
total_prob

In [None]:
win_prob.iloc[-1]

In [None]:
# Long format for Plotly: one row per (trakus_index, program_number) pair.
df_melted = win_prob.reset_index().melt(
    id_vars='trakus_index', var_name='program_number', value_name='win_probability'
)

# Interactive line chart with one trace per program number (dark theme).
fig = px.line(
    df_melted, x='trakus_index', y='win_probability', color='program_number',
    title='Win Probabilities Over Time',
    labels={'trakus_index': 'Time (Trakus Index)', 'win_probability': 'Win Probability'},
    template='plotly_dark',
)

# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(
    title_font_size=20,
    legend_title_text='Program Number',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

In [None]:
# Same win-probability chart as the previous cell (identical code).
# Long format for Plotly: one row per (trakus_index, program_number) pair.
df_melted = win_prob.reset_index().melt(
    id_vars='trakus_index', var_name='program_number', value_name='win_probability'
)

# Interactive line chart with one trace per program number (dark theme).
fig = px.line(
    df_melted, x='trakus_index', y='win_probability', color='program_number',
    title='Win Probabilities Over Time',
    labels={'trakus_index': 'Time (Trakus Index)', 'win_probability': 'Win Probability'},
    template='plotly_dark',
)

# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(
    title_font_size=20,
    legend_title_text='Program Number',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

In [None]:
# Same win-probability chart again (identical code to the two cells above).
# Long format for Plotly: one row per (trakus_index, program_number) pair.
df_melted = win_prob.reset_index().melt(
    id_vars='trakus_index', var_name='program_number', value_name='win_probability'
)

# Interactive line chart with one trace per program number (dark theme).
fig = px.line(
    df_melted, x='trakus_index', y='win_probability', color='program_number',
    title='Win Probabilities Over Time',
    labels={'trakus_index': 'Time (Trakus Index)', 'win_probability': 'Win Probability'},
    template='plotly_dark',
)

# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(
    title_font_size=20,
    legend_title_text='Program Number',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

In [None]:
# Long format for Plotly: one row per (trakus_index, program_number) pair.
df_melted = rank_.reset_index().melt(
    id_vars='trakus_index', var_name='program_number', value_name='position'
)

# Interactive line chart of each program number's position (dark theme).
fig = px.line(
    df_melted, x='trakus_index', y='position', color='program_number',
    title='Positions Over Time',
    labels={'trakus_index': 'Time (Trakus Index)', 'position': 'Position'},
    template='plotly_dark',
)

# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(
    title_font_size=20,
    legend_title_text='Program Number',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

In [None]:

# Rebind remain_ to the distance-to-leader pivot (it previously held
# leader_remaining_distance), then reshape it to long format for Plotly.
remain_ = pd.pivot_table(
    race_data, index='trakus_index', columns='program_number', values='distance_to_leader'
)
df_melted = remain_.reset_index().melt(
    id_vars='trakus_index', var_name='program_number', value_name='distance_to_leader'
)

# Interactive line chart of each program number's distance to the leader (dark theme).
fig = px.line(
    df_melted, x='trakus_index', y='distance_to_leader', color='program_number',
    title='Distance to Leader over time',
    labels={'trakus_index': 'Time (Trakus Index)', 'distance_to_leader': 'Distance to Leader'},
    template='plotly_dark',
)

# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(
    title_font_size=20,
    legend_title_text='Program Number',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

In [None]:
1, 10, 11, 2