In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import plotly.express as px

In [2]:
column_names = [
    "track_id",         # Track ID
    "race_date",        # Race Date
    "race_number",      # Race Number
    "program_number",   # Program Number
    "weight_carried",   # Weight Carried
    "jockey",           # Jockey Name
    "odds",             # Odds
    "position_at_finish"  # Finish Position
]

df_start = pd.read_csv("big-data-derby-2022\\nyra_start_table.csv", header=None, names=column_names)
df_race = pd.read_csv('big-data-derby-2022\\nyra_race_table.csv')
df_tracking = pd.read_csv('my-data\\nyra_tracking_table_utm.csv')
df_complete = pd.read_csv('big-data-derby-2022\\nyra_2019_complete.csv')

  df_tracking = pd.read_csv('my-data\\nyra_tracking_table_utm.csv')
  df_complete = pd.read_csv('big-data-derby-2022\\nyra_2019_complete.csv')


In [3]:
# add unique_id 
df_race['unique_id'] = df_race["track_id"] + "_" + df_race["race_date"] + "_" + df_race["race_number"].astype(str)
df_tracking['unique_id'] = df_tracking["track_id"] + "_" + df_tracking['race_date'] + "_" + df_tracking['race_number'].astype(str)

In [4]:
track_ids = df_race['unique_id'].unique()

In [5]:
def generate_track_geometry(df_race_data):
    # Group by trakus_index and compute the average easting/northing for the track geometry
    track_points = (
        df_race_data.groupby('trakus_index')[['easting', 'northing']].mean().reset_index()
    )

    # Calculate midpoints and perpendicular vectors
    midpoints = []
    perpendicular_vectors = []

    for i in range(len(track_points) - 1):
        # Get consecutive points
        x1, y1 = track_points.loc[i, ['easting', 'northing']]
        x2, y2 = track_points.loc[i + 1, ['easting', 'northing']]

        # Calculate midpoint
        x_mid = (x1 + x2) / 2
        y_mid = (y1 + y2) / 2
        midpoints.append((x_mid, y_mid))

        # Calculate direction vector
        delta_x = x2 - x1
        delta_y = y2 - y1

        # Calculate perpendicular vector
        perp_x = -delta_y
        perp_y = delta_x
        magnitude = np.sqrt(perp_x**2 + perp_y**2)
        perp_vector = (perp_x / magnitude, perp_y / magnitude)  # Normalize
        perpendicular_vectors.append(perp_vector)

    # Convert to DataFrame for easy use
    midpoints_df = pd.DataFrame(midpoints, columns=['mid_easting', 'mid_northing'])
    perp_vectors_df = pd.DataFrame(perpendicular_vectors, columns=['perp_x', 'perp_y'])

    # Combine midpoints and perpendicular vectors
    track_geometry = pd.concat([midpoints_df, perp_vectors_df], axis=1)
    return track_geometry

In [6]:
# Function to calculate angles between consecutive vectors
def calculate_angle(v1, v2):
    # Normalize vectors
    v1 = v1 / np.linalg.norm(v1)
    v2 = v2 / np.linalg.norm(v2)
    # Dot product and arccos to calculate angle (in radians)
    dot_product = np.dot(v1, v2)
    angle = np.arccos(np.clip(dot_product, -1.0, 1.0))  # Clip for numerical stability
    return np.degrees(angle)  # Convert to degrees

def generate_angles(track_geometry):
    # Add angle calculation to the dataframe
    angles = []
    for i in range(1, len(track_geometry)):
        v1 = np.array([track_geometry.loc[i - 1, 'perp_x'], track_geometry.loc[i - 1, 'perp_y']])
        v2 = np.array([track_geometry.loc[i, 'perp_x'], track_geometry.loc[i, 'perp_y']])
        angle = calculate_angle(v1, v2)
        angles.append(angle)

    angles.insert(0, 0)  # First row has no angle (NaN equivalent)
    track_geometry['angle_change'] = angles
    return track_geometry

def generate_transition_indices(track_geometry, start_threshold=1, end_threshold=1):
    # Detect transitions
    in_transition = False
    transition_indices = []
    for i in range(1, len(track_geometry)):
        angle = track_geometry['angle_change'][i]

        if angle > start_threshold and not in_transition:
            # Transition starts
            in_transition = True
            transition_indices.append(i)  # Mark start of transition

        elif angle < end_threshold and in_transition:
            # Transition ends
            in_transition = False
            transition_indices.append(i)  # Mark end of transition

    return transition_indices

In [122]:
def calculate_euclidean_distance(p1, p2):
    return np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)

def generate_cum_distance(track_geometry):
    track_geometry['cumulative_distance'] = 0.0
    for i in range(1, len(track_geometry)):
        prev_point = (track_geometry.loc[i - 1, 'mid_easting'], track_geometry.loc[i - 1, 'mid_northing'])
        curr_point = (track_geometry.loc[i, 'mid_easting'], track_geometry.loc[i, 'mid_northing'])

        track_geometry.loc[i, 'cumulative_distance'] = track_geometry.loc[i - 1, 'cumulative_distance'] + \
            calculate_euclidean_distance(prev_point, curr_point)
    return track_geometry

In [89]:
def calculate_line_equation(mid_easting, mid_northing, perp_x, perp_y):
    A = perp_y
    B = -perp_x
    C = -(A * mid_easting + B * mid_northing)
    return A, B, C

In [90]:
def point_to_line_distance_and_angle(x, y, A, B, C):
    distance = abs(A * x + B * y + C) / math.sqrt(A**2 + B**2)
    sign = 1 if A * x + B * y + C >= 0 else -1
    # print(f"Distance {distance} and sign {sign}")
    return distance, sign

In [91]:
def add_line_equations(track_geometry):
    track_geometry['A'], track_geometry['B'], track_geometry['C'] = zip(*track_geometry.apply(
        lambda row: calculate_line_equation(row['mid_easting'], row['mid_northing'], row['perp_x'], row['perp_y']),
        axis=1
    ))
    return track_geometry

In [92]:
def add_start_line(track_geometry, horse_tracking_data):
    trakus_index = 1
    horse_tracking_data_at_1 = horse_tracking_data[horse_tracking_data['trakus_index'] == trakus_index]
    max_dist = 0
    max_dist_point = None
    A, B, C = track_geometry.loc[0, ['A', 'B', 'C']]

    for i, row in horse_tracking_data_at_1.iterrows():
        horse_pos = (row['easting'], row['northing'])
        dist, sign = point_to_line_distance_and_angle(horse_pos[0], horse_pos[1], A, B, C)
        if dist > max_dist and sign == -1:
            max_dist = dist
            max_dist_point = horse_pos
    # print(max_dist, max_dist_point)
    newA, newB, newC = calculate_line_equation(max_dist_point[0], max_dist_point[1], track_geometry.loc[0, 'perp_x'], track_geometry.loc[0, 'perp_y'])
    perp_x = track_geometry.loc[0, 'perp_x']
    perp_y = track_geometry.loc[0, 'perp_y']
    # add the new line to the track geometry as the first row
    track_geometry = pd.concat([pd.DataFrame([[0, 0, perp_x, perp_y, 0, max_dist, newA, newB, newC]], columns=track_geometry.columns), track_geometry], ignore_index=True)
    track_geometry['cumulative_distance'] = track_geometry['cumulative_distance'] + track_geometry.loc[0, 'cumulative_distance']
    track_geometry.loc[0, 'cumulative_distance'] = 0
    return track_geometry

In [93]:
def generate_race_ending_index(track_geometry, df_race_details, unique_id):
    df_race_details = df_race_details[df_race_details['unique_id'] == unique_id]
    furlongs = df_race_details['distance_id'].values[0]
    distance = (furlongs/100)*201.168
    for i in range(1, len(track_geometry)):
        if track_geometry.loc[i, 'cumulative_distance'] > distance:
            return i, distance
        else:
            return len(track_geometry)-1, distance

In [134]:
# Function to calculate cumulative distance travelled
def calculate_cumulative_distance(horse_tracking_data, track_geometry):
    horse_tracking_data['cumulative_distance_travelled'] = 0.0  # Initialize column
    
    # Group horse data by unique program_number
    horse_groups = horse_tracking_data.groupby('program_number')
    # print(f"Processing {len(horse_groups)} unique horses.")
    
    for program_number, horse_data in horse_groups:
        # print(f"Processing horse: {program_number}")
        
        # Sort horse data by trakus_index
        horse_data = horse_data.sort_values(by='trakus_index').reset_index(drop=True)
        
        last_crossed_index = 0  # Start with the first line
        cumulative_distance = 0.0  # Initialize cumulative distance

        for i, row in horse_data.iterrows():
            horse_pos = (row['easting'], row['northing'])
            
            # Limit the search to lines from index to index+10
            search_start = max(0, last_crossed_index)
            search_end = min(len(track_geometry), search_start + 5)
            
            closest_line_idx = None
            closest_distance = float('inf')
            closest_sign = 0
            
            # print(f"Row {i} calculation")
            # Iterate through the relevant track geometry lines
            for j in range(search_start, search_end):
                track_row = track_geometry.iloc[j]
                A, B, C = track_row['A'], track_row['B'], track_row['C']
                # print(f"Track row {j}")
                dist, sign = point_to_line_distance_and_angle(horse_pos[0], horse_pos[1], A, B, C)
                # print(f"Distance {dist} and {sign}")
                if dist < closest_distance and sign == 1:
                    closest_distance = dist
                    closest_line_idx = j
                    closest_sign = sign
                    # print(f"Closest distance is {closest_distance} at {closest_line_idx} with sign {closest_sign}")
                    
            # Use the cumulative distance of the last crossed line from track_geometry
            if closest_line_idx:
                last_crossed_index = closest_line_idx
                cumulative_distance = track_geometry.iloc[last_crossed_index]['cumulative_distance']
                total_distance = cumulative_distance + closest_distance
            else:
                cumulative_distance = track_geometry.iloc[last_crossed_index]['cumulative_distance']
                total_distance = cumulative_distance
            # print(f"Total distance is {total_distance} at {last_crossed_index} with distance {closest_distance} with cum dist {cumulative_distance}")
            # Update horse_tracking_data
            horse_tracking_data.loc[
                (horse_tracking_data['program_number'] == program_number) & 
                (horse_tracking_data['trakus_index'] == row['trakus_index']),
                'cumulative_distance_travelled'
            ] = total_distance
            # print(f"Trakus index {row['trakus_index']}: Distance = {total_distance:.2f} meters")
    return horse_tracking_data


In [102]:
def generate_positions(horse_tracking_data):
    horse_tracking_data['position'] = (
        horse_tracking_data.groupby('trakus_index')['cumulative_distance_travelled']
        .rank(ascending=False, method='first')
        .astype(int)
    )
    return horse_tracking_data

In [103]:
def find_finish_line_trakus_index(horse_tracking_data, finish_line_distance):
    finish_line_crossings = horse_tracking_data[
        horse_tracking_data['cumulative_distance_travelled'] >= finish_line_distance
    ]

    if not finish_line_crossings.empty:
        finish_trakus_index = finish_line_crossings['trakus_index'].min()
        return finish_trakus_index
    else:
        return None 

In [104]:
def generate_final_positions(horse_tracking_data, finish_trakus_index):
    final_positions = horse_tracking_data[horse_tracking_data['trakus_index'] == finish_trakus_index]
    final_positions = final_positions.sort_values(by='cumulative_distance_travelled', ascending=False).reset_index(drop=True)
    position_dict = {position + 1: row['program_number'] for position, row in final_positions.iterrows()}
    return position_dict

In [191]:
def generate_final_positions_alternate(horse_tracking_data, race_distance):
    """
    Generate the final positions of horses based on the first trakus index at which they crossed 
    the finish line. If multiple horses crossed at the same index, use cumulative distance as a tiebreaker.

    Args:
    horse_tracking_data (DataFrame): A DataFrame containing horse tracking data with columns
                                      ['program_number', 'trakus_index', 'cumulative_distance_travelled', 'position'].
    race_distance (float): The total race distance.

    Returns:
    dict: A dictionary mapping final positions (1, 2, 3, ...) to program numbers of the horses.
    """
    # Track the first instance where each horse crosses the finish line
    finishers = []
    for program_number, horse_data in horse_tracking_data.groupby('program_number'):
        # Find the first instance where the horse crosses the finish line
        first_crossing = horse_data[horse_data['cumulative_distance_travelled'] >= race_distance].head(1)
        if not first_crossing.empty:
            finishers.append(first_crossing.iloc[0])

    # Convert to DataFrame for sorting
    finishers_df = pd.DataFrame(finishers)

    # Sort by trakus_index (ascending) and cumulative_distance_travelled (descending for tiebreakers)
    finishers_df = finishers_df.sort_values(by=['trakus_index', 'cumulative_distance_travelled'], ascending=[True, False])

    # Assign positions based on the sorted order
    position_dict = {}
    for position, row in enumerate(finishers_df.itertuples(index=False), start=1):
        position_dict[position] = row.program_number

    return position_dict


In [105]:
def big_final_function(track_id):
    horse_tracking_data = df_tracking[df_tracking['unique_id'] == track_id]
    horse_tracking_data = horse_tracking_data.copy()
    track_geometry = generate_track_geometry(horse_tracking_data)
    track_geometry = generate_angles(track_geometry)
    transition_indices = generate_transition_indices(track_geometry)
    track_geometry = generate_cum_distance(track_geometry)
    track_geometry = add_line_equations(track_geometry)
    horse_tracking_data = horse_tracking_data.sort_values(by=['program_number', 'trakus_index']).reset_index(drop=True)
    track_geometry = add_start_line(track_geometry, horse_tracking_data)
    index, race_distance = generate_race_ending_index(track_geometry, df_race, track_id)
    horse_tracking_data = calculate_cumulative_distance(horse_tracking_data, track_geometry)
    horse_tracking_data = generate_positions(horse_tracking_data)
    finish_trakus_index = find_finish_line_trakus_index(horse_tracking_data, race_distance)
    final_positions = generate_final_positions(horse_tracking_data, finish_trakus_index)
    return horse_tracking_data, track_geometry, final_positions, transition_indices

In [192]:
def calculate_final_positions(track_id, horse_tracking_data, track_geometry, df_race):
    distance, race_distance = generate_race_ending_index(track_geometry, df_race, track_id)
    # finish_trakus_index = find_finish_line_trakus_index(horse_tracking_data, race_distance)
    final_positions = generate_final_positions_alternate(horse_tracking_data, race_distance)
    return final_positions

In [106]:
track_ids[0]

'AQU_2019-01-01_1'

In [111]:
race_dict = {}

for _, row in df_start.iterrows():
    # Construct the unique race key
    race_key = f"{row['track_id']}_{row['race_date']}_{row['race_number']}"
    
    # Initialize nested dictionary for the race if it doesn't exist
    if race_key not in race_dict:
        race_dict[race_key] = {}
    
    # Assign the program number to the position in the race dictionary
    race_dict[race_key][row['position_at_finish']] = row['program_number']

In [112]:
race_data = []
for race_key, positions in race_dict.items():
    # Append a dictionary for each race_key with positions as a dictionary of {position: program_number}
    race_data.append({'race_key': race_key, 'positions': positions})

# Create DataFrame
df_race_positions = pd.DataFrame(race_data)

In [140]:
df_race_positions.head()

Unnamed: 0,race_key,positions,calculated_positions
0,AQU_2019-01-01_1,"{2: '1 ', 3: '2 ', 4: '3 ', 5: '4 ', 1: '5...",
1,AQU_2019-01-01_2,"{2: '1 ', 7: '2 ', 4: '3 ', 5: '4 ', 3: '5...",6.0
2,AQU_2019-01-01_3,"{2: '1 ', 8: '2 ', 1: '3 ', 3: '4 ', 7: '5...",3.0
3,AQU_2019-01-01_4,"{1: '1 ', 4: '1A ', 2: '3 ', 5: '4 ', 6: '5...",6.0
4,AQU_2019-01-01_5,"{3: '1 ', 6: '2 ', 7: '3 ', 4: '5 ', 1: '6...",5.0


In [117]:
# path to save tracking geometry and horse_tracking_data
track_geometry_path = "my-data\\track-geometry\\"
horse_tracking_data_path = "my-data\\horse-tracking-data\\"

In [114]:
from tqdm import tqdm

In [136]:
df_race_positions.head()

Unnamed: 0,race_key,positions,calculated_positions
0,AQU_2019-01-01_1,"{2: '1 ', 3: '2 ', 4: '3 ', 5: '4 ', 1: '5...",
1,AQU_2019-01-01_2,"{2: '1 ', 7: '2 ', 4: '3 ', 5: '4 ', 3: '5...",6.0
2,AQU_2019-01-01_3,"{2: '1 ', 8: '2 ', 1: '3 ', 3: '4 ', 7: '5...",3.0
3,AQU_2019-01-01_4,"{1: '1 ', 4: '1A ', 2: '3 ', 5: '4 ', 6: '5...",6.0
4,AQU_2019-01-01_5,"{3: '1 ', 6: '2 ', 7: '3 ', 4: '5 ', 1: '6...",5.0


In [None]:
track_geometry_path = "my-data\\track-geometry\\"
horse_tracking_data_path = "my-data\\horse-tracking-data\\"

transition_indices_dict = {}
for i, track_id in tqdm(enumerate(track_ids)):
    if i < 1667:
        continue
    try:
        horse_tracking_data, track_geometry, final_positions, transition_indices = big_final_function(track_id)
        
        if horse_tracking_data is not None and track_geometry is not None:
            # Save DataFrames to CSV
            track_geometry.to_csv(track_geometry_path + track_id + ".csv", index=False)
            horse_tracking_data.to_csv(horse_tracking_data_path + track_id + ".csv", index=False)
        
        if final_positions is not None:
            # Update DataFrame with final positions
            df_race_positions.loc[df_race_positions['race_key'] == track_id, 'calculated_positions'] = str(final_positions)
        
        # Store transition indices
        transition_indices_dict[track_id] = transition_indices
    except:
        print(f"Track ID {track_id} skipped")


In [139]:
horse_tracking_data.head()

Unnamed: 0,track_id,race_date,race_number,program_number,trakus_index,latitude,longitude,easting,northing,unique_id,cumulative_distance_travelled,position
0,SAR,2019-09-02,11,6,1,43.068787,-73.77181,599996.557709,4769185.0,SAR_2019-09-02_11,0.0,2
1,SAR,2019-09-02,11,6,2,43.068803,-73.77177,599999.740701,4769187.0,SAR_2019-09-02_11,9.701116,3
2,SAR,2019-09-02,11,6,3,43.068819,-73.771731,600002.869693,4769189.0,SAR_2019-09-02_11,13.332005,3
3,SAR,2019-09-02,11,6,4,43.068835,-73.771695,600005.77978,4769191.0,SAR_2019-09-02_11,16.715059,2
4,SAR,2019-09-02,11,6,5,43.068852,-73.771656,600008.96593,4769193.0,SAR_2019-09-02_11,20.444537,2


In [193]:
dict_of_positions = {}
for i, track_id in tqdm(enumerate(track_ids)):
    try:
        track_geometry = pd.read_csv(track_geometry_path + track_id + ".csv")
        horse_tracking_data = pd.read_csv(horse_tracking_data_path + track_id + ".csv")
        final_positions = calculate_final_positions(track_id, horse_tracking_data, track_geometry, df_race)
        dict_of_positions[track_id] = final_positions
    except:
        print(track_id)

1680it [00:30, 55.54it/s]

SAR_2019-07-21_2


1687it [00:30, 58.95it/s]

SAR_2019-07-24_2
SAR_2019-07-24_5


1729it [00:31, 53.32it/s]

SAR_2019-07-28_4


1802it [00:32, 55.15it/s]

SAR_2019-08-08_4


1869it [00:33, 52.48it/s]

SAR_2019-08-17_2


2000it [00:36, 54.90it/s]


In [None]:
dict_of_positions

In [195]:
df_race_positions['predicted_positions_new_alt'] = df_race_positions['race_key'].map(dict_of_positions)

In [196]:
df_race_positions.head()

Unnamed: 0,race_key,positions,calculated_positions,predicted_positions,predicted_positions_new,predicted_positions_new_alt
0,AQU_2019-01-01_1,"{2: '1 ', 3: '2 ', 4: '3 ', 5: '4 ', 1: '5...",,"{1: 1, 2: 5, 3: 2, 4: 3, 5: 4}","{1: 1, 3: 2, 4: 3, 5: 4, 2: 5}","{1: 1, 2: 5, 3: 2, 4: 3, 5: 4}"
1,AQU_2019-01-01_2,"{2: '1 ', 7: '2 ', 4: '3 ', 5: '4 ', 3: '5...",6.0,"{1: 6, 2: 1, 3: 5, 4: 3, 5: 4, 6: 2, 7: 7}","{2: 1, 7: 2, 4: 3, 5: 4, 3: 5, 1: 6, 6: 7}","{1: 6, 2: 1, 3: 5, 4: 3, 5: 4, 6: 7, 7: 2}"
2,AQU_2019-01-01_3,"{2: '1 ', 8: '2 ', 1: '3 ', 3: '4 ', 7: '5...",3.0,"{1: 1, 2: 3, 3: 4, 4: 6, 5: 7, 6: 8, 7: 5, 8: 2}","{1: 1, 8: 2, 2: 3, 3: 4, 7: 5, 4: 6, 5: 7, 6: 8}","{1: 1, 2: 3, 3: 4, 4: 6, 5: 7, 6: 8, 7: 5, 8: 2}"
3,AQU_2019-01-01_4,"{1: '1 ', 4: '1A ', 2: '3 ', 5: '4 ', 6: '5...",6.0,"{1: '3 ', 2: '1 ', 3: '6 ', 4: '1A ', 5: '4...","{2: '1 ', 4: '1A ', 1: '3 ', 5: '4 ', 6: '5...","{1: '3 ', 2: '1 ', 3: '6 ', 4: '1A ', 5: '4..."
4,AQU_2019-01-01_5,"{3: '1 ', 6: '2 ', 7: '3 ', 4: '5 ', 1: '6...",5.0,"{1: 6, 2: 8, 3: 1, 4: 5, 5: 7, 6: 2, 7: 3}","{3: 1, 6: 2, 7: 3, 4: 5, 1: 6, 5: 7, 2: 8}","{1: 6, 2: 8, 3: 1, 4: 5, 5: 7, 6: 2, 7: 3}"


In [201]:
match_count = 0
mismatch_count = 0

# Iterate over each row to compare the actual and predicted positions for the winner (position 1)
for index, row in df_race_positions.iterrows():
    # Safely extract the program numbers for the winner (position 1) from both positions dictionaries
    actual_positions = row['positions']
    predicted_positions = row['predicted_positions_new_alt']

    # Ensure both are dictionaries before accessing .get()
    actual_winner = None
    predicted_winner = None

    if isinstance(actual_positions, dict):
        actual_winner = actual_positions.get(3)
    else:
        continue
    if isinstance(predicted_positions, dict):
        predicted_winner = predicted_positions.get(3)
    else:
        continue

    # Ensure both are strings and normalize values by stripping spaces
    actual_winner = str(actual_winner).strip() if actual_winner is not None else None
    predicted_winner = str(predicted_winner).strip() if predicted_winner is not None else None

    # Check if they match and update counters
    if actual_winner == predicted_winner:
        match_count += 1
    else:
        mismatch_count += 1

# Print final counts of matches and mismatches
print(f"\nTotal Matches (True): {match_count}")
print(f"Total Mismatches (False): {mismatch_count}")


Total Matches (True): 1595
Total Mismatches (False): 399


In [198]:
# Initialize counters
true_matches = 0
mismatches = 0
second_position_matches = 0
third_position_matches = 0
fourth_position_matches = 0

# Iterate over each row to check for matches and specific position matches in mismatches
for index, row in df_race_positions.iterrows():
    # Safely extract actual positions and calculated positions
    actual_positions = row['positions']
    calculated_positions = row['predicted_positions']

    # Ensure they are dictionaries
    if not isinstance(actual_positions, dict):
        actual_positions = {}
        continue
    if not isinstance(calculated_positions, dict):
        calculated_positions = {}
        continue

    # Extract the program number for the actual winner (position 1)
    actual_winner = actual_positions.get(1)
    
    # Extract the calculated positions
    calculated_winner = calculated_positions.get(1)
    calculated_second = calculated_positions.get(2)
    calculated_third = calculated_positions.get(3)
    calculated_fourth = calculated_positions.get(4)

    # Normalize values (convert to strings and strip spaces)
    actual_winner = str(actual_winner).strip() if actual_winner is not None else None
    calculated_winner = str(calculated_winner).strip() if calculated_winner is not None else None
    calculated_second = str(calculated_second).strip() if calculated_second is not None else None
    calculated_third = str(calculated_third).strip() if calculated_third is not None else None
    calculated_fourth = str(calculated_fourth).strip() if calculated_fourth is not None else None

    # Check for a match at the first position
    if actual_winner == calculated_winner:
        true_matches += 1
    else:
        # Mismatch at the first position
        mismatches += 1
        
        # Check if the actual winner is in the second predicted position
        if actual_winner == calculated_second:
            second_position_matches += 1
        # If not in second, check the third position
        elif actual_winner == calculated_third:
            third_position_matches += 1
        # If not in second or third, check the fourth position
        elif actual_winner == calculated_fourth:
            fourth_position_matches += 1

# Total number of races
total_races = len(df_race_positions)

# Calculate percentages
true_match_percentage = (true_matches / total_races) * 100
mismatch_percentage = (mismatches / total_races) * 100
second_position_match_percentage = (second_position_matches / total_races) * 100
third_position_match_percentage = (third_position_matches / total_races) * 100
fourth_position_match_percentage = (fourth_position_matches / total_races) * 100

# Display counts and percentages
print(f"True Matches: {true_matches} ({true_match_percentage:.2f}%)")
print(f"Mismatches: {mismatches} ({mismatch_percentage:.2f}%)")
print(f"Mismatches where 2nd Predicted Position Won: {second_position_matches} ({second_position_match_percentage:.2f}%)")
print(f"Mismatches where 3rd Predicted Position Won: {third_position_matches} ({third_position_match_percentage:.2f}%)")
print(f"Mismatches where 4th Predicted Position Won: {fourth_position_matches} ({fourth_position_match_percentage:.2f}%)")


True Matches: 1846 (92.30%)
Mismatches: 148 (7.40%)
Mismatches where 2nd Predicted Position Won: 125 (6.25%)
Mismatches where 3rd Predicted Position Won: 18 (0.90%)
Mismatches where 4th Predicted Position Won: 0 (0.00%)


In [203]:
# save the final positions
df_race_positions.to_csv("my-data\\final_positions_predicted.csv", index=False)

In [1]:
# 