# Cleaning and analyzing horse racing data from the Kaggle dataset "Horses for Courses"

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

%matplotlib inline

In [2]:
# Columns to load from the raw CSV; any other column in the file is skipped.
fields = [
    "position_again", "position_two", "bf_odds", "venue_name", "date",
    "market_name", "condition", "barrier", "handicap_weight",
    "last_five_starts", "prize_money", "sex", "age", "jockey", "jockey_sex",
    "trainer", "days_since_last_run", "runs_since_spell",
    "overall_starts", "overall_wins", "overall_places",
    "track_starts", "track_wins", "track_places",
    "firm_starts", "firm_wins", "firm_places",
    "good_starts", "good_wins", "good_places",
    "dead_starts", "dead_wins", "dead_places",
    "slow_starts", "slow_wins", "slow_places",
    "soft_starts", "soft_wins", "soft_places",
    "heavy_starts", "heavy_wins", "heavy_places",
    "distance_starts", "distance_wins", "distance_places",
]

df = pd.read_csv(
    "horses.csv",
    usecols=fields,
    skipinitialspace=True,
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


### The following cell converts columns to the intended dtypes and renames `position_two` to `all_finish`

In [9]:
# Parse the race date and force the free-text columns to plain strings.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
for text_column in ("market_name", "venue_name", "condition", "sex", "last_five_starts"):
    df[text_column] = df[text_column].astype("str")

# index=str converts the row labels to strings; position_two is renamed to all_finish.
df = df.rename(index=str, columns={"position_two": "all_finish"})

df.head()

Unnamed: 0,position_again,all_finish,bf_odds,venue_name,date,market_name,condition,barrier,handicap_weight,last_five_starts,...,slow_places,soft_starts,soft_wins,soft_places,heavy_starts,heavy_wins,heavy_places,distance_starts,distance_wins,distance_places
0,1.0,1.0,2.88,Echuca,2016-06-27,R3 1200m Mdn,HVY9,3.0,58.5,['f3'],...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,2.0,2.0,15.0,Echuca,2016-06-27,R3 1200m Mdn,HVY9,10.0,58.5,['x80x2'],...,1.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,3.0,3.0,95.0,Echuca,2016-06-27,R3 1200m Mdn,HVY9,6.0,56.5,['79x00'],...,0.0,5.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0
3,,4.0,20.0,Echuca,2016-06-27,R3 1200m Mdn,HVY9,2.0,56.5,['f0'],...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,,5.0,2.74,Echuca,2016-06-27,R3 1200m Mdn,HVY9,7.0,56.5,['f4x'],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Set these columns aside for now; they may be added back in later.
people_and_spell_columns = ["jockey", "jockey_sex", "trainer", "runs_since_spell"]
record_columns = [
    f"{category}_{stat}"
    for category in ("track", "firm", "good", "dead", "slow", "soft", "heavy", "distance")
    for stat in ("starts", "wins", "places")
]

df_main = df.drop(columns=people_and_spell_columns + record_columns)

df_main.head()

### While trying to clean the data, I find:

In [None]:
# Rows whose all_finish is 1 and is followed by another all_finish of 1, while
# position_again disagrees with all_finish: doubled first-place finishes
# (false ties) in all_finish that are inconsistent with position_again.
all_finish_col = df_main["all_finish"]
followed_by_same_finish = all_finish_col == all_finish_col.shift(-1)
is_first_place = all_finish_col == 1
disagrees_with_position_again = df_main["position_again"] != all_finish_col

df_main[followed_by_same_finish & is_first_place & disagrees_with_position_again]

In [None]:
# Shows first-place rows where position_again and all_finish agree on 1st and 2nd,
# and either agree on 3rd or position_again is missing while all_finish says 3rd.
pos_again = df_main["position_again"]
finish = df_main["all_finish"]
pos_again_next, finish_next = pos_again.shift(-1), finish.shift(-1)
pos_again_third, finish_third = pos_again.shift(-2), finish.shift(-2)

first_ok = (pos_again == finish) & (pos_again == 1)
second_ok = (pos_again_next == finish_next) & (pos_again_next == 2)
third_ok = ((pos_again_third == finish_third) & (pos_again_third == 3)) | (
    pos_again_third.isna() & (finish_third == 3)
)

df_main[first_ok & second_ok & third_ok]

In [None]:
# Index labels of first-place rows that open a consistent 1-2-3 sequence: the two
# position columns agree on 1st and 2nd, and either agree on 3rd or position_again
# is missing while all_finish says 3rd.
winner_pos = df_main["position_again"]
winner_finish = df_main["all_finish"]

starts_with_first = (winner_pos == winner_finish) & (winner_pos == 1)
then_second = (winner_pos.shift(-1) == winner_finish.shift(-1)) & (winner_pos.shift(-1) == 2)
then_third_agreed = (winner_pos.shift(-2) == winner_finish.shift(-2)) & (winner_pos.shift(-2) == 3)
then_third_missing = winner_pos.shift(-2).isna() & (winner_finish.shift(-2) == 3)

clean_indices = df_main.index[
    starts_with_first & then_second & (then_third_agreed | then_third_missing)
]

clean_indices

In [None]:
#finds all races where there are 0 or NaN values above the first place position of the same race

# A "consistent winner" row has position_again == all_finish == 1.
consistent_winner = (df_main["position_again"] == df_main["all_finish"]) & (df_main["position_again"] == 1)


def winner_labels_sharing_race_above(offset):
    """Return index labels of consistent-winner rows whose row `offset` places
    above has the same market_name, venue_name and date (i.e. the same race)."""
    matches = consistent_winner
    for race_key in ("market_name", "venue_name", "date"):
        matches = matches & (df_main[race_key] == df_main[race_key].shift(offset))
    return list(df_main.index[matches])


shift_one_indices = winner_labels_sharing_race_above(1)
shift_two_indices = winner_labels_sharing_race_above(2)
shift_three_indices = winner_labels_sharing_race_above(3)
shift_four_indices = winner_labels_sharing_race_above(4)
shift_five_indices = winner_labels_sharing_race_above(5)
shift_six_indices = winner_labels_sharing_race_above(6)

shift_indices_by_name = {
    "shift_one_indices": shift_one_indices,
    "shift_two_indices": shift_two_indices,
    "shift_three_indices": shift_three_indices,
    "shift_four_indices": shift_four_indices,
    "shift_five_indices": shift_five_indices,
    "shift_six_indices": shift_six_indices,
}

for list_name, labels in shift_indices_by_name.items():
    print(list_name + ":", len(labels), labels)

# Union of every shifted list. The previous set(a+b+c, d+e) passed two arguments
# to set(), which raises TypeError, and it also left shift_six_indices out.
shift_all_indices = list(set().union(*shift_indices_by_name.values()))

In [None]:
# Example of a shifted race: rows belonging to the race sit above its first-place row.
# Displays the six rows before position x, row x itself, and the three rows after it.

x = 5405
rows_before, rows_after = 6, 3

df_main.iloc[list(range(x - rows_before, x + rows_after + 1))]

In [None]:
#adds win percentage and place percentage columns and drops "overall" columns

# pandas division never raises ZeroDivisionError (it yields inf or NaN), so the
# previous try/except fallback could never run. Zero starts are mapped to NaN up
# front so that both ratios come out as NaN for horses with no starts.
starts_or_nan = df_main["overall_starts"].replace(0, np.nan)

# The values are fractions in [0, 1], not percentages. The existing column names
# (including the "n_" prefix) are kept because other cells may refer to them.
df_main["n_win_percent"] = df_main["overall_wins"] / starts_or_nan
df_main["o_place_percent"] = df_main["overall_places"] / starts_or_nan

df_main = df_main.drop(columns=["position_again", "overall_wins", "overall_starts", "overall_places"])

In [None]:
#might finish and use this to fix the shift_all_indices
# Unfinished scratch code, wrapped in a bare triple-quoted string so none of it is
# executed; as the only expression in the cell, the string itself is displayed.
# NOTE(review): wrong_indices, all_finish, venue_name and market_name are not
# defined as standalone variables in the visible cells, and the final for-loop has
# no body, so this cannot run as written.
# NOTE(review): both `while (place == wrong_indices[i]+first_counter)` conditions
# compare `place` against an integer position rather than a venue name; that looks
# unintended (presumably venue_name[...] was meant) — confirm before reviving.

'''

checker = []

for i in range(len(wrong_indices)):
    if i == len(wrong_indices)-1:
        break
    else:
        first_counter = 0
        value = all_finish[wrong_indices[i]]
        while value != 1:
            first_counter += 1
            value = all_finish[wrong_indices[i]+first_counter]
        place = venue_name[wrong_indices[i]+first_counter]
        race = market_name[wrong_indices[i]+first_counter]
        find_counter = 0
        while (place == wrong_indices[i]+first_counter) & (race == market_name[wrong_indices[i]+first_counter]):
            find_counter -= 1
            place = venue_name[wrong_indices[i]+find_counter]
            race = market_name[wrong_indices[i]+find_counter]
        place = venue_name[wrong_indices[i]+first_counter]
        race = market_name[wrong_indices[i]+first_counter]
        replace_count = 0
        while (place == wrong_indices[i]+first_counter) & (race == market_name[wrong_indices[i]+first_counter]):
            replace_count += 1
            place = venue_name[wrong_indices[i]+replace_count]
            race = market_name[wrong_indices[i]+replace_count]
        for j in range((wrong_indices[i]+replace_count-1) - (wrong_indices[i]+first_counter+1)):
            
        ...
                    
'''

In [None]:
#makes a new dataframe of all good races in "clean_values"

# Row labels in df_main are strings of the row positions, so int(label) is the
# positional offset of that row. Work on a plain array so the lookup is explicitly
# positional instead of relying on integer [] access to a string-labelled Series.
finish_by_position = df_main["all_finish"].to_numpy()
total_rows = len(finish_by_position)

# Sets give constant-time membership checks inside the loop.
clean_label_set = set(clean_indices)
shifted_label_set = set(shift_all_indices)

clean_row_positions = []
for label in clean_indices:
    race_start = int(label)

    # Scan forward to the next row whose all_finish is 1 (the next race's winner).
    counter = 1
    while race_start + counter < total_rows and finish_by_position[race_start + counter] != 1:
        counter += 1
    if race_start + counter >= total_rows:
        # No later first-place row: the following race cannot be validated, so this
        # race is skipped rather than indexing past the end of the frame.
        continue

    next_winner_label = str(race_start + counter)
    if next_winner_label in clean_label_set and next_winner_label not in shifted_label_set:
        # NOTE(review): as in the earlier range(counter-1) loop, the row directly
        # before the next winner is excluded — confirm this is intended.
        clean_row_positions.extend(range(race_start, race_start + counter - 1))

# One positional selection replaces the row-by-row DataFrame.append calls
# (removed in pandas 2.0 and quadratic in a loop). It also preserves df_main's
# column order and dtypes; reset_index mirrors the old ignore_index=True.
df_clean = df_main.iloc[clean_row_positions].reset_index(drop=True)

### The following splits the market_name column into three separate columns

In [None]:
# Distinct market_name strings present in the cleaned races.
df_clean["market_name"].unique()

In [None]:
# Split market_name on the first two spaces into three columns (0, 1, 2).
new = df_clean["market_name"].str.split(pat=" ", n=2, expand=True)

In [None]:
# Place the three market_name parts at column positions 4, 5 and 6.
for part_number, column_name in enumerate(("race_number", "race_length", "race_type")):
    df_clean.insert(4 + part_number, column_name, new[part_number])

In [None]:
# market_name has been split into separate columns, so remove it in place.
del df_clean["market_name"]
df_clean.head()

In [None]:
# Distinct race_type values produced by the market_name split.
df_clean["race_type"].unique()

### Next, I will find the average weight of each race to create a handicap_weight_minus_average column.