# <center> Process of cleaning and analyzing horse racing data from Kaggle dataset "Horses for Courses"</center>

In [None]:
import pandas as pd
import numpy as np
# The plotting API (plot, show, figure, ...) lives in the pyplot submodule;
# the bare top-level matplotlib package does not provide those functions.
import matplotlib.pyplot as plt

%matplotlib inline

# Suppress scientific notation: pandas renders every float with two decimals.
pd.options.display.float_format = lambda number: '%.2f' % number

In [None]:
# Columns to load from the CSV. 'position_two' is deliberately not loaded
# (author's note: it has many issues); 'position_again' is used instead
# because it is more consistent and carries the win/place information.
fields = [
    # race outcome, market and race identification
    "position_again", "bf_odds", "venue_name", "date", "market_name",
    "condition",
    # runner details
    "barrier", "handicap_weight", "last_five_starts", "prize_money",
    "sex", "age", "jockey", "jockey_sex", "trainer",
    "days_since_last_run",
    # career record
    "overall_starts", "overall_wins", "overall_places",
    # record at this track
    "track_starts", "track_wins", "track_places",
    # record per track condition
    "firm_starts", "firm_wins", "firm_places",
    "good_starts", "good_wins", "good_places",
    "slow_starts", "slow_wins", "slow_places",
    "soft_starts", "soft_wins", "soft_places",
    "heavy_starts", "heavy_wins", "heavy_places",
    # record at this distance
    "distance_starts", "distance_wins", "distance_places",
]

# Read only the selected columns; low_memory=False parses the file in a
# single pass rather than in chunks.
df = pd.read_csv(
    "horses.csv",
    usecols=fields,
    skipinitialspace=True,
    low_memory=False,
)

df.head()

In [None]:
# Tally every distinct 'condition' value (missing values included) to see
# which spellings need normalising.
df["condition"].value_counts(dropna=False)

In [None]:
# Parse the 'date' strings (year-month-day) into datetime values.
df.date = pd.to_datetime(df.date, format='%Y-%m-%d')

# Remove every run of digits from the 'condition' string.
# regex=True is required: from pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the digits in place. The raw
# string avoids the invalid '\d' escape in a normal string literal.
df.condition = df.condition.str.replace(r'\d+', '', regex=True)

# Rename condition values so that they're uniform (plain substring swaps).
df.condition = df.condition.str.replace('HVY', 'HEAVY', regex=False)
# Author's note: AWT equates to a Good surface under some weather conditions.
df.condition = df.condition.str.replace('AWT', 'GOOD', regex=False)

# Reverse 'last_five_starts' (author's note: originally written
# right-to-left) so that it's easier to read later on.
df.last_five_starts = df.last_five_starts.str[::-1]

df.head()

## A prize_money_per_start column might be useful.

In [None]:
# Prize money divided by overall starts, inserted at column position 10.
df.insert(
    loc=10,
    column='prize_money_per_start',
    value=df['prize_money'].div(df['overall_starts']),
)

# The raw total is no longer needed once the per-start figure exists.
df.drop(columns='prize_money', inplace=True)

df.head()

## Creating overall and condition-specific 'win_percent' and 'place_percent' columns:

In [None]:
# For each record category (overall, track, distance) derive the win and
# place rates as wins/starts and places/starts (fractions, not multiplied by
# 100), then discard the raw count columns they were computed from.

columns_list = ["overall","track","distance"]

for category in columns_list:
    starts_col = f"{category}_starts"
    wins_col = f"{category}_wins"
    places_col = f"{category}_places"

    df[f"{category}_win_percent"] = df[wins_col].div(df[starts_col])
    df[f"{category}_place_percent"] = df[places_col].div(df[starts_col])

    df.drop(columns=[starts_col, wins_col, places_col], inplace=True)

In [None]:
# Build 'condition_win_percent' / 'condition_place_percent': each row takes
# the wins/starts and places/starts ratio of the surface that matches its own
# 'condition' value (compared case-insensitively).

# Rows with no recorded condition get an explicit NaN win rate.
df.loc[df["condition"].isna(), "condition_win_percent"] = np.nan

condition_list = ["firm","good","slow","soft","heavy"]

# 'condition' is not modified inside the loop, so lower-case it only once.
condition_lower = df["condition"].str.lower()

for surface in condition_list:
    starts_col = f"{surface}_starts"
    wins_col = f"{surface}_wins"
    places_col = f"{surface}_places"
    on_surface = condition_lower == surface

    # The right-hand sides are full-length Series; .loc aligns them on the
    # index and writes only the rows selected by the mask.
    df.loc[on_surface, "condition_win_percent"] = df[wins_col] / df[starts_col]
    df.loc[on_surface, "condition_place_percent"] = df[places_col] / df[starts_col]

    df.drop(columns=[starts_col, wins_col, places_col], inplace=True)

# 'condition' itself is redundant now that the per-condition rates exist.
df.drop(columns='condition', inplace=True)

# Divisions by zero starts produce +/-infinity; turn those into NaN.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

## Cleaning data by removing races with missing win and/or place values in 'position_again' column:

In [None]:
# Group the rows race by race: a race is identified by its date, venue and
# market name. The group keys are reused below to select rows from the
# re-indexed dataframe 'df_indexed'.

df_grouped = df.groupby(by=['date','venue_name','market_name'])

# Preview up to 20 rows from each race.
df_grouped.head(n=20)

In [None]:
# Per-race tally of the values found in 'position_again'.
df_grouped['position_again'].value_counts()

In [None]:
# Keep only the race keys whose summed 'position_again' exceeds 2; races at
# or below that total are treated as lacking place values and are dropped.

index_list = df_grouped['position_again'].sum().loc[lambda totals: totals > 2].index

In [None]:
# Index the rows by race key (date, venue, market) ...
df_indexed = df.set_index(keys=['date','venue_name','market_name'])

# ... and keep only the races whose key is listed in index_list.
df_cleaned1 = df_indexed.loc[index_list]

In [None]:
# Order the retained rows by their (date, venue, market) index.
df_cleaned1 = df_cleaned1.sort_index()

df_cleaned1

## Next, I compute an 'average_weight' for each race, use it to derive a 'weight_minus_average' column, and then drop the 'handicap_weight' column.

In [None]:
# Group the cleaned rows by race again, using the same three keys as before
# (they are now index levels of df_cleaned1).

df_grouped2 = df_cleaned1.groupby(by=['date','venue_name','market_name'])

# Preview the first five rows of each race.
df_grouped2.head(n=5)

In [None]:
# Mean handicap weight of each race, broadcast back onto every runner's row.
average_weight = df_grouped2['handicap_weight'].transform('mean')

# Store it as a column at position 4.
df_cleaned1.insert(loc=4, column='average_weight', value=average_weight)

df_cleaned1.head()

In [None]:
# Each runner's handicap weight relative to the race average, inserted at
# column position 5.
df_cleaned1.insert(
    loc=5,
    column='weight_minus_average',
    value=df_cleaned1['handicap_weight'].sub(df_cleaned1['average_weight']),
)

# The two source columns are not needed once the difference is stored.
df_cleaned1.drop(columns=['handicap_weight', 'average_weight'], inplace=True)

df_cleaned1.head()

In [None]:
# Regroup by race now that the weight columns have been reshaped.
df_grouped3 = df_cleaned1.groupby(by=['date','venue_name','market_name'])

In [None]:
# Number of usable races, i.e. the number of (date, venue, market) groups.
df_grouped3.ngroups

In [None]:
# Share of each jockey sex within every race, restricted to races where both
# sexes are represented (so that the per-sex win percentage is comparable).
# A share of exactly 1 means the race had jockeys of a single sex only, so
# those entries are masked out and dropped.
jockey_sex_population = (
    df_grouped3.jockey_sex.value_counts(normalize=True)
    .where(lambda x: (x != 1) & (x.notna()))
    .dropna()
)

# Average the per-race shares for each jockey sex. Index level 3 is the
# jockey_sex level that value_counts adds after the three race keys.
# Series.mean(level=...) was removed in pandas 2.0; grouping on the level
# gives the same result on old and new versions.
jockey_sex_population.groupby(level=3).mean()

In [None]:
# Boolean mask over df_cleaned1 that is True where 'position_again' equals 1
# (presumably the winning runner - confirm against the dataset description).
# Taking `.index` of a per-group `x == 1` result yields the labels of every
# row, not only the matching ones, so a row-wise mask is used to make the
# selection. Missing positions compare as False.
jockey_sex_win = df_cleaned1.position_again == 1

In [None]:
# Display the rows of df_cleaned1 selected by the jockey_sex_win indexer.
df_cleaned1.loc[jockey_sex_win]