# <center> Process of cleaning and analyzing horse racing data from Kaggle dataset "Horses for Courses"</center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chisquare, ttest_ind

%matplotlib inline

# Suppress scientific notation: show every float with two fixed decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Columns to load. 'position_two' is left out on purpose because it has a lot
# of issues; 'position_again' is much more consistent and carries all the
# relevant win/place information.
fields = [
    # race and runner details
    "position_again", "bf_odds", "venue_name", "date", "market_name",
    "condition", "barrier", "handicap_weight", "last_five_starts",
    "prize_money", "sex", "age", "jockey", "jockey_sex", "trainer",
    "days_since_last_run",
    # starts / wins / places records
    "overall_starts", "overall_wins", "overall_places",
    "track_starts", "track_wins", "track_places",
    "firm_starts", "firm_wins", "firm_places",
    "good_starts", "good_wins", "good_places",
    "slow_starts", "slow_wins", "slow_places",
    "soft_starts", "soft_wins", "soft_places",
    "heavy_starts", "heavy_wins", "heavy_places",
    "distance_starts", "distance_wins", "distance_places",
]

# Read only the selected columns from the CSV.
df = pd.read_csv(
    "horses.csv",
    usecols=fields,
    skipinitialspace=True,
    low_memory=False,
)

df.head()

In [None]:
# Parse the 'date' strings (YYYY-MM-DD) into datetimes.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Removes the digits from 'condition' strings (e.g. the numbers at the end).
# The pattern is a regular expression, so regex=True must be explicit:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the digits in place. The raw string avoids an invalid-escape warning.
df['condition'] = df['condition'].str.replace(r'\d+', '', regex=True)

# Renaming condition values so that they're uniform (plain literal replacements).
df['condition'] = df['condition'].str.replace('HVY', 'HEAVY', regex=False)
# AWT equates to a Good surface under some weather conditions
df['condition'] = df['condition'].str.replace('AWT', 'GOOD', regex=False)

# Reverses 'last_five_starts' (originally written right-to-left) so that it's
# easier to read in the future.
df['last_five_starts'] = df['last_five_starts'].str[::-1]

## Creating general and track-condition-specific 'win_percent' and 'place_percent' columns:

In [None]:
# Derive win and place rates (wins/starts and places/starts) for the overall,
# track and distance records, then discard the raw counts they were built from.

columns_list = ["overall","track","distance"]

for category in columns_list:
    starts = df[category + "_starts"]
    df[category + "_win_percent"] = df[category + "_wins"] / starts
    df[category + "_place_percent"] = df[category + "_places"] / starts

    # 'overall_starts' is kept because it will be used later.
    spent_columns = [category + "_wins", category + "_places"]
    if category != "overall":
        spent_columns.append(category + "_starts")
    df.drop(columns=spent_columns, inplace=True)

In [None]:
# Build 'condition_win_percent' and 'condition_place_percent': each row takes
# the rates from the record that matches the condition of its own race.

# Rows without a recorded condition get an explicit NaN win percentage.
df.loc[df.condition.isna(), "condition_win_percent"] = np.nan

condition_list = ["firm","good","slow","soft","heavy"]

for surface in condition_list:
    # Rows whose (lower-cased) condition equals this surface.
    on_surface = df.condition.str.lower() == surface
    starts = df[surface + "_starts"]

    df.loc[on_surface, "condition_win_percent"] = df[surface + "_wins"] / starts
    df.loc[on_surface, "condition_place_percent"] = df[surface + "_places"] / starts

    # The raw counts for this surface are no longer needed.
    df.drop(
        columns=[surface + "_starts", surface + "_wins", surface + "_places"],
        inplace=True,
    )

# Condition column is not necessary now that there are condition win and
# place percent columns.
df.drop(columns='condition', inplace=True)

# Replaces infinity (zero division) with NaN.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

## Cleaning data by removing races with missing win and/or place values in 'position_again' column:

In [None]:
# Group by race (date, venue, market). The group keys double as the index
# values used to select rows from the re-indexed frames below
# (df_indexed and df_cleaned).
df_grouped = df.groupby(['date','venue_name','market_name'])

# Sum of 'position_again' per race; missing values are skipped by sum().
# (sum() takes no 'dropna' argument: older pandas silently ignored it and
# newer versions raise TypeError, so it is not passed.)
position_sums = df_grouped.position_again.sum()

# Keep only races whose recorded positions add up to 3 or 6,
# i.e. 1+2 and 1+2+3.
index_list1 = position_sums[(position_sums == 3) | (position_sums == 6)].index

df_indexed = df.set_index(['date','venue_name','market_name'])

df_cleaned = df_indexed.loc[index_list1]

In [None]:
# Regroup the filtered frame by race (date, venue, market).
df_grouped = df_cleaned.groupby(['date','venue_name','market_name'])

# Per race, computes the share of runners holding each 'position_again' value
# (NaN is counted as a value of its own) and keeps only the entries whose share
# is not 100%, i.e. races in which every runner has the same value are removed.
# Author's stated intent: eliminates remaining errors in the 'position_again'
# column by making sure that there isn't a single 3rd-place finish.
# NOTE(review): the resulting index carries the position value as an extra
# level beyond the three race keys - confirm that the later
# df_cleaned.loc[index_list2] lookup behaves as intended.
index_list2 = df_grouped.position_again.value_counts(normalize=True,dropna=False).where(lambda x:x != 1).dropna().index

In [None]:
# Restrict df_cleaned to the entries listed in index_list2.
# NOTE(review): index_list2 appears to hold one entry per (race, position
# value), so this lookup may return a race's rows more than once - verify
# against the pandas version in use.
df_cleaned = df_cleaned.loc[index_list2]

# Regroup by race so that later per-race computations use the filtered frame.
df_grouped = df_cleaned.groupby(['date','venue_name','market_name'])

## Creating a weight_minus_average column:

In [None]:
# Mean handicap weight of each race, broadcast back onto every row of the race.
average_weight = df_grouped['handicap_weight'].transform('mean')
df_cleaned.insert(4, 'average_weight', average_weight)

# How far each runner's weight sits above (+) or below (-) its race mean.
weight_offset = df_cleaned['handicap_weight'] - df_cleaned['average_weight']
df_cleaned.insert(5, 'weight_minus_average', weight_offset)

In [None]:
# Only 'weight_minus_average' is kept; drop the columns it was derived from.
df_cleaned.drop(columns=['handicap_weight', 'average_weight'], inplace=True)

In [None]:
# Rebuild the per-race grouping from the current state of df_cleaned.
df_grouped = df_cleaned.groupby(by=['date', 'venue_name', 'market_name'])

## Creating a prize_money_per_start_minus_average column:

In [None]:
# Creates prize_money_per_start column (prize money divided by overall starts).
# A zero start count with non-zero prize money yields +/-inf; it is turned into
# NaN right away so that a single such runner cannot push the race mean below
# to infinity and wipe out the values of every other runner in that race.
df_cleaned.insert(
    6,
    'prize_money_per_start',
    (df_cleaned.prize_money / df_cleaned.overall_starts).replace([np.inf, -np.inf], np.nan),
)

# Creates average_prize_money_per_start column: the per-race mean (NaN values
# are skipped by the mean), broadcast back onto every row of the race.
average_prize_money_per_start = df_grouped.prize_money_per_start.transform('mean')

df_cleaned.insert(7, 'average_prize_money_per_start', average_prize_money_per_start)

# Creates prize_money_per_start_minus_average column: each runner's value
# relative to the mean of its own race.
df_cleaned.insert(
    8,
    'prize_money_per_start_minus_average',
    df_cleaned.prize_money_per_start - df_cleaned.average_prize_money_per_start,
)

In [None]:
# Only 'prize_money_per_start_minus_average' is kept; drop its inputs.
df_cleaned.drop(
    columns=['prize_money', 'prize_money_per_start', 'average_prize_money_per_start'],
    inplace=True,
)

In [None]:
# Rebuild the per-race grouping from the current state of df_cleaned.
df_grouped = df_cleaned.groupby(by=['date', 'venue_name', 'market_name'])

## Creating a horse_age_minus_average column:

In [None]:
# Mean age of the runners in each race, broadcast back onto every row.
average_age = df_grouped['age'].transform('mean')
df_cleaned.insert(8, 'average_age', average_age)

# Each runner's age relative to the mean age of its race.
age_offset = df_cleaned['age'] - df_cleaned['average_age']
df_cleaned.insert(9, 'age_minus_average', age_offset)

In [None]:
# Drop the age inputs now that 'age_minus_average' exists, together with
# 'overall_starts'.
df_cleaned.drop(columns=['age', 'average_age', 'overall_starts'], inplace=True)

In [None]:
# Replaces infinity (zero division) with NaN.
df_cleaned.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Order the rows by their index.
df_cleaned.sort_index(inplace=True)

# Per-race grouping used by the analysis below.
df_grouped = df_cleaned.groupby(by=['date', 'venue_name', 'market_name'])

df_cleaned.head()

# Testing which features are significant:

## For jockey gender:

#### Overall percentage of men and women in races where both are represented:

In [None]:
# Share of each jockey gender within every race (NaN counted as its own value).
# Entries with a share of exactly 1 are removed: those are races where there is
# only one jockey gender, meaning that the other gender can't win.
jockey_sex_population = df_grouped.jockey_sex.value_counts(normalize=True,dropna=False).where(lambda x: x != 1).dropna()

# Finds mean percent of jockey genders in races. Index level 3 is the gender
# value added by value_counts after the three race keys.
# Series.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# groupby(level=..., sort=False).mean() is the equivalent spelling.
jockey_sex_population.groupby(level=3, sort=False).mean()

In [None]:
# Index of the entries that survived the mixed-gender filter.
jockey_sex_population_indices = jockey_sex_population.index

# Look those entries up in df_cleaned, then remove repeated rows.
# NOTE(review): the lookup index carries a gender level in addition to the race
# keys, so a race is presumably returned once per listed gender, which is why
# drop_duplicates follows - confirm against the pandas version in use.
# NOTE(review): drop_duplicates compares column values only (the index is
# ignored), so two runners with identical values in every column would be
# collapsed into one.
jockey_sex_population_cleaned = df_cleaned.loc[jockey_sex_population_indices].drop_duplicates()

#### Finding the total a different way:

In [None]:
# Runner counts per jockey gender (missing values included) in the selected
# races. Yields same result as previous total:
jockey_sex_population_cleaned['jockey_sex'].value_counts(dropna=False)

#### Win percentage of those races:

In [None]:
# Boolean mask of the winning rows (position 1) in races with both jockey
# genders represented.
jockey_sex_win_indices = jockey_sex_population_cleaned.position_again == 1

# Winners per jockey gender (missing values included).
jockey_sex_winners = jockey_sex_population_cleaned[jockey_sex_win_indices]
jockey_sex_winners['jockey_sex'].value_counts(dropna=False)

#### Ensuring that races counted were same for both percentage values:

In [None]:
# Prints the number of races behind each of the two previous percentages
# (all selected runners, then winners only). The value counts shown last give
# the number of races there should be (the number of first-place finishes).
race_keys = ['date','venue_name','market_name']

races_with_both_genders = jockey_sex_population_cleaned.groupby(race_keys)
print(len(races_with_both_genders))

winner_rows = jockey_sex_population_cleaned[jockey_sex_win_indices]
races_with_winner = winner_rows.jockey_sex.groupby(race_keys)
print(len(races_with_winner))

jockey_sex_population_cleaned['position_again'].value_counts()

### Using a 2-proportion z-test, I find that jockey gender is significant with a p-value of 2.3E-30
#### (There is currently a bug in the statsmodels library concerning compatibility with scipy, so I used a scientific calculator)

## For horse gender:

In [None]:
# Share of each horse gender within every race (NaN counted as its own value).
# Entries with a share of exactly 1 are removed: those are races where only one
# horse gender is represented.
horse_gender_population = df_grouped.sex.value_counts(normalize=True,dropna=False).where(lambda x: x != 1).dropna()

# Takes average percentage of each gender in races. Index level 3 is the gender
# value added by value_counts after the three race keys.
# In this case, this method of finding the general horse gender distribution
# overrepresents genders that are rare.
# Series.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# groupby(level=..., sort=False).mean() is the equivalent spelling.
horse_gender_population.groupby(level=3, sort=False).mean()

In [None]:
# Index of the entries that survived the mixed-gender filter.
horse_gender_population_indices = horse_gender_population.index

# Look those entries up in df_cleaned, then remove repeated rows.
# NOTE(review): the lookup index carries a gender level in addition to the race
# keys, so a race is presumably returned once per listed gender, which is why
# drop_duplicates follows - confirm against the pandas version in use.
# NOTE(review): drop_duplicates compares column values only (the index is
# ignored), so two runners with identical values in every column would be
# collapsed into one.
horse_gender_population_cleaned = df_cleaned.loc[horse_gender_population_indices].drop_duplicates()

In [None]:
# General percentage of horse genders (missing values included) for races
# where multiple genders are represented.
horse_gender_population_cleaned['sex'].value_counts(normalize=True, dropna=False)

In [None]:
# Boolean mask of the winning rows (position 1) among the selected races.
horse_gender_win_indices = horse_gender_population_cleaned.position_again == 1

# Winners per horse gender (missing values included), computed once and reused
# for both the printed total and the displayed breakdown.
winner_sex_counts = horse_gender_population_cleaned[horse_gender_win_indices].sex.value_counts(dropna=False)

print('Total wins:', winner_sex_counts.sum())

winner_sex_counts

### Using Pearson's chi-squared test for both ways of finding the population, I find that horse gender is significant:

In [None]:
# Observed wins per horse gender and the expected share of each gender.
observed = [10015,4200,2447,715,187]
expected_percentages = [.57,.27,.13,.03,.01]

# Total number of wins: 17564, which removes the two anomalies (the Unknown and
# NaN genders).
total_wins = sum(observed)

# The rounded percentages add up to 1.01, so they are rescaled to sum to 1.
# Otherwise the expected counts would total more than the observed counts,
# which biases the statistic and is rejected by current SciPy versions.
percentage_total = sum(expected_percentages)
expected = [x / percentage_total * total_wins for x in expected_percentages]

# Pearson's chi-squared goodness-of-fit test of observed vs. expected wins.
test_stat1, p_value1 = chisquare(observed, expected)

test_stat1, p_value1

## For horse weight, using weight_minus_average:

In [None]:
# Summary statistics of the weight difference from the race average, all runners.
df_cleaned['weight_minus_average'].describe()

In [None]:
# Boolean mask of the winning rows (position 1). It is built directly from
# df_cleaned so that it shares df_cleaned's index exactly; routing the same
# elementwise comparison through groupby.apply returns an index whose shape
# depends on the pandas version and can fail to align when used as a mask.
general_win_indices = df_cleaned.position_again == 1

# Summary statistics of the weight difference for winners.
df_cleaned[general_win_indices].weight_minus_average.describe()

In [None]:
# Samples to compare: the weight difference of every runner and of winners
# only, with missing values removed.
data2a = df_cleaned['weight_minus_average'].dropna().values
data2b = df_cleaned[general_win_indices]['weight_minus_average'].dropna().values

# Settings shared by both histograms so that they are directly comparable.
weight_hist_kwargs = dict(density=True, bins=40, range=(-10,10), alpha=.5, edgecolor='k')

plt.title("Winner and Race Distributions of Weight from Race Average", fontsize=15)
plt.hist(data2a, label='Race Average', color='b', **weight_hist_kwargs)
plt.hist(data2b, label='Winner Average', color='r', **weight_hist_kwargs)
plt.legend(loc='upper right')
plt.xlabel('Weight from Average')
plt.ylabel('Probability');

#### 2-sample T-test:

### Using a 2-sample T-test, I find that horse weight is significant:

In [None]:
# Two-sample t-test comparing the weight differences of all runners (data2a)
# with those of the winners (data2b).
test_stat2, p_value2 = ttest_ind(a=data2a, b=data2b)

test_stat2, p_value2

## For prize money, using prize_money_per_start_minus_average:

In [None]:
# Summary statistics of prize money per start relative to the race average,
# all runners.
df_cleaned['prize_money_per_start_minus_average'].describe()

In [None]:
# Same summary restricted to the winning rows (winner prize money).
winner_rows = df_cleaned[general_win_indices]
winner_rows['prize_money_per_start_minus_average'].describe()

In [None]:
# Samples to compare: prize money per start minus the race average for every
# runner and for winners only, with missing values removed.
data3a = df_cleaned['prize_money_per_start_minus_average'].dropna().values
data3b = df_cleaned[general_win_indices]['prize_money_per_start_minus_average'].dropna().values

# Settings shared by both histograms (only label, colour and opacity differ).
prize_hist_kwargs = dict(density=True, bins=40, range=(-10000,10000), edgecolor='k')

plt.title("Winner and Race Distributions of Prize Money per Start Minus Average", fontsize=15)
plt.hist(data3a, label='Race Average', color='b', alpha=.6, **prize_hist_kwargs)
plt.hist(data3b, label='Winner Average', color='r', alpha=.5, **prize_hist_kwargs)
plt.legend(loc='upper right')
plt.xlabel('Prize Money per Start Minus Average')
plt.ylabel('Probability');

### Using a 2-sample T-test, I find that prize money per start is significant:


In [None]:
# Two-sample t-test comparing prize money per start (relative to the race
# average) of all runners (data3a) with that of the winners (data3b).
test_stat3, p_value3 = ttest_ind(a=data3a, b=data3b)

test_stat3, p_value3

## For horse age, using horse_age_minus_average:

In [None]:
# Share of each 'age_minus_average' value within every race (NaN counted as its
# own value); entries with a share of exactly 1 are discarded, i.e. races in
# which every runner has the same value.
age_share_per_race = df_grouped.age_minus_average.value_counts(normalize=True, dropna=False)
age_minus_average_population = age_share_per_race.where(lambda share: share != 1).dropna()

In [None]:
# Summary statistics of the retained per-race shares.
age_minus_average_population.describe()

In [None]:
# Index of the entries that survived the filter on per-race shares.
age_minus_average_population_indices = age_minus_average_population.index

# Look those entries up in df_cleaned, then remove repeated rows.
# NOTE(review): the lookup index carries the 'age_minus_average' value as a
# level in addition to the race keys, so a race is presumably returned once per
# listed value, which is why drop_duplicates follows - confirm against the
# pandas version in use.
# NOTE(review): drop_duplicates compares column values only (the index is
# ignored), so two runners with identical values in every column would be
# collapsed into one.
age_minus_average_population_cleaned = df_cleaned.loc[age_minus_average_population_indices].drop_duplicates()

In [None]:
# Boolean mask of the winning rows (position 1) among the races selected for
# the age analysis. The mask is built from, and applied to,
# age_minus_average_population_cleaned, mirroring the jockey and horse gender
# sections. It was previously built from the jockey-gender frame and applied to
# df_cleaned, two frames with different rows.
age_minus_average_win_indices = age_minus_average_population_cleaned.position_again == 1

# Summary statistics of the age difference from the race average for winners.
age_minus_average_population_cleaned[age_minus_average_win_indices].age_minus_average.describe()