In [None]:
import os
import re

import numpy as np
import pandas as pd
from google.colab import drive

# NOTE(review): later cells call `plt.*` and `sns.*`, but matplotlib.pyplot and
# seaborn are never imported anywhere in this notebook - import them here too.

# **Load Datasets**

In [None]:
# Mount Google Drive into the Colab filesystem; `path` is the folder that is
# handed to load_horse_data() to locate the CSV files.
drive.mount('/content/drive')
path = '/content/drive/My Drive/Horse'

In [None]:
def load_horse_data(path, start_year=1990, end_year=2020):
    """Load the per-year race and horse CSV files plus the forward file.

    For every year in ``start_year..end_year`` (inclusive) the files
    ``races_<year>.csv`` and ``horses_<year>.csv`` are read from ``path`` when
    they exist; years without a file are skipped. The yearly frames are then
    concatenated with a fresh 0..n-1 index.

    Args:
        path: Directory containing the CSV files.
        start_year: First year to look for (default 1990).
        end_year: Last year to look for, inclusive (default 2020).

    Returns:
        Tuple ``(race_data, horse_data, forward_data)`` of DataFrames, where
        ``forward_data`` is read from ``forward.csv`` in ``path``.

    Raises:
        ValueError: If no race file or no horse file exists for the requested
            years (instead of pandas' opaque "No objects to concatenate").
    """
    race_frames = []
    horse_frames = []

    for year in range(start_year, end_year + 1):
        race_file = os.path.join(path, f"races_{year}.csv")
        horse_file = os.path.join(path, f"horses_{year}.csv")

        if os.path.exists(race_file):
            race_frames.append(pd.read_csv(race_file))

        if os.path.exists(horse_file):
            horse_frames.append(pd.read_csv(horse_file))

    # pd.concat([]) raises a ValueError that does not say which files were
    # expected; fail with the same exception type but a usable message.
    if not race_frames:
        raise ValueError(
            f"No races_<year>.csv files found in {path!r} for years {start_year}-{end_year}"
        )
    if not horse_frames:
        raise ValueError(
            f"No horses_<year>.csv files found in {path!r} for years {start_year}-{end_year}"
        )

    race_data = pd.concat(race_frames, ignore_index=True)
    horse_data = pd.concat(horse_frames, ignore_index=True)

    forward_data = pd.read_csv(os.path.join(path, "forward.csv"))

    return race_data, horse_data, forward_data

# Read all yearly race/horse files and the forward file from the Drive folder.
race_data, horse_data, forward_data = load_horse_data(path)

**Merge Dataset**

In [None]:
# Inner join on the shared `rid` key: only rows whose `rid` occurs in both
# frames are kept.
merged_data = pd.merge(race_data, horse_data, on="rid", how="inner")

# **Data Preprocessing**

**Convert Datatypes**

In [None]:
# Print column dtypes and non-null counts before any conversion.
merged_data.info()

# Parse 'time' as HH:MM and keep only the time-of-day part; parse 'date' into
# datetimes. errors='coerce' turns every value that does not match the format
# into NaT instead of raising.
# NOTE(review): '%y/%m/%d' expects a two-digit year first - confirm against the
# raw CSVs, because a mismatch is silently coerced to NaT and rows with a null
# 'date' are dropped by the dropna() in the null-handling cell.
merged_data['time'] = pd.to_datetime(merged_data['time'], format='%H:%M', errors='coerce').dt.time
merged_data['date'] = pd.to_datetime(merged_data['date'], format='%y/%m/%d', errors='coerce')

def convert_distance_to_furlongs(distance_str):
    """Convert a distance such as ``"1m4½f"`` into a number of furlongs.

    The string is read as a sequence of ``<number><½><unit>`` tokens where the
    number and the ``½`` are each optional and the unit is ``m`` (mile, counted
    as 8 furlongs) or ``f`` (furlong). Examples: ``"7f"`` -> 7,
    ``"1m2f"`` -> 10, ``"6½f"`` -> 6.5, ``"1m½f"`` -> 8.5.

    Args:
        distance_str: Distance text. Any non-string value (e.g. NaN or an
            already numeric distance) is returned unchanged.

    Returns:
        Total furlongs (``int`` when no half is involved, otherwise ``float``);
        0 when the string contains no recognisable token.
    """
    if not isinstance(distance_str, str):
        return distance_str  # Return unchanged if not a string

    total_furlongs = 0
    # The half sign belongs to the number in front of the unit ("4½f" is four
    # and a half furlongs), so it is captured together with that number rather
    # than as a unit of its own.
    for whole, half, unit in re.findall(r"(\d+)?(½)?(m|f)", distance_str):
        amount = int(whole) if whole else 0
        if half:
            amount += 0.5
        total_furlongs += amount * 8 if unit == 'm' else amount
    return total_furlongs

# Replace the textual distance with its value in furlongs, row by row.
# Note: 'distance' is listed in columns_to_drop in the null-handling cell, so
# the converted column does not survive that step.
merged_data['distance'] = merged_data['distance'].apply(convert_distance_to_furlongs)

# Print the dtypes again to confirm the conversions above.
merged_data.info()

**Handling Null Values**

In [None]:
# Null counts before cleaning. Printed explicitly: a bare expression in the
# middle of a cell is evaluated and thrown away - only the last expression of
# a cell is displayed.
print(merged_data.isnull().sum())

# Drop unwanted columns
# Horse Weight in three diff formats (kg, st, lb)
# Distance in two diff formats (m f, meter)
redundant_columns = ['weightLb', 'weightSt', 'father', 'mother', 'gfather', 'distance']

# Drop columns with >80% missing values
sparse_columns = ['currency', 'overWeight', 'outHandicap', 'headGear', 'price']

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not fail with a KeyError.
columns_to_drop = redundant_columns + sparse_columns
merged_data = merged_data.drop(columns=columns_to_drop, errors='ignore')

# Drop the rows that have a null in any of the columns with <2% missing values
merged_data = merged_data.dropna(subset=['date', 'title', 'trainerName', 'jockeyName', 'age', 'condition'])

# Null counts after cleaning (displayed as the cell output).
merged_data.isnull().sum()

**Encoding Categorical Variables**

In [None]:
# Columns for Label Encoding
from sklearn.preprocessing import LabelEncoder
label_columns = ['positionL', 'dist', 'rclass', 'ages', 'condition', 'countryCode', 'ncond']
# One encoder object is re-fitted for every column, so after the loop `model`
# only holds the mapping of the last column in label_columns.
model = LabelEncoder()
for col in label_columns:
    # Convert the column to string type before encoding
    # (missing values are stringified as well and therefore get a label of
    # their own instead of staying null - NOTE(review): confirm this is intended)
    merged_data[col] = merged_data[col].astype(str)
    merged_data[col] = model.fit_transform(merged_data[col])

# Columns for Frequency Encoding
# Each value is replaced by the number of rows in which it occurs.
# NOTE(review): this overwrites 'horseName', 'trainerName', 'jockeyName' and
# 'course' with their occurrence counts, and the feature-engineering cells
# below group by exactly these columns. Two different horses (trainers, ...)
# that happen to have the same row count then fall into the same group.
# Consider computing the per-entity aggregates before this step, or writing
# the counts to separate columns.
freq_columns = ['course', 'band', 'hurdles', 'time', 'title', 'prizes',
                'horseName', 'trainerName', 'jockeyName']
for col in freq_columns:
    freq_map = merged_data[col].value_counts().to_dict()
    merged_data[col] = merged_data[col].map(freq_map)

# Print the dtypes after encoding.
merged_data.info()

# **Feature Engineering**

In [None]:
# Per-'horseName' aggregates: number of rows, summed wins/places, mean position,
# plus the win and place rates derived from them.
# NOTE(review): the aggregates are computed over every row of merged_data, so
# each row's own res_win / position contributes to the features attached to it.
historical_metrics = (
    merged_data
    .groupby('horseName')
    .agg(
        total_races=('rid', 'count'),
        total_wins=('res_win', 'sum'),
        total_places=('res_place', 'sum'),
        avg_position=('position', 'mean'),
    )
    .reset_index()
    .assign(
        win_rate=lambda m: m['total_wins'] / m['total_races'],
        place_rate=lambda m: m['total_places'] / m['total_races'],
    )
)

# Attach the aggregates to every row of the matching 'horseName'.
merged_data = pd.merge(merged_data, historical_metrics, on='horseName', how='inner')

In [None]:
# Per-'trainerName' row count and summed wins.
trainer_metrics = (
    merged_data
    .groupby('trainerName')
    .agg(trainer_races=('rid', 'count'), trainer_wins=('res_win', 'sum'))
    .reset_index()
)

# Share of a trainer's rows that are wins.
trainer_metrics['trainer_success_rate'] = trainer_metrics['trainer_wins'].div(trainer_metrics['trainer_races'])

# Only the rate (not the raw counts) is joined back onto the main frame.
trainer_rate = trainer_metrics[['trainerName', 'trainer_success_rate']]
merged_data = pd.merge(merged_data, trainer_rate, on='trainerName', how='inner')

In [None]:
# Per-'jockeyName' row count and summed wins.
jockey_metrics = (
    merged_data
    .groupby('jockeyName')
    .agg(jockey_races=('rid', 'count'), jockey_wins=('res_win', 'sum'))
    .reset_index()
)

# Share of a jockey's rows that are wins.
jockey_metrics['jockey_success_rate'] = jockey_metrics['jockey_wins'].div(jockey_metrics['jockey_races'])

# Only the rate (not the raw counts) is joined back onto the main frame.
jockey_rate = jockey_metrics[['jockeyName', 'jockey_success_rate']]
merged_data = pd.merge(merged_data, jockey_rate, on='jockeyName', how='inner')

In [None]:
# Mean position and mean res_win for every ('horseName', 'course') pair.
track_keys = ['horseName', 'course']
track_performance = (
    merged_data
    .groupby(track_keys)
    .agg(
        avg_position_on_track=('position', 'mean'),
        win_rate_on_track=('res_win', 'mean'),
    )
    .reset_index()
)

# Join the pair-level aggregates back onto every matching row.
merged_data = pd.merge(merged_data, track_performance, on=track_keys, how='inner')

# **Exploratory Data Analysis (EDA)**

**Descriptive Statistics**

In [None]:
# Numeric columns used for the summary statistics: raw race/horse fields first,
# then the engineered aggregate features.
numerical_columns = [
    'prize', 'metric', 'winningTime', 'age', 'saddle', 'decimalPrice',
    'RPR', 'TR', 'OR', 'runners', 'margin', 'weight',
    'total_races', 'total_wins', 'total_places', 'avg_position',
    'win_rate', 'place_rate',
    'trainer_success_rate', 'jockey_success_rate',
    'avg_position_on_track', 'win_rate_on_track',
]

# One row per column, one column per statistic.
merged_data[numerical_columns].describe().transpose()

**Visualization**

In [None]:
# Distribution plots - Before Treatment
print('Before Treatment:')
def visualize_plots(df, columns):
    """Show a boxplot, a histogram with KDE and a violin plot of one column.

    The three plots are drawn side by side in a single 20x5 figure, which is
    then shown.

    Args:
        df: DataFrame holding the data to plot.
        columns: Name of the column handed to seaborn as the x variable; it is
            also used in the three plot titles.
    """
    _, (box_ax, hist_ax, violin_ax) = plt.subplots(1, 3, figsize=(20, 5))

    # Boxplot (left panel)
    sns.boxplot(data=df, x=columns, ax=box_ax)
    box_ax.set_title(f'Boxplot of {columns}')

    # Distribution Plot (middle panel)
    sns.histplot(data=df, x=columns, kde=True, bins=50, ax=hist_ax)
    hist_ax.set_title(f'Distribution Plot of {columns}')

    # Violin Plot (right panel)
    sns.violinplot(data=df, x=columns, ax=violin_ax)
    violin_ax.set_title(f'Violin Plot of {columns}')

    plt.show()

# Plot every numeric feature before any treatment. `numerical_columns` (defined
# in the descriptive-statistics cell) holds exactly the columns that used to be
# spelled out here a second time.
for col in numerical_columns:
    visualize_plots(merged_data, col)


# Skewness Handling
# log1p(x) = log(1 + x) stays finite for a prize of 0, where a plain log would
# produce -inf.
merged_data['prize_log'] = np.log1p(merged_data['prize'])

# Outliers Detection and Cap the Outliers
def detect_outliers_iqr(df, columns):
    """Report and cap the IQR outliers of one column, modifying ``df`` in place.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` count as
    outliers. Their number is printed, then they are replaced by the bound
    they exceed.

    Args:
        df: DataFrame to modify.
        columns: Name of the column to inspect and cap.

    Returns:
        None. ``df[columns]`` is overwritten with the capped values.
    """
    values = df[columns]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    floor = q1 - 1.5 * spread
    ceiling = q3 + 1.5 * spread

    too_low = values < floor
    too_high = values > ceiling

    outlier_rows = df[too_low | too_high].index
    print(f'{columns} outliers: {len(outlier_rows)}')

    # floor <= ceiling, so a value raised to the floor can never exceed the
    # ceiling: capping both sides in one pass equals capping them in sequence.
    df[columns] = np.where(too_low, floor, np.where(too_high, ceiling, values))

# Cap the IQR outliers of every numeric feature in place. `numerical_columns`
# (defined in the descriptive-statistics cell) holds exactly the columns that
# used to be spelled out here again.
for col in numerical_columns:
    detect_outliers_iqr(merged_data, col)


# Distribution plots - After Treatment
# visualize_plots() is already defined earlier in this cell; the byte-identical
# second definition that used to sit here only shadowed it, so it is removed
# and the existing function is reused.
print('After Treatment:')

# Same columns as in the "before" plots (`numerical_columns` comes from the
# descriptive-statistics cell) so both sets of figures can be compared 1:1.
for col in numerical_columns:
    visualize_plots(merged_data, col)

**Correlation Analysis**

In [None]:
# Correlation matrix
# Pairwise correlations between the numeric raw and engineered features.
corr_features = [
    'prize', 'metric', 'winningTime', 'age', 'saddle', 'decimalPrice',
    'RPR', 'TR', 'OR', 'runners', 'margin', 'weight',
    'total_races', 'total_wins', 'total_places', 'avg_position',
    'win_rate', 'place_rate', 'trainer_success_rate', 'jockey_success_rate',
    'avg_position_on_track', 'win_rate_on_track',
]
corr_matrix = merged_data[corr_features].corr()

# Annotated heatmap of the matrix, values shown with two decimals.
_, heatmap_ax = plt.subplots(figsize=(20, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# **Modeling Approach**

In [None]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load Preprocessed Dataset
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/My Drive/horse_race_prediction_data.csv')

# Work on a reproducible random sample of at most 200,000 rows.
# DataFrame.sample raises a ValueError when n is larger than the number of
# rows (sampling without replacement), so n is capped at len(df).
sample_df = df.sample(n=min(200000, len(df)), random_state=42)

In [None]:
# Drop Unnecessary Columns
# These six columns are removed from the sample before the features are built.
drop_columns = ['rid', 'title', 'date', 'time', 'positionL', 'dist']
sample_df = sample_df.drop(drop_columns, axis=1)

In [None]:
# Target: 'res_win'. Features: every remaining column except the two result
# columns ('res_win' and 'res_place').
y = sample_df['res_win']
X = sample_df.drop(['res_win', 'res_place'], axis=1)

In [None]:
# Oversample with SMOTE (fixed seed); X_new / y_new are the resampled
# features and target.
# NOTE(review): this resamples the complete data set. If X_new / y_new are
# split into train and test afterwards, synthetic points interpolated from
# rows that end up in the test set are also present in the training set, which
# inflates the test metrics - prefer resampling the training fold only.
smote = SMOTE(random_state=42)
X_new, y_new = smote.fit_resample(X, y)

In [None]:
# Split Data
# Split the ORIGINAL data first and oversample the training fold only.
# Splitting the already-resampled X_new / y_new (as done before) put synthetic
# SMOTE points - interpolated from rows on both sides of the split - into the
# test set, so the test metrics were measured on leaked, artificially balanced
# data. X_new / y_new from the previous cell are intentionally not used here.
# stratify=y keeps the real class ratio identical in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training fold; the test fold keeps its real
# (imbalanced) distribution.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Choosing Best Model - Balanced Data
#
# Every estimator gets a fixed random_state so that a Restart & Run All
# reproduces the same scores; without it the tree-based models change from run
# to run.
models = [LogisticRegression(random_state=42),
          DecisionTreeClassifier(random_state=42),
          RandomForestClassifier(random_state=42),
          XGBClassifier(random_state=42),
          ExtraTreesClassifier(random_state=42)]

# Fit each candidate on the training fold and print accuracy / precision /
# recall / F1 for both folds. After the loop `model`, `train_pred` and
# `test_pred` refer to the LAST entry of `models` only.
for model in models:

    model.fit(X_train,y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print(f"*********{type(model).__name__}*********")
    print(f"Train Accuracy: {accuracy_score(y_train,train_pred)}")
    print(f"Train Precision: {precision_score(y_train,train_pred)}")
    print(f"Train Recall: {recall_score(y_train,train_pred)}")
    print(f"Train F1: {f1_score(y_train,train_pred)}")

    print(f"Test Accuracy: {accuracy_score(y_test,test_pred)}")
    print(f"Test Precision: {precision_score(y_test,test_pred)}")
    print(f"Test Recall: {recall_score(y_test,test_pred)}")
    print(f"Test F1: {f1_score(y_test,test_pred)} \n \n")

In [None]:
# RandomForestClassifier - Easier Interpretability
# (the estimator tuned here is a classifier, not a regressor)

# Finding best parameters
# Fixed seed so the grid search result is reproducible.
model = RandomForestClassifier(random_state=42)
params = {
    'n_estimators':[100, 150, 200],
    'max_features':['sqrt', 'log2', None],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# Exhaustive search over all 3*3*3*3 = 81 combinations with 5-fold
# cross-validation, using all available cores.
cv = GridSearchCV(model,params, n_jobs=-1, cv=5)
cv.fit(X_train,y_train)

In [None]:
# Parameter combination selected by the grid search.
cv.best_params_

In [None]:
# Cross-validated score of the selected parameter combination.
cv.best_score_

In [None]:
# ROC curve and AUC score for the tuned model, test fold first, then train.
#
# The scores come from the grid search's refitted best estimator. The old code
# used `test_pred` / `train_pred`, which are left over from the model
# comparison loop (i.e. they belong to the last model of that loop, not to the
# tuned one) and are hard 0/1 labels - a ROC curve built from labels has a
# single operating point. roc_curve / roc_auc_score need a continuous score,
# here the predicted probability of the positive class.
roc_inputs = [
    ("Receiver Operating Characteristic (ROC) Curve- Test Data", X_test, y_test),
    ("Receiver Operating Characteristic (ROC) Curve - Train Data", X_train, y_train),
]

for roc_title, roc_features, roc_labels in roc_inputs:
    positive_scores = cv.predict_proba(roc_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(roc_labels, positive_scores)
    auc_score = roc_auc_score(roc_labels, positive_scores)

    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})")
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random performance
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(roc_title)
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Confusion matrices of the tuned model (the grid search's refitted best
# estimator). The old code used `train_pred` / `test_pred`, which are left
# over from the model comparison loop and therefore describe the last model of
# that loop rather than the tuned one.

# Confusion matrix - Train
conf_matrix = confusion_matrix(y_train, cv.predict(X_train))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Train')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Confusion matrix - Test
conf_matrix = confusion_matrix(y_test, cv.predict(X_test))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Test')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()