In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# === Step 1: Load Data ===
df = pd.read_excel("Horse_List_10_Jul_2025.xlsx")

# Extract target columns (last 11)
target_df = df.iloc[:, -11:]
target_df.to_csv("target_columns.csv", index=False)
target_cols = target_df.columns.tolist()

# Keep only numerical features
df = df.select_dtypes(include='number')

# Drop columns with more than 50% missing values
threshold = 0.5 * len(df)
df = df.dropna(axis=1, thresh=len(df) - threshold)

# Drop specific unwanted columns
if 'SPW' in df.columns:
    df = df.drop(columns=['SPW'])

# Drop rows with any missing values
df = df.dropna()

print("Remaining rows after dropping missing values:", len(df))
print("Total missing values:", df.isna().sum().sum())

Remaining rows after dropping missing values: 12869
Total missing values: 0


In [4]:
X = df.drop(columns=target_cols)
y = df[target_cols]

In [9]:
# Sanitize column names to remove invalid characters for XGBoost
X.columns = [col.replace('[', '')
                  .replace(']', '')
                  .replace('<', '')
                  .replace('>', '')
                  .replace(' ', '_') for col in X.columns]

# Sanitize feature column names
X.columns = [col.replace('[', '')
                  .replace(']', '')
                  .replace('<', '')
                  .replace('>', '')
                  .replace(' ', '_') for col in X.columns]


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
logreg_accuracy = {}
xgb_accuracy = {}

logreg_reports = {}
xgb_reports = {}


In [12]:
for target in target_cols:
    print(f"Training for target: {target}")

    # --- Logistic Regression ---
    logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
    logreg.fit(X_train_scaled, y_train[target])
    log_preds = logreg.predict(X_test_scaled)

    acc_log = accuracy_score(y_test[target], log_preds)
    logreg_accuracy[target] = acc_log
    logreg_reports[target] = classification_report(y_test[target], log_preds, output_dict=True)

    # --- XGBoost ---
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb.fit(X_train, y_train[target])  # No scaling needed
    xgb_preds = xgb.predict(X_test)

    acc_xgb = accuracy_score(y_test[target], xgb_preds)
    xgb_accuracy[target] = acc_xgb
    xgb_reports[target] = classification_report(y_test[target], xgb_preds, output_dict=True)

# === Step 5: Output Accuracies ===
accuracy_df = pd.DataFrame({
    'Logistic Regression': logreg_accuracy,
    'XGBoost': xgb_accuracy
})
accuracy_df.to_csv("model_accuracies.csv")
print("\n✅ Accuracy comparison saved to 'model_accuracies.csv'")

# === Step 6: Save Classification Reports (Optional) ===
def flatten_report(report_dict, model_name):
    rows = []
    for target, metrics in report_dict.items():
        for cls in ['0', '1']:
            row = {
                "Model": model_name,
                "Target": target,
                "Class": cls,
                "Precision": metrics[cls]["precision"],
                "Recall": metrics[cls]["recall"],
                "F1-Score": metrics[cls]["f1-score"],
                "Support": metrics[cls]["support"]
            }
            rows.append(row)
    return pd.DataFrame(rows)

df_log = flatten_report(logreg_reports, "LogisticRegression")
df_xgb = flatten_report(xgb_reports, "XGBoost")

final_report = pd.concat([df_log, df_xgb], ignore_index=True)
final_report.to_csv("classification_reports.csv", index=False)
print("📄 Classification reports saved to 'classification_reports.csv'")


Training for target: Win Today


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: Place Today


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: Unplace Today


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: WIN NEXT START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: WIN 2nd START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: WIN 3rd START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: WIN 4th START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: PLACE NEXT START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: PLACE 2nd START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: PLACE 3rd START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training for target: PLACE 4th START


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Accuracy comparison saved to 'model_accuracies.csv'
📄 Classification reports saved to 'classification_reports.csv'
