In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_excel("Horse_List_10_Jul_2025.xlsx")

In [3]:
targets = {
    'Win 1': 'WIN NEXT START',
    'Place 1': 'PLACE NEXT START',
    'Win 2': 'WIN 2nd START',
    'Place 2': 'PLACE 2nd START',
    'Win 3': 'WIN 3rd START',
    'Place 3': 'PLACE 3rd START',
    'Win 4': 'WIN 4th START',
    'Place 4': 'PLACE 4th START'
}

color_coded_features = {
    'WIN NEXT START': [...],
    'PLACE NEXT START': [...]
}

with pd.ExcelWriter("win_place_targets_color_coded.xlsx") as writer:
    for sheet_name, target_col in targets.items():
        features = color_coded_features.get(target_col, [])
        cols_to_include = features + [target_col]
        sheet_df = df[cols_to_include].copy()
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

# Blue-coded features and their related targets
blue_target_cols = ['WIN NEXT START', 'PLACE NEXT START']
blue_features = [...]  # fill this with the actual feature names marked in blue

df_blue = df[blue_features + blue_target_cols]
df_blue.to_excel("blue_features_win_place_today.xlsx", index=False)

IndexError: At least one sheet must be visible

In [49]:
target= [
    'WIN NEXT START',
    'WIN 2nd START',
    'WIN 3rd START',
    'PLACE NEXT START',
    'PLACE 2nd START',
    'PLACE 3rd START',
    'WIN 4th START',
    'PLACE 4th START'
]

# Only keep binary columns that are NOT in any target column
binary_cols = [
    col for col in df.columns
    if col not in target and df[col].dropna().isin([0, 1]).all()
]

In [50]:
df[binary_cols] = df[binary_cols].fillna(0)
df[target] = df[target].fillna(0)


In [51]:
for t in target:
    print(f"\n=== Training model for: {t} ===")
    
    # Drop rows where target is missing
    df_model = df.dropna(subset=[t]).copy()
    
    # Fill NaNs in the target column (if any still remain, though unlikely now)
    df_model[t] = df_model[t].fillna(0)
    
    # Split X and y
    X = df_model[binary_cols]
    y = df_model[t]

    # Skip target if it's only one class (e.g., all 0s)
    if y.nunique() < 2:
        print(f"Skipping {t} due to lack of class variation.")
        continue

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train logistic regression with class_weight
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"Accuracy: {acc:.3f}")
    print(f"AUC: {auc:.3f}")

    # Top 30 coefficients
    coef_df = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': model.coef_[0]
    })
    coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
    top_30 = coef_df.sort_values(by='Abs_Coefficient', ascending=False).head(30)

    # Save to CSV
    filename = f"top_30_coefficients_{t.replace(' ', '_')}.csv"
    top_30.to_csv(filename, index=False)
    print(f"Saved top 30 features to: {filename}")


=== Training model for: WIN NEXT START ===


Accuracy: 0.642
AUC: 0.673
Saved top 30 features to: top_30_coefficients_WIN_NEXT_START.csv

=== Training model for: WIN 2nd START ===
Accuracy: 0.622
AUC: 0.613
Saved top 30 features to: top_30_coefficients_WIN_2nd_START.csv

=== Training model for: WIN 3rd START ===
Accuracy: 0.598
AUC: 0.617
Saved top 30 features to: top_30_coefficients_WIN_3rd_START.csv

=== Training model for: PLACE NEXT START ===
Accuracy: 0.618
AUC: 0.662
Saved top 30 features to: top_30_coefficients_PLACE_NEXT_START.csv

=== Training model for: PLACE 2nd START ===
Accuracy: 0.585
AUC: 0.626
Saved top 30 features to: top_30_coefficients_PLACE_2nd_START.csv

=== Training model for: PLACE 3rd START ===
Accuracy: 0.589
AUC: 0.621
Saved top 30 features to: top_30_coefficients_PLACE_3rd_START.csv

=== Training model for: WIN 4th START ===
Accuracy: 0.604
AUC: 0.576
Saved top 30 features to: top_30_coefficients_WIN_4th_START.csv

=== Training model for: PLACE 4th START ===
Accuracy: 0.591
AUC: 0.612
Saved top 30 featu