In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [2]:
# Load the source workbook into `dff`; the cells below all start from this frame.
workbook_path = "Horse_List_10_Jul_2025.xlsx"
dff = pd.read_excel(workbook_path)

In [4]:
# Gauge label imbalance for 'Win Today': class shares first, then raw counts.
win_today = dff['Win Today']
for as_share in (True, False):
    print(win_today.value_counts(normalize=as_share))


Win Today
0    0.896126
1    0.103874
Name: proportion, dtype: float64
Win Today
0    13786
1     1598
Name: count, dtype: int64


In [6]:
# Preview the first five rows (head() default) to eyeball the raw columns.
dff.head()

Unnamed: 0,Horse Name,Date,CVS,Unnamed: 3,Unnamed: 4,Distance,Dr,G,Sectional Time,Unnamed: 9,...,Place Today,Unplace Today,WIN NEXT START,WIN 2nd START,WIN 3rd START,WIN 4th START,PLACE NEXT START,PLACE 2nd START,PLACE 3rd START,PLACE 4th START
0,A AMERIC TE SPECSO (H037),06-Nov-2022,4 S / C+3,,,1200,3.0,G,4,2,...,0,1,0,0,0,0,0,0,1,0
1,A AMERIC TE SPECSO (H037),20-Nov-2022,4 S / B+2,,,1400,6.0,GF,3,2,...,0,1,0,0,0,0,0,1,0,0
2,A AMERIC TE SPECSO (H037),21-Dec-2022,4 H / C,,,1200,4.0,GF,6,5,...,0,1,0,0,0,1,1,0,0,1
3,A AMERIC TE SPECSO (H037),08-Jan-2023,4 S / C+3,,,1200,10.0,G,9,8,...,1,0,0,0,1,0,0,0,1,0
4,A AMERIC TE SPECSO (H037),24-Jan-2023,4 S / C,EP,,1200,5.0,G,4,2,...,0,1,0,1,0,0,0,1,0,0


In [None]:

# Take the trailing 11 columns by position. A later cell treats these as the
# prediction targets, so this relies on the workbook's column order.
trailing_width = 11
df_last11 = dff.iloc[:, -trailing_width:]

# Written to disk because a later cell reads this file back to recover the
# target column names.
df_last11.to_csv("target_columns.csv", index=False)

In [17]:
# Keep numeric columns only. This rebinds `dff`, so non-numeric columns are no
# longer available from this point on.
dff = dff.select_dtypes(include=['number'])

In [24]:
# Discard columns in which more than half of the values are missing.
max_missing_fraction = 0.5
threshold = max_missing_fraction * len(dff)  # largest tolerated number of missing cells
min_non_null = len(dff) - threshold  # a column survives with at least this many values
dff_cleaned = dff.dropna(axis="columns", thresh=min_non_null)

In [27]:
# Remove the 'SPW' column from the working frame.
# NOTE(review): the reason for excluding 'SPW' is not recorded here — confirm.
# errors="ignore" keeps this cell idempotent: it rebinds dff_cleaned from
# itself, so a second run would otherwise raise KeyError because the column is
# already gone.
dff_cleaned = dff_cleaned.drop(columns=['SPW'], errors='ignore')

In [31]:
# Show (rows, columns) before the row-wise dropna in the next cell.
dff_cleaned.shape

(15384, 318)

In [32]:
# Keep only rows that are complete in every remaining column.
dff_cleaned = dff_cleaned.dropna(axis="index", how="any")

# Report how many rows survived and verify that no gaps are left.
remaining_rows = len(dff_cleaned)
leftover_missing = dff_cleaned.isna().sum().sum()
print("Remaining rows after dropping missing values:", remaining_rows)
print("Total missing values after dropping rows:", leftover_missing)

Remaining rows after dropping missing values: 12869
Total missing values after dropping rows: 0


In [33]:
# Show (rows, columns) after rows with missing values were dropped.
dff_cleaned.shape

(12869, 318)

In [34]:
# Recover the target column names from the CSV written earlier; only the
# header of that file is used to build target_cols.
target_df = pd.read_csv("target_columns.csv")
target_cols = list(target_df.columns)

In [39]:
# Split the cleaned frame: the columns named in target_cols become the labels,
# every other column is a feature.
y = dff_cleaned[target_cols]
X = dff_cleaned.drop(labels=target_cols, axis="columns")

In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is row-wise and random, so rows belonging to the same
# horse can land on both sides — confirm that is acceptable for this analysis.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [45]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
# Per-target result stores, keyed by target column name and filled by the
# training loop: one for coefficients, one for predicted probabilities.
model_coefficients, model_predictions = {}, {}

In [48]:
# Fit one class-weighted logistic regression per target column and record
# its coefficients and its positive-class probabilities on the test split.
feature_names = X.columns
for label in y.columns:
    clf = LogisticRegression(class_weight='balanced', max_iter=1000)
    clf.fit(X_train_scaled, y_train[label])

    # coef_ has a single row for a binary target; label it with feature names.
    model_coefficients[label] = pd.Series(clf.coef_[0], index=feature_names)

    # Column 1 of predict_proba is the probability of the second class.
    preds = clf.predict_proba(X_test_scaled)[:, 1]
    model_predictions[label] = preds

# Print the ten largest coefficients per target, ranked by magnitude.
# The printed values are absolute values, so the sign is not shown.
for label, weights in model_coefficients.items():
    ranked_magnitudes = weights.abs().sort_values(ascending=False)
    print(f"\n🧠 Top 10 coefficients for '{label}':")
    print(ranked_magnitudes.head(10))


🧠 Top 10 coefficients for 'Win Today':
Unnamed: 12                      5.161784
H600 => WINNER                   3.440350
H200 => WINNER                   3.193039
Unnamed: 10                      2.570758
Sectional Time                   2.072077
Unnamed: 11                      1.674115
Winner by 2 length               1.542904
Make up position Stretch >= 9    1.463144
Unnamed: 9                       1.269941
Early Profile                    1.256747
dtype: float64

🧠 Top 10 coefficients for 'Place Today':
Unnamed: 12                       10.985554
Unnamed: 10                        5.462039
Make up position Stretch <= -3     2.307830
Make up position Stretch >= 9      2.119924
Make up position Stretch <= -4     2.083191
Sectional Time                     1.989502
Make up position Stretch >= 8      1.536009
Make up position Stretch >= 7      1.449316
Make up position Stretch <= -2     1.390233
Make up position Stretch >= 6      1.380826
dtype: float64

🧠 Top 10 coefficients for '

In [50]:
# Build a feature x target table of the strongest coefficients and save it.
#
# Rows: every feature that ranks in the top 30 by |coefficient| for at least
# one target. Columns: one per target. A cell holds the signed coefficient when
# the feature is in THAT target's top 30, and NaN otherwise.

# Rank once per target so the row set and the cell values come from the same lists.
top_30_by_target = {
    target: coeffs.abs().sort_values(ascending=False).head(30).index
    for target, coeffs in model_coefficients.items()
}

# Union of all per-target top-30 feature names -> row labels of the matrix.
all_features = set()
for top_30 in top_30_by_target.values():
    all_features.update(top_30)
all_features = list(all_features)

# Initialize a DataFrame with all features as rows
top_30_matrix = pd.DataFrame(index=all_features)

for target, coeffs in model_coefficients.items():
    top_30 = top_30_by_target[target]
    # Restrict to this target's own top 30 BEFORE aligning to the row set.
    # Indexing coeffs with the full row set would fill every cell (each row is
    # a real feature), so no NaN would ever mark "not in this target's top 30".
    top_30_matrix[target] = coeffs.loc[top_30].reindex(top_30_matrix.index)

# Alphabetical row order for a stable, diff-friendly file.
top_30_matrix = top_30_matrix.sort_index()

# Save to CSV
top_30_matrix.to_csv("top_30_coefficients.csv")

In [51]:
from sklearn.metrics import accuracy_score

# Test-set accuracy per target, keyed by target column name.
# NOTE(review): the labels are imbalanced (about 10% positives for 'Win Today'),
# so plain accuracy should be read alongside the per-class report.
model_accuracies = {}

for label in y.columns:
    # Same estimator settings as the coefficient loop; the model is refit here.
    clf = LogisticRegression(class_weight='balanced', max_iter=1000)
    clf.fit(X_train_scaled, y_train[label])

    predicted = clf.predict(X_test_scaled)
    acc = accuracy_score(y_test[label], predicted)
    model_accuracies[label] = acc

# One row per target, best score first.
accuracy_df = pd.DataFrame.from_dict(
    model_accuracies,
    orient='index',
    columns=['Accuracy'],
)
ranked_accuracy = accuracy_df.sort_values(by='Accuracy', ascending=False)
print(ranked_accuracy)


                  Accuracy
Place Today       0.995726
Unplace Today     0.995726
Win Today         0.994172
WIN NEXT START    0.626263
WIN 3rd START     0.620824
WIN 4th START     0.613442
PLACE NEXT START  0.608003
WIN 2nd START     0.604895
PLACE 3rd START   0.591686
PLACE 2nd START   0.590909
PLACE 4th START   0.581974


In [52]:
from sklearn.metrics import classification_report

# Print precision / recall / F1 per class for every target, refitting the
# same class-weighted logistic regression each time.
separator = "=" * 50
for target in y.columns:
    lr = LogisticRegression(class_weight='balanced', max_iter=1000)
    lr.fit(X_train_scaled, y_train[target])
    y_pred = lr.predict(X_test_scaled)
    report = classification_report(y_test[target], y_pred)
    # One print call emitting header, blank line, report and rule in that order.
    print(f"Classification report for {target}:\n", report, separator, sep="\n")


Classification report for Win Today:

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2305
           1       0.95      1.00      0.97       269

    accuracy                           0.99      2574
   macro avg       0.97      1.00      0.98      2574
weighted avg       0.99      0.99      0.99      2574

Classification report for Place Today:

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1796
           1       0.99      1.00      0.99       778

    accuracy                           1.00      2574
   macro avg       0.99      1.00      0.99      2574
weighted avg       1.00      1.00      1.00      2574

Classification report for Unplace Today:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       778
           1       1.00      0.99      1.00      1796

    accuracy                           1.00      2574
   macro