# Auto-Gluon Notebook

This notebook loads one season of draft data and uses it to train models that predict whether a player will be drafted and, if so, at which overall pick. 

This notebook is purely exploratory and does not warrant a detailed review. AutoGluon trains and compares multiple models in order to identify which perform best. The results are written to the `models` folder with the prefix "ag".

A formal write-up of model performance and testing will be provided in `models/AutogluonModels/readme.md`.

Developer: Kevin Kao 



In [None]:
import pandas as pd

# Read the draft dataset from its Feather file into a DataFrame.
df = pd.read_feather(path="draft_data.feather")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error
import autogluon.tabular as ag

# Load the data. Convention used throughout this cell: overall > 0 means the
# player was drafted.
df = pd.read_feather("draft_data.feather")
df["is_drafted"] = (df["overall"] > 0).astype(int)  # Create binary target

# Determine features (exclude identifiers and the regression target).
# "is_drafted" deliberately stays in feature_cols: the classifier below is given
# a single training frame and locates its label column by name.
exclude_cols = ["player_id", "name", "overall"]
feature_cols = [col for col in df.columns if col not in exclude_cols]
# Take an explicit copy so the dtype conversion below writes into X itself
# instead of into a selection of df (avoids chained-assignment warnings).
X = df[feature_cols].copy()
y_reg = df["overall"]  # For draft position prediction

# Convert categorical columns to string for AutoGluon
for col in X.select_dtypes(["category", "object"]).columns:
    X[col] = X[col].astype(str)

# Split data once; the regression split further down is derived from this one.
X_train, X_test, y_reg_train, y_reg_test = train_test_split(
    X, y_reg, test_size=0.2, random_state=42, stratify=X["is_drafted"]
)

# Train classification model (will player be drafted?)
predictor_clf = ag.TabularPredictor(
    label="is_drafted", problem_type="binary", eval_metric="roc_auc"
).fit(
    train_data=X_train,
    time_limit=600,  # 10 minutes time limit
    presets="best_quality",  # Use high-quality models
)

# Create regression data for drafted players only
drafted_df = df[df["overall"] > 0].copy()
drafted_X = drafted_df[feature_cols].drop(columns=["is_drafted"])
drafted_y = drafted_df["overall"]

# Train regression model if we have drafted players
if not drafted_df.empty:
    # Convert categorical columns to string
    for col in drafted_X.select_dtypes(["category", "object"]).columns:
        drafted_X[col] = drafted_X[col].astype(str)

    # Reuse the classifier's train/test partition rather than drawing a second,
    # independent split. Otherwise rows held out in X_test could end up in the
    # regressor's training data, and any evaluation of the combined pipeline on
    # X_test would be optimistic.
    train_drafted = (X_train["is_drafted"] == 1).to_numpy()
    test_drafted = (X_test["is_drafted"] == 1).to_numpy()
    drafted_X_train = X_train.loc[train_drafted].drop(columns=["is_drafted"])
    drafted_X_test = X_test.loc[test_drafted].drop(columns=["is_drafted"])
    drafted_y_train = y_reg_train.loc[train_drafted]
    drafted_y_test = y_reg_test.loc[test_drafted]

    # Train regression model; the label column is appended to the features
    # because the predictor takes one frame and finds the label by name.
    predictor_reg = ag.TabularPredictor(
        label="overall", problem_type="regression", eval_metric="rmse"
    ).fit(
        train_data=pd.concat([drafted_X_train, drafted_y_train], axis=1),
        time_limit=600,
        presets="best_quality",
    )
else:
    # No drafted players in the data: there is nothing to fit a regressor on.
    predictor_reg = None
# Define the predict_player_draft function


# Helpers and entry point for predicting a single player's draft outcome
def _first_value(values):
    """Return the first element of a pandas object or a plain sequence/array."""
    if hasattr(values, "iloc"):
        # Positional access: does not depend on the index labels of the result.
        return values.iloc[0]
    return values[0]


def _positive_class_probability(probas):
    """Return the first row's positive-class probability from predict_proba output.

    Handles a DataFrame (one column per class), a 1-D Series/array of
    positive-class probabilities, and a 2-D array. When two or more class
    columns are present, the second column is taken as the positive class.
    """
    if getattr(probas, "ndim", None) == 1:
        return _first_value(probas)
    if isinstance(probas, pd.DataFrame):
        # probas[0][1] on a DataFrame would mean "column label 0, row label 1"
        # and raise KeyError on a one-row frame; index by position instead.
        column = 1 if probas.shape[1] >= 2 else 0
        return probas.iloc[0, column]
    if probas.shape[1] >= 2:
        return probas[0][1]
    return probas[0][0]  # Only one probability available


def predict_player_draft(player_data, clf_model, reg_model, categorical_columns):
    """Predict whether one player is drafted and, if so, at which position.

    Args:
        player_data: Mapping of feature name to value for a single player.
        clf_model: Fitted classifier exposing ``predict`` and ``predict_proba``;
            a predicted label of 1 means "drafted".
        reg_model: Fitted regressor exposing ``predict``, or ``None`` when no
            regression model is available.
        categorical_columns: Names of columns to cast to ``str`` before
            prediction, mirroring the preprocessing applied during training.

    Returns:
        A dict with keys ``will_be_drafted`` (bool), ``draft_probability``
        (float), ``draft_position`` (number, or ``None`` when the player is not
        predicted to be drafted or no regressor was supplied; 0 when the
        regressor raised) and ``feature_importance`` (dict, possibly empty).
    """
    # Create a single-row DataFrame with the player data
    player_frame = pd.DataFrame([player_data])

    # Apply the same preprocessing as during training
    for col in categorical_columns:
        if col in player_frame.columns:
            player_frame[col] = player_frame[col].astype(str)

    # Reset index to ensure consistent indexing
    player_frame = player_frame.reset_index(drop=True)

    # Predict draft status once and reuse it below
    will_be_drafted = _first_value(clf_model.predict(player_frame)) == 1

    # Best effort: if probabilities are unavailable, degrade to a hard 0/1
    # derived from the predicted label instead of failing the whole call.
    try:
        draft_prob = _positive_class_probability(
            clf_model.predict_proba(player_frame)
        )
    except Exception as e:
        print(f"Error in prediction: {e}")
        draft_prob = 1.0 if will_be_drafted else 0.0

    # Predict draft position only if player is predicted to be drafted
    draft_position = None
    if will_be_drafted and reg_model is not None:
        try:
            draft_position = _first_value(reg_model.predict(player_frame))
        except Exception as e:
            print(f"Error predicting position: {e}")
            draft_position = 0  # Default value signalling a failed prediction

    # Collect model-level feature importances when the classifier exposes the
    # scikit-learn style ``feature_importances_`` attribute. These are global
    # to the model, not specific to this player.
    # NOTE(review): AutoGluon's TabularPredictor appears to expose a
    # ``feature_importance(data)`` method rather than this attribute, so this
    # dict is expected to stay empty for AutoGluon models - confirm.
    feature_importance = {}
    try:
        if hasattr(clf_model, "feature_importances_"):
            importances = clf_model.feature_importances_
            for i, col in enumerate(player_frame.columns):
                feature_importance[col] = importances[i]
    except Exception as e:
        print(f"Error getting feature importance: {e}")

    return {
        "will_be_drafted": will_be_drafted,
        "draft_probability": draft_prob,
        "draft_position": draft_position,
        "feature_importance": feature_importance,
    }


# Example usage
# Derive the categorical columns with the same dtype selection the training
# code applies, so prediction-time preprocessing cannot drift from training
# through a hand-maintained list.
categorical_columns = list(
    df[feature_cols].select_dtypes(["category", "object"]).columns
)

# Use a held-out row; drop the label so the ground-truth outcome is never
# handed to the prediction helper.
example_player = X_test.drop(columns=["is_drafted"]).iloc[0].to_dict()
prediction = predict_player_draft(
    example_player, predictor_clf, predictor_reg, categorical_columns
)
print(f"Will be drafted: {prediction['will_be_drafted']}")
print(f"Draft probability: {prediction['draft_probability']:.2f}")
# draft_position is None when the player is not predicted to be drafted.
if prediction["draft_position"] is not None:
    print(f"Predicted draft position: {prediction['draft_position']:.1f}")