# Overview

This notebook shows which features are important for getting drafted.

# Phase 1: Examining Drafted Players

The players below were drafted in the 2024 season.

In [None]:
import pandas as pd

# Load the draft data set from the Feather file next to this notebook.
df = pd.read_feather("draft_data.feather")

# Show the first rows so the players referred to above are actually visible;
# a bare assignment produces no cell output.
df.head()

# Phase 2: Pre-Processing

This cell performs some rudimentary data cleaning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

# Load and preprocess data (re-read here so the steps below start from the
# raw file rather than from whatever state an earlier cell left in `df`).
df = pd.read_feather("draft_data.feather")

# Label the first N_DRAFTED rows as drafted (1) and the rest as undrafted (0).
# NOTE(review): this assumes the file lists the drafted players first and
# carries the default 0..n-1 index — confirm against how the file was built.
N_DRAFTED = 256
df["drafted"] = 0
df.loc[df.index < N_DRAFTED, "drafted"] = 1

# Drop columns that would cause data leakage
leakage_cols = [
    "collegeAthleteId",
    "nflAthleteId",
    "collegeId",
    "nflTeamId",
    "nflTeam",
    "year",
    "overall",
    "round",
    "pick",
    "name",
    "hometownInfo",
]
# errors="ignore" tolerates files that lack some of these columns.
df = df.drop(columns=leakage_cols, errors="ignore")

# Define the categorical features up front so missing values can be filled
# according to column type.
categorical_features = ["position", "collegeTeam", "collegeConference"]

# Fill missing categorical values with a string placeholder. A blanket
# fillna(0) would put the integer 0 into these text columns, and
# OneHotEncoder rejects columns that mix strings and numbers.
# astype(object) first so the fill also works on pandas Categorical columns;
# astype(str) afterwards guarantees every value is uniformly a string.
df[categorical_features] = (
    df[categorical_features].astype(object).fillna("Unknown").astype(str)
)

# Fill the remaining NaN values with 0
df = df.fillna(0)

# Split features and target
X = df.drop(columns=["drafted"])
y = df["drafted"]

# Every column that is not categorical is treated as numerical
numerical_features = [col for col in X.columns if col not in categorical_features]

# Create preprocessing pipeline: one-hot encode the categorical columns and
# standardise the numerical ones. handle_unknown="ignore" lets categories
# that appear only in the test split encode as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
        ("num", StandardScaler(), numerical_features),
    ]
)

# Split data; stratify keeps the drafted/undrafted ratio equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Process data: fit the transformers on the training split only, then apply
# the fitted transformers to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Train models. The tree ensembles are stochastic, so they get a fixed
# random_state; without it the scores below change on every run even though
# the train/test split itself is seeded.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, max_depth=10, class_weight="balanced", random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, max_depth=5, random_state=42
    ),
}

results = []
for name, model in models.items():
    # Train on the processed training split and predict the held-out split
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)

    # Calculate metrics. zero_division=0 yields the same score as the default
    # when a model predicts no positives, but without emitting a warning.
    results.append(
        {
            "Model": name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, zero_division=0),
            "Recall": recall_score(y_test, y_pred, zero_division=0),
            "F1": f1_score(y_test, y_pred, zero_division=0),
        }
    )

# Display results: one row per model, one column per metric
results_df = pd.DataFrame(results)
print("\nModel Performance:")
print(results_df)

# Plot results: reshape the wide results table to long form (one row per
# model/metric pair) so seaborn can draw grouped bars.
metrics = ["Accuracy", "Precision", "Recall", "F1"]
melted_results = results_df.melt(
    id_vars="Model",
    value_vars=metrics,
    var_name="Metric",
    value_name="Value",
)

# Draw on an explicit figure/axes pair instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=melted_results, x="Metric", y="Value", hue="Model", ax=ax)
ax.set_title("Model Performance Comparison")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)  # all metrics are scores in [0, 1]
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()