This will detail an AI model to utilise old combine stats from drafted prospects to help predict the performance and likelihood of a current draft prospect

In [12]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split



In [13]:
# Load the dataset
combine_data = pd.read_csv('/content/drive/MyDrive/Battle of The Brains/tech/combine.csv')

# Display basic information and the first few rows of the dataset
combine_info = combine_data.info()
combine_head = combine_data.head()

combine_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4947 entries, 0 to 4946
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               4947 non-null   int64  
 1   name               4947 non-null   object 
 2   firstname          4947 non-null   object 
 3   lastname           4947 non-null   object 
 4   position           4947 non-null   object 
 5   heightfeet         4947 non-null   int64  
 6   heightinches       4947 non-null   float64
 7   heightinchestotal  4947 non-null   float64
 8   weight             4947 non-null   int64  
 9   arms               4947 non-null   float64
 10  hands              4947 non-null   float64
 11  fortyyd            4947 non-null   float64
 12  twentyyd           4947 non-null   float64
 13  tenyd              4947 non-null   float64
 14  twentyss           4947 non-null   float64
 15  threecone          4947 non-null   float64
 16  vertical           4947 

In [14]:
combine_head

Unnamed: 0,year,name,firstname,lastname,position,heightfeet,heightinches,heightinchestotal,weight,arms,...,vertical,broad,bench,round,college,pick,pickround,picktotal,wonderlic,nflgrade
0,2015,Ameer Abdullah,Ameer,Abdullah,RB,5,9.0,69.0,205,0.0,...,42.5,130,24,0,Nebraska,,0,0,0,5.9
1,2015,Nelson Agholor,Nelson,Agholor,WR,6,0.0,72.0,198,0.0,...,0.0,0,12,0,USC,,0,0,0,5.6
2,2015,Jay Ajayi,Jay,Ajayi,RB,6,0.0,72.0,221,0.0,...,39.0,121,19,0,Boise St.,,0,0,0,6.0
3,2015,Kwon Alexander,Kwon,Alexander,OLB,6,1.0,73.0,227,0.0,...,36.0,121,24,0,LSU,,0,0,0,5.4
4,2015,Mario Alford,Mario,Alford,WR,5,8.0,68.0,180,0.0,...,34.0,121,13,0,West Virginia,,0,0,0,5.3


In [15]:
# Basic statistics of the dataset
basic_stats = combine_data.describe()

# Checking for missing values
missing_values = combine_data.isnull().sum()

basic_stats



Unnamed: 0,year,heightfeet,heightinches,heightinchestotal,weight,arms,hands,fortyyd,twentyyd,tenyd,twentyss,threecone,vertical,broad,bench,round,pickround,picktotal,wonderlic,nflgrade
count,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0,4947.0
mean,2007.132201,5.805337,4.371437,74.035476,245.579745,7.640243,2.252426,4.610386,0.073734,0.129149,3.298106,1.503002,28.741257,95.944006,15.723873,2.435415,11.318981,71.879523,1.144532,0.700627
std,5.029664,0.395981,3.279223,2.614778,45.639366,13.801035,4.070448,0.974087,0.43233,0.436941,1.907526,2.929683,11.596749,41.82634,10.840896,2.476746,12.26222,79.417514,5.524047,1.81387
min,1999.0,5.0,0.0,65.0,155.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2003.0,6.0,2.0,73.0,208.0,0.0,0.0,4.53,0.0,0.0,3.835,0.0,28.0,101.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2007.0,6.0,4.0,74.0,237.0,0.0,0.0,4.69,0.0,0.0,4.24,0.0,32.5,112.0,18.0,2.0,7.0,42.0,0.0,0.0
75%,2012.0,6.0,6.0,76.0,289.0,0.0,0.0,4.99,0.0,0.0,4.47,0.0,35.5,119.0,24.0,5.0,21.0,134.0,0.0,0.0
max,2015.0,6.0,11.875,82.0,386.0,37.75,11.375,6.05,2.98,1.92,5.56,8.31,46.0,147.0,51.0,8.0,53.0,262.0,48.0,7.5


In [16]:
missing_values

year                    0
name                    0
firstname               0
lastname                0
position                0
heightfeet              0
heightinches            0
heightinchestotal       0
weight                  0
arms                    0
hands                   0
fortyyd                 0
twentyyd                0
tenyd                   0
twentyss                0
threecone               0
vertical                0
broad                   0
bench                   0
round                   0
college              1470
pick                 1791
pickround               0
picktotal               0
wonderlic               0
nflgrade                0
dtype: int64

In [20]:

# Dropping columns with excessive missing values and non-relevant columns
combine_data_cleaned = combine_data.drop(columns=['college', 'pick', 'name', 'firstname', 'lastname'])

# One-hot encoding the 'position' column
encoder = OneHotEncoder(sparse=False)
position_encoded = encoder.fit_transform(combine_data_cleaned[['position']])

# Creating a DataFrame with the encoded position data
position_encoded_df = pd.DataFrame(position_encoded, columns=encoder.get_feature_names_out(['position']))

# Concatenating the encoded position data with the original dataset
combine_data_encoded = pd.concat([combine_data_cleaned.reset_index(drop=True), position_encoded_df.reset_index(drop=True)], axis=1)

# Dropping the original 'position' column as it's now encoded
combine_data_encoded = combine_data_encoded.drop(columns=['position'])

# Display the first few rows of the dataset with encoded position
combine_data_encoded.head()





Unnamed: 0,year,heightfeet,heightinches,heightinchestotal,weight,arms,hands,fortyyd,twentyyd,tenyd,...,position_OC,position_OG,position_OLB,position_OT,position_P,position_QB,position_RB,position_SS,position_TE,position_WR
0,2015,5,9.0,69.0,205,0.0,0.0,4.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2015,6,0.0,72.0,198,0.0,0.0,4.42,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2015,6,0.0,72.0,221,0.0,0.0,4.57,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2015,6,1.0,73.0,227,0.0,0.0,4.55,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015,5,8.0,68.0,180,0.0,0.0,4.43,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:

# Initializing the StandardScaler
scaler = StandardScaler()

# Selecting the numeric columns for scaling (excluding the one-hot encoded position columns)
numeric_columns = combine_data_encoded.columns[:-len(encoder.get_feature_names_out(['position']))]

# Scaling the numeric columns
numeric_data_scaled = scaler.fit_transform(combine_data_encoded[numeric_columns])

# Creating a DataFrame with the scaled data
numeric_data_scaled_df = pd.DataFrame(numeric_data_scaled, columns=numeric_columns)

# Concatenating the scaled numeric data with the one-hot encoded position data
final_data = pd.concat([numeric_data_scaled_df, combine_data_encoded[encoder.get_feature_names_out(['position'])].reset_index(drop=True)], axis=1)

# Display the first few rows of the final dataset
final_data.head()

Unnamed: 0,year,heightfeet,heightinches,heightinchestotal,weight,arms,hands,fortyyd,twentyyd,tenyd,...,position_OC,position_OG,position_OLB,position_OT,position_P,position_QB,position_RB,position_SS,position_TE,position_WR
0,1.564437,-2.033979,1.411624,-1.92597,-0.889229,-0.553655,-0.553417,-0.010663,-0.170567,-0.295605,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.564437,0.491647,-1.333206,-0.77853,-1.042621,-0.553655,-0.553417,-0.195471,-0.170567,-0.295605,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.564437,0.491647,-1.333206,-0.77853,-0.538619,-0.553655,-0.553417,-0.041465,-0.170567,-0.295605,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.564437,0.491647,-1.028225,-0.396049,-0.40714,-0.553655,-0.553417,-0.061999,-0.170567,-0.295605,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.564437,-2.033979,1.106643,-2.308451,-1.437057,-0.553655,-0.553417,-0.185204,-0.170567,-0.295605,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:

# Creating the target variable (1 if the player was drafted, 0 otherwise)
final_data['drafted'] = (final_data['round'] > 0).astype(int)

# Dropping the 'round' column as it's now represented in the 'drafted' column
final_data = final_data.drop(columns=['round'])

# Splitting the data into features (X) and target (y)
X = final_data.drop(['drafted', 'picktotal', 'pickround'], axis=1)
y = final_data['drafted']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirming the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape




((3957, 39), (990, 39), (3957,), (990,))

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score

# Initializing models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Dictionary to hold accuracy and ROC-AUC scores
model_performance = {}

# Training and evaluating each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate accuracy and ROC-AUC score
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Store the performance
    model_performance[model_name] = {"Accuracy": accuracy, "ROC-AUC": roc_auc}

model_performance



{'Logistic Regression': {'Accuracy': 0.5454545454545454,
  'ROC-AUC': 0.5605210691717636},
 'Decision Tree': {'Accuracy': 0.5565656565656566,
  'ROC-AUC': 0.5481659514074274},
 'Random Forest': {'Accuracy': 0.5868686868686869,
  'ROC-AUC': 0.6096226303517724},
 'SVM': {'Accuracy': 0.5808080808080808, 'ROC-AUC': 0.5796810056432266},
 'Gradient Boosting': {'Accuracy': 0.5787878787878787,
  'ROC-AUC': 0.6186281384111108}}

In [27]:
# Checking the balance in the target variable 'drafted'
class_balance = y_train.value_counts(normalize=True)

class_balance



0    0.553197
1    0.446803
Name: drafted, dtype: float64