In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn import metrics

In [2]:
#reading in the files for college stats and rookie season stats
qb_college = pd.read_csv('qb_college.csv')
rookie_season = pd.read_csv('Rookie_QB_Dataset.csv')

In [3]:
#looking at the top of the dataframe
qb_college.head()

Unnamed: 0,QB,Year,Round,Years Played,Cmp,Att,Pct,Yds,Y/A,AY/A,TD,Int,Rate
0,Malik Willis,2022,3,4,388,618,62.8,5176,8.4,8.6,48,18,152.9
1,Kenny Pickett,2022,1,5,1045,1674,62.4,12303,7.3,7.5,81,32,136.3
2,Sam Howell,2022,5,3,713,1117,63.8,10283,9.2,9.9,92,23,164.2
3,Desmond Ridder,2022,3,4,810,1304,62.1,10239,7.9,8.2,87,28,145.8
4,Matt Corral,2022,3,4,614,912,67.3,8287,9.1,9.2,57,23,159.2


In [4]:
#looking at the top of the dataframe
rookie_season.head()

Unnamed: 0,NAME,SEASON,TM,GP,CMP,ATT,PCT,YDS,AVG,YDS/G,TD,TD%,INT,INT%,SCK,SCKY,RATE
0,Bryce Perkins,2022,LAR,5,19,34,55.9,161,4.7,32.2,1,2.9,2,5.9,6,50,53.7
1,Kyle Trask,2022,TB,1,3,9,33.3,23,2.6,23.0,0,0.0,0,0.0,0,0,42.4
2,Sam Howell,2022,WAS,1,11,19,57.9,169,8.9,169.0,1,5.3,1,5.3,3,11,83.0
3,Skylar Thompson,2022,MIA,7,60,105,57.1,534,5.1,76.3,1,1.0,3,2.9,6,40,62.2
4,Bailey Zappe,2022,NE,4,65,92,70.7,781,8.5,195.3,5,5.4,3,3.3,6,40,100.9


In [4]:
#determining the average rate of the rookie quarterbacks
avg_rookie_rate = np.mean(rookie_season["RATE"])
avg_rookie_rate

70.32047244094488

In [5]:
#determining which quarterbacks above average in rookie season
above_avg = []

#looping through each player and if they are above average and have over 100 attempts, considered successful
for i in range(len(rookie_season)):
    if (rookie_season["RATE"][i] > avg_rookie_rate and rookie_season["ATT"][i] > 100):
        # 1 is used for successful rookie season
        above_avg.append(1)
    else:
        # 0 is used for unsuccessful rookie season
        above_avg.append(0)
       
#creating a new column for the success value    
rookie_season['SUCCESS'] = above_avg
rookie_season.head()

Unnamed: 0,NAME,SEASON,TM,GP,CMP,ATT,PCT,YDS,AVG,YDS/G,TD,TD%,INT,INT%,SCK,SCKY,RATE,SUCCESS
0,Bryce Perkins,2022,LAR,5,19,34,55.9,161,4.7,32.2,1,2.9,2,5.9,6,50,53.7,0
1,Kyle Trask,2022,TB,1,3,9,33.3,23,2.6,23.0,0,0.0,0,0.0,0,0,42.4,0
2,Sam Howell,2022,WAS,1,11,19,57.9,169,8.9,169.0,1,5.3,1,5.3,3,11,83.0,0
3,Skylar Thompson,2022,MIA,7,60,105,57.1,534,5.1,76.3,1,1.0,3,2.9,6,40,62.2,0
4,Bailey Zappe,2022,NE,4,65,92,70.7,781,8.5,195.3,5,5.4,3,3.3,6,40,100.9,0


In [6]:
#adding the success values to the college stats dataframe
add_success = []

#looping through each of the quarterbacks college stats dataframe
for i in range(len(qb_college)):
    got_value = 0
    # looping through each of the quarterbacks in the rookie season dataframe
    for j in range(len(rookie_season)):
        #making sure the correct value is being used for the correct quarterback
        if (qb_college['QB'][i] == rookie_season["NAME"][j]):
            add_success.append(rookie_season["SUCCESS"][j])
            got_value += 1
            break
    # if the player was not found in the rookie season dataframe give them a value of 0
    if (got_value < 1):
        add_success.append(0)
#adding the success column to college stats dataframe
qb_college["Success"] = add_success
qb_college.head()

Unnamed: 0,QB,Year,Round,Years Played,Cmp,Att,Pct,Yds,Y/A,AY/A,TD,Int,Rate,Success
0,Malik Willis,2022,3,4,388,618,62.8,5176,8.4,8.6,48,18,152.9,0
1,Kenny Pickett,2022,1,5,1045,1674,62.4,12303,7.3,7.5,81,32,136.3,1
2,Sam Howell,2022,5,3,713,1117,63.8,10283,9.2,9.9,92,23,164.2,0
3,Desmond Ridder,2022,3,4,810,1304,62.1,10239,7.9,8.2,87,28,145.8,1
4,Matt Corral,2022,3,4,614,912,67.3,8287,9.1,9.2,57,23,159.2,0


# Models

In [27]:
#getting the quarterback names
qbs = qb_college['QB']

#getting the success values separate
successful = qb_college['Success']

#dropping the columns that won't be needed for the model features
college_stats = qb_college.drop(columns=['QB', 'Success'])
college_stats

Unnamed: 0,Year,Round,Years Played,Cmp,Att,Pct,Yds,Y/A,AY/A,TD,Int,Rate
0,2022,3,4,388,618,62.8,5176,8.4,8.6,48,18,152.9
1,2022,1,5,1045,1674,62.4,12303,7.3,7.5,81,32,136.3
2,2022,5,3,713,1117,63.8,10283,9.2,9.9,92,23,164.2
3,2022,3,4,810,1304,62.1,10239,7.9,8.2,87,28,145.8
4,2022,3,4,614,912,67.3,8287,9.1,9.2,57,23,159.2
...,...,...,...,...,...,...,...,...,...,...,...,...
113,2012,3,3,938,1404,66.8,10068,7.2,7.1,67,33,138.1
114,2012,4,3,723,1128,64.1,9131,8.1,8.1,66,30,146.1
115,2012,6,4,961,1732,55.5,12690,7.3,7.1,90,47,128.0
116,2012,7,1,4,8,50.0,21,2.6,-3.0,0,1,47.1


In [8]:
#splitting the college dataframe into testing and training data
train_vectors, test_vectors, train_labels, test_labels = train_test_split(college_stats, successful, test_size = 0.25, train_size = 0.75)

In [9]:
#creating logistic regression model and printing out results
logistic_model = sm.Logit(train_labels, sm.add_constant(train_vectors))
logistic_result = logistic_model.fit()
print(logistic_result.summary())

Optimization terminated successfully.
         Current function value: 0.367587
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                Success   No. Observations:                   88
Model:                          Logit   Df Residuals:                       75
Method:                           MLE   Df Model:                           12
Date:                Sun, 16 Apr 2023   Pseudo R-squ.:                  0.4123
Time:                        09:33:36   Log-Likelihood:                -32.348
converged:                       True   LL-Null:                       -55.043
Covariance Type:            nonrobust   LLR p-value:                 8.830e-06
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const          465.1984    269.123      1.729      0.084     -62.274     992.671
Year            -0.243

In [11]:
#using the logistic regression model to predict the test values
logistic_predict = round(logistic_result.predict(sm.add_constant(test_vectors)))
true_labels = test_labels

# printing out the classification report and confusion matrix of the logistic model
print(classification_report(true_labels, logistic_predict))
print(confusion_matrix(true_labels, logistic_predict))

              precision    recall  f1-score   support

           0       0.73      0.76      0.74        21
           1       0.38      0.33      0.35         9

    accuracy                           0.63        30
   macro avg       0.55      0.55      0.55        30
weighted avg       0.62      0.63      0.63        30

[[16  5]
 [ 6  3]]


# SVM

In [22]:
#creating the SVM model through parameter fitting
tmp_vectors = train_vectors
tmp_labels = train_labels

print("Fitting the classifier to the training set")
# a dictionary of hyperparameters: key is the name of the parameter, value is a list of values to test
param_grid = {'C': [0.001, 0.01, 0.1, 1.0, 10.0],
              'gamma': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7],
              'kernel': ['linear','rbf']}
# make a classifier by searching over a classifier and the parameter grid
clf = GridSearchCV(SVC(class_weight='balanced'), param_grid)

# printing out the different parameters
clf = clf.fit(tmp_vectors, tmp_labels)
print("Best estimator found by grid search:")
print(clf.best_estimator_)
print("Best parameters found by grid search:")
print(clf.best_params_)

Fitting the classifier to the training set
Best estimator found by grid search:
SVC(C=0.01, class_weight='balanced', gamma=1e-11, kernel='linear')
Best parameters found by grid search:
{'C': 0.01, 'gamma': 1e-11, 'kernel': 'linear'}


In [23]:
#using the SVM model to predict the test values
svm_predict_vectors = test_vectors
svm_true_labels = test_labels

print("Predicting names on the test set")
svm_pred_labels = clf.predict(svm_predict_vectors)

#printing out the classification report and confusion matrix of the SVM model
print(classification_report(svm_true_labels, svm_pred_labels))
print(confusion_matrix(svm_true_labels, svm_pred_labels))

Predicting names on the test set
              precision    recall  f1-score   support

           0       0.72      0.62      0.67        21
           1       0.33      0.44      0.38         9

    accuracy                           0.57        30
   macro avg       0.53      0.53      0.52        30
weighted avg       0.61      0.57      0.58        30

[[13  8]
 [ 5  4]]


# Logistic Prediction

In [33]:
#reading in file with quarterbacks to be drafted college stats
qbs_23 = pd.read_csv('23 Draft QBs.csv')
#getting the quarterbacks' names
qbs_23_names = qbs_23["QB"]
#creating the dataframe of the features for the model
qbs_23_vectors = qbs_23.drop(columns=["QB"])
qbs_23_vectors

Unnamed: 0,Year,Round,Years Played,Cmp,Att,Pct,Yds,Y/A,AY/A,TD,Int,Rate
0,2023,1,2,575,830,69.3,8123,9.8,11.2,85,12,182.4
1,2023,1,2,624,949,65.8,8356,8.8,9.9,80,12,165.0
2,2023,1,2,215,393,54.7,3105,7.9,7.4,24,15,133.6
3,2023,1,3,479,738,64.9,5876,8.0,7.7,46,25,145.6
4,2023,2,4,632,944,66.9,8974,9.5,10.6,80,12,172.2
5,2023,3,4,826,1239,66.7,9219,7.4,7.4,65,30,141.6
6,2023,4,2,473,748,63.2,5336,7.1,7.0,28,15,131.5


In [38]:
#predicting the quarterbacks' success with the logistic regression model
temp_23_vectors = sm.add_constant(qbs_23_vectors, has_constant='add')
draft_predict = round(logistic_result.predict(temp_23_vectors))

In [42]:
#creating new dataframe with predictions of the logistic model
qb_prediction = pd.DataFrame()
qb_prediction['QB'] = qbs_23_names
qb_prediction['Success'] = draft_predict
qb_prediction

Unnamed: 0,QB,Success
0,C.J. Stroud,1.0
1,Bryce Young,0.0
2,Anthony Richardson,0.0
3,Will Levis,0.0
4,Hendon Hooker,1.0
5,Aidan O'Connell,0.0
6,Tanner McKee,0.0


# SVM Prediction

In [44]:
#predicting the quarterbacks' success with SVM model
svm_predicted_qb = clf.predict(qbs_23_vectors)

#creating new dataframe with predictions of the SVM model
svm_qb_prediction = pd.DataFrame()
svm_qb_prediction['QB'] = qbs_23_names
svm_qb_prediction['Success'] = svm_predicted_qb
svm_qb_prediction

Unnamed: 0,QB,Success
0,C.J. Stroud,1
1,Bryce Young,1
2,Anthony Richardson,0
3,Will Levis,1
4,Hendon Hooker,1
5,Aidan O'Connell,0
6,Tanner McKee,0
