# Regresison Modeling

# Generalizing Case for multiple positions

In [37]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process 
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import zipfile
import statsmodels.api as sm

In [39]:
qualitative_displacement_positions = ['QB','DT','FB','LB','OG','DE','WR']

In [41]:
qual_clusters = {}

with zipfile.ZipFile("qual_assignments.zip", "r") as zf:
    for name in zf.namelist():
        if name.endswith(".csv"):
            key = name.replace(".csv", "")
            with zf.open(name) as f:
                qual_clusters[key] = pd.read_csv(f)

In [43]:
def fuzzy_match(row, df_to_match, column_name):
    name = row['Player Name']
    matches = process.extract(name, df_to_match[column_name], scorer=fuzz.partial_ratio)
    best_match, score, _ = matches[0]
    return best_match if score >= 80 else None

player_id_mapping = pd.read_csv("player_id_mapping.csv")
player_id_mapping = player_id_mapping.drop(columns = ['Unnamed: 0'])

In [45]:
career_files = {
    'QB': 'qb_career.csv',
    'offense': 'offense_career.csv',
    'defense': 'defense_career.csv',
    'center': 'center_career.csv',
    'ol': 'ol_career.csv',
    'punting': 'punting_career.csv',
    'kicking': 'kicking_career.csv'
}

career_dfs = {key: pd.read_csv(filename, index_col=0) for key, filename in career_files.items()}

In [47]:
for key, df in career_dfs.items():

    df['matched_name'] = df.apply(lambda row: fuzzy_match(row, player_id_mapping, 'player_name'), axis=1)

    merged_df = df.merge(
        player_id_mapping,
        left_on='matched_name',
        right_on='player_name',
        how='left',
        suffixes=('', '_original')
    )

    df['player_id'] = merged_df['player_id']
    df = df.dropna(subset=['player_id'])
    df = df.drop(columns=['matched_name'])
    df['player_id'] = df['player_id'].astype(int)

    career_dfs[key] = df

qb_career = career_dfs['QB']

In [48]:
print(career_dfs)

{'QB':            Player Name  Pick Position Team  Draft Year  Start Season     G  \
0          Andrew Luck     1       QB  IND        2012          2012  55.0   
1   Robert Griffin III     2       QB  WAS        2012          2012  42.0   
2       Ryan Tannehill     8       QB  MIA        2012          2012  64.0   
3       Brandon Weeden    22       QB  CLE        2012          2012  34.0   
4       Brock Osweiler    57       QB  DEN        2012          2012  21.0   
..                 ...   ...      ...  ...         ...           ...   ...   
89          Kyle Trask    64       QB  TAM        2021          2021   7.0   
90         Kellen Mond    66       QB  MIN        2021          2021   1.0   
91         Davis Mills    67       QB  HOU        2021          2021  38.0   
92            Ian Book   133       QB  NOR        2021          2021   1.0   
93        Sam Ehlinger   218       QB  IND        2021          2021   8.0   

      GS  Wins  Losses  ...  AP MVP-1  AP OPoY-3  AP OPo

In [49]:
qual_clusters = {pos: df for pos, df in qual_clusters.items() if pos in qualitative_displacement_positions}
print(qual_clusters)

{'DE':        height    weight  Competitive  player_id  cluster  Unnamed: 0  \
0    0.961360  0.656566     0.928222      28090        0          15   
1    0.908810  0.454545     0.974679      28296        0          17   
2    0.956723  0.474747     0.484477      29006        2          20   
3    0.935085  0.424242     0.877509      28967        0          25   
4    0.924266  0.525253     0.917348      28988        0          27   
..        ...       ...          ...        ...      ...         ...   
218  0.947450  0.616162     0.888977     104150        0        2978   
219  0.947450  0.161616     0.484477     104185        2        2995   
220  0.916538  0.787879     0.484477     104522        3        3007   
221  0.914992  0.434343     0.234093     104497        1        3024   
222  0.925811  0.292929     0.484477     104197        2        3062   

           player_name pos_abbr  draft_year  
0       Quinton Coples       DE        2012  
1        Melvin Ingram       DE     

In [50]:
def compute_qb_career_success(df):
    volume_features = ['Pass Yds', 'Cmp', 'TD', 'G', 'GS']
    efficiency_features = ['Cmp%', 'Y/A', 'AY/A', 'TD%', 'Int', 'Rate', 'ANY/A']
    decision_features = ['Succ%', 'Sk%', 'Int']
    clutch_features = ['4QC', 'GWD']
    composite_feature = ['AV']
    
    all_features = volume_features + efficiency_features + decision_features + clutch_features + composite_feature
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df[all_features]), columns=all_features, index=df.index)

    for col in ['Int', 'Sk%']:
        if col in df_scaled.columns:
            df_scaled[col] = 1 - df_scaled[col]

    df['career_success'] = (
        0.10 * df_scaled[clutch_features].mean(axis=1) +
        0.10 * df_scaled[volume_features].mean(axis=1) +
        0.25 * df_scaled[efficiency_features].mean(axis=1) +
        0.45 * df_scaled[decision_features].mean(axis=1) +
        0.10 * df_scaled['AV']
    )

    return df



In [51]:
def compute_defense_career_success(df):
    volume_features = ['G', 'GS', 'Comb', 'Solo', 'Ast']
    impact_features = ['Sk', 'TFL', 'QBHits', 'FF', 'FR', 'Int', 'PD', 'Sfty']
    composite_feature = ['AV']

    all_features = volume_features + impact_features + composite_feature
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df[all_features]), columns=all_features, index=df.index)

    df['career_success'] = (
        0.25 * df_scaled[volume_features].mean(axis=1) +
        0.25 * df_scaled[impact_features].mean(axis=1) +
        0.50 * df_scaled['AV']
    )

    return df

In [52]:
def compute_center_career_success(df):
    durability = ['G', 'GS']
    offense = ['Offense Snaps', 'Offense Pct']
    special_teams = ['Special Teams Snaps', 'Special Teams Pct']

    all_features = durability + offense + special_teams

    df = df.dropna(subset=all_features)

    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[all_features]),
        columns=all_features,
        index=df.index
    )

    df['career_success'] = (
        0.3 * df_scaled[durability].mean(axis=1) +
        0.5 * df_scaled[offense].mean(axis=1) +
        0.2 * df_scaled[special_teams].mean(axis=1)
    )

    return df

In [53]:
def compute_wr_career_success(df):
    volume = ['Rec', 'Rec Yds', 'Rec TD', 'Tgt', 'G', 'GS']
    efficiency = ['Ctch%', 'Y/R', 'Y/Tgt', 'Rec Succ%', 'Rush Succ%']
    explosiveness = ['YScm', 'Y/Tch', 'Touch', 'RRTD']
    rushing = ['Att', 'Rush Yds', 'Rush TD', 'Rush Succ%']
    composite_feature = ['AV']

    all_features = volume + efficiency + explosiveness + rushing + composite_feature
    df = df.dropna(subset=all_features)
    
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[all_features]),
        columns=all_features,
        index=df.index
    )

    df['career_success'] = (
        0.30 * df_scaled[volume].mean(axis=1) +
        0.30 * df_scaled[efficiency].mean(axis=1) +
        0.10 * df_scaled[explosiveness].mean(axis=1) +
        0.10 * df_scaled[rushing].mean(axis=1) +
        0.20 * df_scaled['AV']
    )

    return df

In [54]:
def get_success(df, position):
    if position == 'QB':
        df = compute_qb_career_success(df)
    elif position in ['DT', 'LB', 'DE']:
        df = compute_defense_career_success(df)
    elif position == 'OG':
        df = compute_center_career_success(df)
    elif position == 'WR':
        df = compute_wr_career_success(df)
    else:
        print(f'Success for position unknown: {position}')
        return df
        
    return df


In [55]:
qualitative_displacement_positions = ['QB', 'DT', 'FB', 'LB', 'OG', 'DE', 'WR']

career_source_map = {
    'QB': ('QB', None),
    'DT': ('defense', 'DT'),
    'FB': ('offense', 'FB'),
    'LB': ('defense', 'LB'),
    'OG': ('center', 'G'),
    'DE': ('defense', 'DE'),
    'WR': ('offense', 'WR'),
}

merged_clusters = {}
merged_clusters_pick = {}

for pos in qualitative_displacement_positions:
    career_key, filter_pos = career_source_map.get(pos, (None, None))
    career_df = career_dfs.get(career_key)
    if career_df is None:
        print(f'No career data available')
        continue

    if filter_pos:
        career_df = career_df[career_df['Position'] == filter_pos]

    qual_df = qual_clusters.get(pos)

    merged_df = career_df.merge(qual_df, on='player_id', how='left')
    merged_clusters[pos] = merged_df
    merged_clusters[pos] = merged_clusters[pos].drop(columns = ['Unnamed: 0', 'player_name', 'pos_abbr'])
    merged_clusters[pos] = merged_clusters[pos].dropna(subset = ['cluster'])
    merged_clusters[pos] = merged_clusters[pos][merged_clusters[pos]['Draft Year'] <= 2020]
    merged_clusters_pick[pos] = merged_clusters[pos].copy()

    scaler = MinMaxScaler()
    merged_clusters[pos]['Pick'] = np.log(merged_clusters[pos]['Pick'] + 1)
    merged_clusters[pos]['Pick'] = scaler.fit_transform(merged_clusters[pos][['Pick']])
    update = get_success(merged_clusters[pos], pos)
    merged_clusters[pos] = update

Success for position unknown: FB


In [56]:
test = {
    pos: df[df['draft_year'] == 2020].copy()
    for pos, df in merged_clusters.items()
}
print(test)

{'QB':        Player Name      Pick Position Team  Draft Year  Start Season     G  \
73  Tua Tagovailoa  0.227535       QB  MIA        2020          2020  53.0   
74  Justin Herbert  0.259462       QB  LAC        2020          2020  62.0   
75     Jordan Love  0.539047       QB  GNB        2020          2020  42.0   
76     Jalen Hurts  0.682606       QB  PHI        2020          2020  62.0   
77     Jacob Eason  0.853101       QB  IND        2020          2020   2.0   
78      Jake Fromm  0.917674       QB  BUF        2020          2020   3.0   
79      Jake Luton  0.943161       QB  JAX        2020          2020   3.0   
80     Ben DiNucci  0.984524       QB  DAL        2020          2020   3.0   

      GS  Wins  Losses  ...  AP CPoY-12  AP MVP-4  AP OPoY-6  player_id  \
73  51.0  32.0    19.0  ...           0         0          0     104204   
74  62.0  30.0    32.0  ...           0         0          0     104093   
75  33.0  18.0    15.0  ...           0         0          0     

In [77]:
career_source_map = {
    'QB': ('QB', None),
    'DT': ('defense', 'DT'),
    'FB': ('offense', 'FB'),
    'LB': ('defense', 'LB'),
    'OG': ('center', 'G'),
    'DE': ('defense', 'DE'),
    'WR': ('offense', 'WR'),
}

position_models = {}

for pos, df in merged_clusters.items():
    if pos == 'FB':
        continue

    models = {}
    # Model 1 
    X1 = df[['Pick']]
    X1 = sm.add_constant(X1)
    y1 = df['career_success']
    models['pick'] = sm.OLS(y1, X1).fit()

    # Model 2 
    career_key, filter_pos = career_source_map.get(pos, (None, None))
    career_df = career_dfs.get(career_key)

    if career_df is not None:
        if filter_pos:
            career_df = career_df[career_df['Position'] == filter_pos]
        exclude = set(career_df.columns) | {'cluster', 'career_success'}
    else:
        print(f'No data for this position. Recheck careers')
        exclude = {'cluster', 'career_success'}
        
    selected_cols = [col for col in df.columns if col not in exclude]
    X2 = df[['Pick'] + selected_cols].copy()
    y2 = df['career_success']
    X2 = sm.add_constant(X2)
    models['pick_top_quant'] = sm.OLS(y2, X2).fit()

    # Model 3 
    df_model = pd.get_dummies(df, columns = ['cluster'], prefix = 'cluster', drop_first = False)
    y3 = df_model['career_success']
    cluster_cols = [col for col in df_model.columns if col.startswith('cluster_')]
    df_model[cluster_cols] = df_model[cluster_cols].astype(int)

    X3 = df_model[['Pick'] + cluster_cols]
    models['pick_quant_clusters'] = sm.OLS(y3, X3).fit()

    position_models[pos] = models


In [79]:
table_labels = ['pick', 'pick_top_quant', 'pick_quant_clusters']

for pos, result in position_models.items():
    if pos == 'WR':
        print(pos)
        for t in table_labels:
            print(position_models[pos][t].summary())

WR
                            OLS Regression Results                            
Dep. Variable:         career_success   R-squared:                       0.281
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     65.40
Date:                Wed, 09 Apr 2025   Prob (F-statistic):           1.19e-13
Time:                        21:37:07   Log-Likelihood:                 60.153
No. Observations:                 169   AIC:                            -116.3
Df Residuals:                     167   BIC:                            -110.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6191      0.042     14.640      

# Average draft pick per cluster

In [81]:
positions_to_check = ['QB', 'DT', 'FB', 'LB', 'OG', 'DE', 'WR']

avg_pick_by_cluster = {}

for pos in positions_to_check:
    
    df = merged_clusters_pick[pos]
    df['cluster'] = df['cluster'].astype(str)
    cluster_avg = df.groupby('cluster')['Pick'].mean().sort_index()
    avg_pick_by_cluster[pos] = cluster_avg

for pos, cluster_stats in avg_pick_by_cluster.items():
    print(f"\nPosition: {pos}")
    print(cluster_stats)



Position: QB
cluster
0.0     64.384615
1.0     83.800000
2.0    103.409091
3.0     94.520000
4.0     77.750000
5.0    108.666667
Name: Pick, dtype: float64

Position: DT
cluster
0.0     68.000000
1.0    118.750000
2.0    147.875000
3.0    244.000000
4.0     92.466667
5.0     76.875000
Name: Pick, dtype: float64

Position: FB
cluster
1.0    159.5
2.0    168.0
Name: Pick, dtype: float64

Position: LB
cluster
0.0    111.5000
1.0    122.0000
2.0    161.0625
3.0    118.0000
Name: Pick, dtype: float64

Position: OG
cluster
0.0     92.526316
1.0    144.052632
2.0    134.321429
3.0     92.916667
Name: Pick, dtype: float64

Position: DE
cluster
0.0     70.545455
1.0    162.000000
2.0    108.875000
3.0    109.285714
Name: Pick, dtype: float64

Position: WR
cluster
0.0     96.952381
1.0    115.400000
2.0    122.276596
3.0    159.714286
4.0    199.000000
5.0    111.200000
6.0     98.285714
7.0    100.555556
Name: Pick, dtype: float64


# Testing

In [103]:
from statsmodels.tools import add_constant

test_predictions = {}

table_labels = ['pick', 'pick_top_quant', 'pick_quant_clusters']

for pos, result in position_models.items():
    if pos not in test or pos == 'FB':
        continue  # Skip FB or missing test data

    df_test = test[pos].copy()
    preds = {}

    for model_name in table_labels:
        model = position_models[pos][model_name]
        train_features = model.model.exog_names  # Training features (includes 'const' if added)

        if model_name == 'pick':
            # Model 1: Pick only
            X = df_test[['Pick']].copy()
            X = add_constant(X, has_constant='add')

        elif model_name == 'pick_top_quant':
            # Model 2: Pick + top features
            X = df_test.copy()
            common_features = [f for f in train_features if f in X.columns]
            X = X[common_features]
            if 'const' in train_features and 'const' not in X.columns:
                X = add_constant(X, has_constant='add')

        elif model_name == 'pick_quant_clusters':
            # Model 3: Pick + cluster dummies
            df_dummies = pd.get_dummies(df_test, columns=['cluster'], prefix='cluster', drop_first=False)
            for col in train_features:
                if col not in df_dummies.columns and col != 'const':
                    df_dummies[col] = 0
            X = df_dummies[train_features]

        # Store predictions along with player names
        prediction_df = pd.DataFrame({
            'Player Name': df_test['Player Name'].values,
            'Predicted_Success': model.predict(X).values
        })
        preds[model_name] = prediction_df

    test_predictions[pos] = preds


In [116]:
for pos in test_predictions.keys():
    print(pos)
    print(test_predictions[pos]['pick_quant_clusters'])

QB
      Player Name Predicted_Success
0  Tua Tagovailoa          0.607068
1  Justin Herbert          0.567074
2     Jordan Love          0.526483
3     Jalen Hurts          0.508653
4     Jacob Eason          0.521848
5      Jake Fromm          0.479458
6      Jake Luton          0.482161
7     Ben DiNucci          0.477024
DT
      Player Name Predicted_Success
0  DaVon Hamilton          0.297751
1       Leki Fotu          0.235343
2    Khalil Davis          0.055057
LB
            Player Name Predicted_Success
0         Terrell Lewis           0.34863
1        Alex Highsmith          0.307932
2          Mykal Walker          0.211004
3              Troy Dye          0.178936
4  Shaquille Quarterman          0.177856
5             Cam Brown          0.110162
6          Cassh Maluia          0.087263
7         Clay Johnston          0.058326
OG
      Player Name Predicted_Success
0    Damien Lewis          0.539112
1   Jonah Jackson          0.522908
2  Logan Stenberg          0.48837

# On 2020 NFL Draft

In [134]:
draft = pd.read_excel('NFL_draft2020.xlsx', header = 1)
draft = draft[['Pick', 'Tm', 'Player', 'Pos']]
draft = draft[draft['Pos'].isin(qualitative_displacement_positions)]
print(draft)

     Pick   Tm           Player Pos
0       1  CIN       Joe Burrow  QB
1       2  WAS      Chase Young  DE
4       5  MIA   Tua Tagovailoa  QB
5       6  LAC   Justin Herbert  QB
6       7  CAR    Derrick Brown  DT
..    ...  ...              ...  ..
241   242  GNB  Jonathan Garvin  DE
243   244  MIN     Nate Stanley  QB
251   252  DEN  Tyrie Cleveland  WR
253   254  DEN    Derrek Tuszka  LB
254   255  NYG      Tae Crowder  LB

[122 rows x 4 columns]


In [199]:
# Make a copy of test_predictions so we can modify it
remaining_players = {pos: df['pick_quant_clusters'].copy() for pos, df in test_predictions.items()}

# Create a column in draft for model-based picks
draft['model_pick'] = None

# Iterate over each row in the draft
for idx, row in draft.iterrows():
    pos = row['Pos']

    # Skip if position not in predictions or no players left
    if pos not in remaining_players or remaining_players[pos].empty:
        continue

    df_pos = remaining_players[pos]

    # Get index of the top player
    top_idx = df_pos.index[0]
    top_player = df_pos.loc[top_idx, 'Player Name']

    # Assign to draft
    draft.at[idx, 'model_pick'] = top_player
    draft.at[idx, 'model_score'] = df_pos.loc[top_idx, 'Predicted_Success']

    # Drop the player by index label
    remaining_players[pos] = df_pos.drop(index=top_idx)



In [226]:
def fuzzy_match(row, df_to_match, column_name):
    name = row['Player']
    matches = process.extract(name, df_to_match[column_name], scorer=fuzz.partial_ratio)
    best_match, score, _ = matches[0]
    return best_match if score >= 80 else None
    
actual_success_values = []

for _, row in draft.iterrows():
    player = row['Player']
    pos = row['Pos']
    
    if pos in merged_clusters:
        df_pos = merged_clusters[pos]
        
        # Apply fuzzy matching to find the best player name
        matched_name = fuzzy_match(row, df_pos, 'Player Name')
        
        if matched_name:
            match_row = df_pos[df_pos['Player Name'] == matched_name]
            if not match_row.empty:
                success_val = match_row.iloc[0]['career_success']
            else:
                success_val = None
        else:
            success_val = None
    else:
        success_val = None

    actual_success_values.append(success_val)

# Add column to draft DataFrame
draft['actual_success'] = actual_success_values


In [248]:
draft = draft.dropna()
draft['improvement'] = (draft['model_score'] > draft['actual_success']).astype(int)

In [250]:
print(draft)

     Pick   Tm                Player Pos            model_pick  model_score  \
0       1  CIN            Joe Burrow  QB        Tua Tagovailoa     0.607068   
4       5  MIA        Tua Tagovailoa  QB        Justin Herbert     0.567074   
5       6  LAC        Justin Herbert  QB           Jordan Love     0.526483   
6       7  CAR         Derrick Brown  DT        DaVon Hamilton     0.297751   
13     14  SFO          Javon Kinlaw  DT             Leki Fotu     0.235343   
14     15  DEN           Jerry Jeudy  WR         Brandon Aiyuk     0.429338   
16     17  DAL           CeeDee Lamb  WR           Tee Higgins     0.371359   
21     22  MIN      Justin Jefferson  WR  Laviska Shenault Jr.     0.283106   
24     25  SFO         Brandon Aiyuk  WR         Van Jefferson     0.466036   
25     26  GNB           Jordan Love  QB           Jalen Hurts     0.508653   
32     33  CIN           Tee Higgins  WR           Denzel Mims     0.306993   
33     34  IND   Michael Pittman Jr.  WR       Lynn 