# Regression Modeling

# Generalizing the Case to Multiple Positions

In [None]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process 
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
import pickle

In [None]:
# Position abbreviations used below to filter the cluster assignments.
qualitative_displacement_positions = [
    'QB', 'DT', 'FB', 'LB', 'OG', 'DE', 'WR',
]

In [None]:
# Load the pickled `quant_clusters` object; it is indexed by 'WR' just below.
# NOTE(review): pickle.load can execute arbitrary code while loading -- only
# read 'quant_assignments.pkl' if it comes from a trusted source.
with open('quant_assignments.pkl', 'rb') as f:
    quant_clusters = pickle.load(f)
    
# Quick sanity check: show the entry stored under 'WR'.
print(quant_clusters['WR'])

In [None]:
def fuzzy_match(row, df_to_match, column_name):
    """Return the closest fuzzy match for ``row['Player Name']``.

    Args:
        row: Mapping-like record (for example a DataFrame row) that has a
            'Player Name' entry.
        df_to_match: DataFrame holding the candidate names.
        column_name: Column of ``df_to_match`` that contains the candidates.

    Returns:
        The best-scoring candidate when its ``fuzz.partial_ratio`` score is
        at least 80, otherwise ``None``. ``None`` is also returned when the
        name is missing, blank or not a string, or when there are no
        candidates to compare against.
    """
    name = row['Player Name']
    # A missing name is usually NaN (a float) in pandas; non-string values
    # are not valid fuzzy-match queries, so report "no match" instead of
    # failing inside the matcher.
    if not isinstance(name, str) or not name.strip():
        return None

    # Drop missing candidates for the same reason, and ask only for the top
    # result because nothing else is used.
    candidates = df_to_match[column_name].dropna()
    matches = process.extract(
        name, candidates, scorer=fuzz.partial_ratio, limit=1
    )
    if not matches:
        return None

    # Each result starts with (match, score, ...); read the first two fields
    # positionally so the length of the tuple does not matter.
    best_match, score = matches[0][0], matches[0][1]
    return best_match if score >= 80 else None

# Read the name/id lookup table and discard its 'Unnamed: 0' column
# (presumably a saved index -- confirm against the CSV).
player_id_mapping = pd.read_csv("player_id_mapping.csv").drop(columns=['Unnamed: 0'])

In [None]:
# CSV file holding the career statistics for each data group.
career_files = {
    'QB': 'qb_career.csv',
    'offense': 'offense_career.csv',
    'defense': 'defense_career.csv',
    'center': 'center_career.csv',
    'ol': 'ol_career.csv',
    'punting': 'punting_career.csv',
    'kicking': 'kicking_career.csv',
}

# Load every file into a DataFrame, using the first CSV column as the index.
career_dfs = {
    group: pd.read_csv(csv_path, index_col=0)
    for group, csv_path in career_files.items()
}



In [None]:
# Attach a player_id to every career table by fuzzy-matching player names
# against player_id_mapping, then keep only the rows that received an id.

# One id per candidate name. A repeated name would make the lookup
# ambiguous, so the first occurrence wins.
name_to_id = (
    player_id_mapping
    .drop_duplicates(subset='player_name')
    .set_index('player_name')['player_id']
)

for key, df in career_dfs.items():

    # Best-matching mapping name (or None) for each career row, carrying
    # df's own index so later steps stay row-aligned.
    matched_names = pd.Series(
        [
            fuzzy_match(row, player_id_mapping, 'player_name')
            for _, row in df.iterrows()
        ],
        index=df.index,
        dtype=object,
    )

    # Series.map keeps df's index. Copying the column out of a merge result
    # does not: the merge output is re-indexed 0..n-1 and gains rows when a
    # name occurs more than once in the mapping, so ids could end up on the
    # wrong rows.
    df = (
        df.assign(player_id=matched_names.map(name_to_id))
        .dropna(subset=['player_id'])
        .astype({'player_id': int})
    )
    career_dfs[key] = df


qb_career = career_dfs['QB']

In [None]:
# Keep only the cluster entries whose key is one of the positions under study.
quant_clusters = {
    position: frame
    for position, frame in quant_clusters.items()
    if position in qualitative_displacement_positions
}
# .get() prints None rather than raising if 'WR' is not present.
print(quant_clusters.get('WR'))

In [None]:
def compute_qb_career_success(df):
    """Add a weighted ``career_success`` score for quarterbacks.

    Every feature is min-max scaled across the rows of ``df``. 'Int' and
    'Sk%' are then flipped (``1 - value``) so that a lower raw value yields
    a higher score. Scaled features are averaged within each group and the
    group means are combined with fixed weights: clutch 0.10, volume 0.10,
    efficiency 0.25, decision 0.45 and AV 0.10.

    Args:
        df: DataFrame containing every column named in the feature lists
            below.

    Returns:
        The same ``df`` object with a 'career_success' column added (the
        input is modified in place).

    Raises:
        KeyError: If any required feature column is missing from ``df``.
    """
    volume_features = ['Pass Yds', 'Cmp', 'TD', 'G', 'GS']
    efficiency_features = ['Cmp%', 'Y/A', 'AY/A', 'TD%', 'Int', 'Rate', 'ANY/A']
    decision_features = ['Succ%', 'Sk%', 'Int']
    clutch_features = ['4QC', 'GWD']
    composite_feature = ['AV']

    # 'Int' belongs to two groups. De-duplicate (order preserved) so the
    # scaled frame has unique column labels: with a repeated label,
    # selecting 'Int' returns both copies and each group mean would count
    # it twice.
    all_features = list(dict.fromkeys(
        volume_features
        + efficiency_features
        + decision_features
        + clutch_features
        + composite_feature
    ))
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[all_features]),
        columns=all_features,
        index=df.index,
    )

    # Invert so that a lower raw value produces a higher scaled score.
    inverted_features = ['Int', 'Sk%']
    df_scaled[inverted_features] = 1 - df_scaled[inverted_features]

    df['career_success'] = (
        0.10 * df_scaled[clutch_features].mean(axis=1) +
        0.10 * df_scaled[volume_features].mean(axis=1) +
        0.25 * df_scaled[efficiency_features].mean(axis=1) +
        0.45 * df_scaled[decision_features].mean(axis=1) +
        0.10 * df_scaled['AV']
    )

    return df



In [None]:
def compute_defense_career_success(df):
    """Add a weighted ``career_success`` score for defensive players.

    All features are min-max scaled across the rows of ``df``. The score is
    0.25 * mean(volume features) + 0.25 * mean(impact features) + 0.50 * AV.

    Args:
        df: DataFrame containing the volume, impact and 'AV' columns listed
            below.

    Returns:
        The same ``df`` object with a 'career_success' column added (the
        input is modified in place).
    """
    volume_cols = ['G', 'GS', 'Comb', 'Solo', 'Ast']
    impact_cols = ['Sk', 'TFL', 'QBHits', 'FF', 'FR', 'Int', 'PD', 'Sfty']
    feature_cols = volume_cols + impact_cols + ['AV']

    # Scale every feature, keeping the original row labels.
    scaled_values = MinMaxScaler().fit_transform(df[feature_cols])
    scaled = pd.DataFrame(scaled_values, index=df.index, columns=feature_cols)

    volume_score = scaled[volume_cols].mean(axis=1)
    impact_score = scaled[impact_cols].mean(axis=1)

    df['career_success'] = (
        0.25 * volume_score + 0.25 * impact_score + 0.50 * scaled['AV']
    )
    return df

In [None]:
def compute_center_career_success(df):
    """Return the complete rows of ``df`` with a ``career_success`` score.

    Rows missing any feature are dropped, the remaining features are
    min-max scaled, and the score is 0.3 * mean(durability) +
    0.5 * mean(offense) + 0.2 * mean(special teams).

    Args:
        df: DataFrame containing the games, offense-snap and
            special-teams-snap columns listed below.

    Returns:
        The DataFrame produced by ``dropna`` with a 'career_success' column
        added.
    """
    durability_cols = ['G', 'GS']
    offense_cols = ['Offense Snaps', 'Offense Pct']
    special_teams_cols = ['Special Teams Snaps', 'Special Teams Pct']
    feature_cols = durability_cols + offense_cols + special_teams_cols

    # Only rows with every feature present take part in scaling and scoring.
    df = df.dropna(subset=feature_cols)

    scaled_values = MinMaxScaler().fit_transform(df[feature_cols])
    scaled = pd.DataFrame(scaled_values, index=df.index, columns=feature_cols)

    durability_score = scaled[durability_cols].mean(axis=1)
    offense_score = scaled[offense_cols].mean(axis=1)
    special_teams_score = scaled[special_teams_cols].mean(axis=1)

    df['career_success'] = (
        0.3 * durability_score + 0.5 * offense_score + 0.2 * special_teams_score
    )
    return df


In [None]:
def compute_wr_career_success(df):
    """Return the complete rows of ``df`` with a WR ``career_success`` score.

    Rows missing any feature are dropped and the remaining features are
    min-max scaled. Scaled features are averaged within each group and
    combined with fixed weights: volume 0.30, efficiency 0.30,
    explosiveness 0.10, rushing 0.10 and AV 0.20.

    Args:
        df: DataFrame containing every column named in the feature lists
            below.

    Returns:
        A new DataFrame (the input is not modified) holding the complete
        rows plus a 'career_success' column.

    Raises:
        KeyError: If any required feature column is missing from ``df``.
    """
    volume = ['Rec', 'Rec Yds', 'Rec TD', 'Tgt', 'G', 'GS']
    efficiency = ['Ctch%', 'Y/R', 'Y/Tgt', 'Rec Succ%', 'Rush Succ%']
    explosiveness = ['YScm', 'Y/Tch', 'Touch', 'RRTD']
    rushing = ['Att', 'Rush Yds', 'Rush TD', 'Rush Succ%']
    composite_feature = ['AV']

    # 'Rush Succ%' belongs to two groups. De-duplicate (order preserved) so
    # the scaled frame has unique column labels: with a repeated label,
    # selecting 'Rush Succ%' returns both copies and each group mean would
    # count it twice.
    all_features = list(dict.fromkeys(
        volume + efficiency + explosiveness + rushing + composite_feature
    ))

    # .copy() gives an independent frame, so adding the score column below
    # never writes into a slice of the caller's data.
    df = df.dropna(subset=all_features).copy()

    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[all_features]),
        columns=all_features,
        index=df.index,
    )

    df['career_success'] = (
        0.30 * df_scaled[volume].mean(axis=1) +
        0.30 * df_scaled[efficiency].mean(axis=1) +
        0.10 * df_scaled[explosiveness].mean(axis=1) +
        0.10 * df_scaled[rushing].mean(axis=1) +
        0.20 * df_scaled['AV']
    )

    return df


In [None]:
def get_success(df, position):
    """Apply the career-success scorer that corresponds to ``position``.

    Args:
        df: Player DataFrame to score.
        position: Position abbreviation. 'QB' uses the quarterback scorer;
            'DT', 'LB' and 'DE' use the defense scorer; 'OG' uses the center
            scorer; 'WR' uses the wide-receiver scorer.

    Returns:
        The frame returned by the selected scorer. For any other position a
        message is printed and ``df`` is returned as received.
    """
    if position == 'QB':
        return compute_qb_career_success(df)
    if position in ('DT', 'LB', 'DE'):
        return compute_defense_career_success(df)
    if position == 'OG':
        return compute_center_career_success(df)
    if position == 'WR':
        return compute_wr_career_success(df)

    # No scorer is defined for this position; hand the frame back untouched.
    print(f'Success for position unknown: {position}')
    return df


In [None]:
qualitative_displacement_positions = ['QB', 'DT', 'FB', 'LB', 'OG', 'DE', 'WR']

# position -> (key into career_dfs, value of the 'Position' column to keep).
# A filter value of None means every row of that career table is used.
career_source_map = {
    'QB': ('QB', None),
    'DT': ('defense', 'DT'),
    'FB': ('offense', 'FB'),
    'LB': ('defense', 'LB'),
    'OG': ('center', 'G'),
    'DE': ('defense', 'DE'),
    'WR': ('offense', 'WR'),
}

# merged_clusters holds the modelling frames (scaled log-pick plus the
# success score); merged_clusters_pick keeps the same rows with raw 'Pick'.
merged_clusters = {}
merged_clusters_pick = {}

for pos in qualitative_displacement_positions:

    career_key, filter_pos = career_source_map.get(pos, (None, None))
    career_df = career_dfs.get(career_key)
    if career_df is None:
        print("No career data available")
        continue

    if filter_pos:
        career_df = career_df[career_df['Position'] == filter_pos]

    quant_df = quant_clusters.get(pos)
    if quant_df is None:
        # Merging with None would fail with an unhelpful TypeError.
        print(f"No cluster assignments available for position: {pos}")
        continue

    merged_df = career_df.merge(quant_df, on='player_id', how='left')
    merged_df = merged_df.drop(
        columns=['Unnamed: 0', 'player_name', 'pos_abbr', 'draft_year']
    )
    # The left join keeps career rows without a cluster; discard them.
    merged_df = merged_df.dropna(subset=['cluster'])
    # .copy() yields an independent frame, so the 'Pick' assignments below
    # do not write into a filtered slice.
    merged_df = merged_df[merged_df['Draft Year'] <= 2020].copy()

    if merged_df.empty:
        # MinMaxScaler cannot be fitted on zero rows.
        print(f"No merged rows left for position: {pos}")
        continue

    # Snapshot taken before 'Pick' is transformed.
    merged_clusters_pick[pos] = merged_df.copy()

    # Log-transform the pick, then min-max scale it.
    scaler = MinMaxScaler()
    merged_df['Pick'] = np.log(merged_df['Pick'] + 1)
    merged_df['Pick'] = scaler.fit_transform(merged_df[['Pick']])
    merged_clusters[pos] = get_success(merged_df, pos)


In [None]:
# Show the column labels of the merged WR frame.
print(merged_clusters['WR'].columns)

In [None]:
# position -> (key into career_dfs, value of the 'Position' column to keep).
career_source_map = {
    'QB': ('QB', None),
    'DT': ('defense', 'DT'),
    'FB': ('offense', 'FB'),
    'LB': ('defense', 'LB'),
    'OG': ('center', 'G'),
    'DE': ('defense', 'DE'),
    'WR': ('offense', 'WR'),
}

# position -> {'pick', 'pick_top_quant', 'pick_quant_clusters'} fitted OLS
# results, all regressing 'career_success'.
position_models = {}

for pos, df in merged_clusters.items():
    # FB is skipped explicitly; any other frame without the regression
    # target is skipped too instead of failing with a KeyError.
    if pos == 'FB' or 'career_success' not in df.columns:
        continue

    models = {}

    # Model 1: pick only (with intercept).
    X1 = df[['Pick']]
    X1 = sm.add_constant(X1)
    y1 = df['career_success']
    models['pick'] = sm.OLS(y1, X1).fit()

    # Model 2: pick plus every column that is not a career column, i.e. the
    # columns that did not come from the career table.
    career_key, filter_pos = career_source_map.get(pos, (None, None))
    career_df = career_dfs.get(career_key)

    if career_df is not None:
        if filter_pos:
            career_df = career_df[career_df['Position'] == filter_pos]
        exclude = set(career_df.columns) | {'cluster', 'career_success'}
    else:
        print('No data for this position. Recheck careers')
        exclude = {'cluster', 'career_success'}

    # 'Pick' is prepended below, so never select it a second time: a
    # duplicated regressor makes the design matrix perfectly collinear.
    # NOTE(review): if the merge renamed overlapping columns with _x/_y
    # suffixes, those would not be in `exclude` -- confirm no career
    # statistics end up in selected_cols.
    selected_cols = [
        col for col in df.columns if col not in exclude and col != 'Pick'
    ]
    X2 = df[['Pick'] + selected_cols].copy()
    y2 = df['career_success']
    X2 = sm.add_constant(X2)
    models['pick_top_quant'] = sm.OLS(y2, X2).fit()

    # Model 3: pick plus one indicator per cluster. No constant is added:
    # with drop_first=False the indicators of a row sum to one, so they
    # already play the role of per-cluster intercepts.
    df_model = pd.get_dummies(df, columns=['cluster'], prefix='cluster', drop_first=False)
    y3 = df_model['career_success']
    cluster_cols = [col for col in df_model.columns if col.startswith('cluster_')]
    df_model[cluster_cols] = df_model[cluster_cols].astype(int)

    X3 = df_model[['Pick'] + cluster_cols]
    models['pick_quant_clusters'] = sm.OLS(y3, X3).fit()

    position_models[pos] = models

In [None]:
table_labels = ['pick', 'pick_top_quant', 'pick_quant_clusters']

# Positions whose regression summaries are printed; add more as needed.
positions_to_report = ['LB']

for pos in positions_to_report:
    # Look the position up directly instead of scanning every fitted model.
    pos_models = position_models.get(pos)
    if pos_models is None:
        continue

    print(pos)
    for t in table_labels:
        print(pos_models[t].summary())

# Average Pick per Cluster

In [None]:
positions_to_check = ['QB', 'DT', 'FB', 'LB', 'OG', 'DE', 'WR']

# position -> mean raw 'Pick' per cluster label.
avg_pick_by_cluster = {}

for pos in positions_to_check:

    # A position may be absent if no merged frame was stored for it; skip it
    # rather than failing with a KeyError.
    df = merged_clusters_pick.get(pos)
    if df is None:
        print(f"No merged data for position: {pos}")
        continue

    # The cast is applied to the stored frame itself, so its 'cluster'
    # column stays a string afterwards.
    # NOTE(review): string labels sort lexically ('10' before '2') --
    # confirm that ordering is acceptable for the report.
    df['cluster'] = df['cluster'].astype(str)
    cluster_avg = df.groupby('cluster')['Pick'].mean().sort_index()
    avg_pick_by_cluster[pos] = cluster_avg

for pos, cluster_stats in avg_pick_by_cluster.items():
    print(f"\nPosition: {pos}")
    print(cluster_stats)