In [None]:
# Import
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import csv
from sklearn.preprocessing import MinMaxScaler
import requests
from bs4 import BeautifulSoup, Comment
import time
import random
from requests.exceptions import SSLError, ConnectionError

# Getting Career Stats

In [None]:
def scrape_nfl_draft(year, timeout=30):
    """Scrape one draft class from the Pro Football Reference draft page.

    Parameters
    ----------
    year : int
        Draft year; interpolated into the page URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up. Without a
        timeout `requests.get` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per parsed player with the columns 'Draft Year', 'Pick',
        'Player Name', 'Player Link', 'Position', 'Team', 'College' and
        'College Stats Link'. The columns are present even when no row
        could be parsed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain a table with id 'drafts'.
    """
    columns = [
        'Draft Year', 'Pick', 'Player Name', 'Player Link',
        'Position', 'Team', 'College', 'College Stats Link'
    ]
    site_root = "https://www.pro-football-reference.com"
    url = f"{site_root}/years/{year}/draft.htm"

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'drafts'})
    body = table.find('tbody') if table is not None else None
    if body is None:
        # Fail with a message that names the page instead of an opaque
        # "'NoneType' object has no attribute 'find'".
        raise ValueError(f"No 'drafts' table found at {url}")

    # Repeated header rows inside the body carry class 'thead'; skip them.
    rows = body.find_all('tr', class_=lambda x: x != 'thead')

    def first_link(cell):
        """Return the first <a> tag inside `cell`, or None."""
        return cell.find('a') if cell is not None else None

    players_data = []
    for row in rows:
        player_cell = row.find('td', {'data-stat': 'player'})
        position_cell = row.find('td', {'data-stat': 'pos'})
        team_cell = row.find('td', {'data-stat': 'team'})
        if player_cell is None or position_cell is None or team_cell is None:
            continue

        try:
            pick_cell = row.find('td', {'data-stat': 'draft_pick'})
            college_link = first_link(row.find('td', {'data-stat': 'college_id'}))
            college_stats_anchor = first_link(row.find('td', {'data-stat': 'college_link'}))
            player_anchor = first_link(player_cell)

            if player_anchor is not None:
                player_name = player_anchor.text.strip()
                href = player_anchor.get('href', '')
                player_link = f"{site_root}{href}" if href else ''
            else:
                player_name = player_cell.text.strip()
                player_link = ''

            players_data.append({
                'Draft Year': year,
                'Pick': pick_cell.get_text(strip=True) if pick_cell is not None else '',
                'Player Name': player_name,
                'Player Link': player_link,
                'Position': position_cell.get_text(strip=True),
                'Team': team_cell.get_text(strip=True),
                'College': college_link.text.strip() if college_link is not None else '',
                'College Stats Link': (
                    college_stats_anchor.get('href', '') if college_stats_anchor is not None else ''
                ),
            })
        except (AttributeError, KeyError, TypeError) as e:
            # Still best-effort per row, but report what was dropped.
            print(f'Skipping malformed draft row for {year}: {e}')
            continue

    return pd.DataFrame(players_data, columns=columns)

# Draft classes to collect; both endpoints are included.
start_year, end_year = 2012, 2021

# One frame per draft class, in chronological order.
all_years_data = []
for year in range(start_year, end_year + 1):
    df_year = scrape_nfl_draft(year)
    all_years_data.append(df_year)

# Stack every class and keep only the columns the later cells read.
draft_columns = ['Draft Year', 'Pick', 'Team', 'Player Name', 'Player Link', 'Position']
final_df = pd.concat(all_years_data, ignore_index=True)[draft_columns]
print(final_df)

In [None]:
# Position groups that share a stats table.
_DEFENSE_POSITIONS = ('DB', 'LB', 'DE', 'DT', 'NT', 'CB', 'OLB', 'ILB', 'S', 'DL')
_SNAP_COUNT_POSITIONS = ('C', 'T', 'G', 'LS')

# Fixed column layout used for the combine table of 'OL' players.
_COMBINE_HEADERS = [
    'Season', 'Pos', 'Ht', 'Wt', '40yd', 'Bench', 'Broad Jump',
    'Shuttle', '3Cone', 'Vertical'
]

# Column layouts for the all-zero placeholder row returned when a player has
# no stats table, keyed by the table id that was looked up.
_PLACEHOLDER_HEADERS = {
    'defense': [
        'Season', 'G', 'GS', 'Int', 'Int Yds', 'IntTD', 'Lng', 'PD', 'FF', 'Fmb',
        'FR', 'FR Yds', 'FRTD', 'Sk', 'Comb', 'Solo', 'Ast', 'TFL', 'QBHits',
        'Sfty', 'AV', 'Awards'
    ],
    'snap_counts': [
        'Season', 'G', 'GS', 'Offense Snaps', 'Offense Pct', 'Defense Snaps',
        'Defense Pct', 'Special Teams Snaps', 'Special Teams Pct'
    ],
    'punting': [
        'Season', 'Age', 'Team', 'Lg', 'Pos', 'G', 'GS', 'Pnt', 'Yds', 'Y/P',
        'RetYds', 'NetYds', 'NY/P', 'Lng', 'TB', 'TB%', 'Pnt20', 'In20%', 'Blck',
        'AV', 'Awards'
    ],
    'kicking': [
        'Season', 'Age', 'Team', 'Lg', 'Pos', 'G', 'GS',
        'FGA1', 'FGM1', 'FGA2', 'FGM2', 'FGA3', 'FGM3', 'FGA4', 'FGM4', 'FGA5', 'FGM5',
        'FGA', 'FGM', 'Lng', 'FG%', 'XPA', 'XPM', 'XP%',
        'KO', 'KOYds', 'TB', 'TB%', 'KOAvg', 'AV', 'Awards'
    ],
}


def _table_id_for_position(player_position):
    """Return the id of the stats table for a position, or None if unknown."""
    if player_position == 'P':
        return 'punting'
    if player_position == 'K':
        return 'kicking'
    if player_position == 'QB':
        return 'passing'
    if player_position in ('RB', 'FB'):
        return 'rushing_and_receiving'
    if player_position in ('WR', 'TE'):
        return 'receiving_and_rushing'
    if player_position in _SNAP_COUNT_POSITIONS:
        return 'snap_counts'
    if player_position in _DEFENSE_POSITIONS:
        return 'defense'
    return None


def _relabel_duplicates(columns, name, labels, exact=False):
    """Rename the repeated column `name` in the list `columns`, in place.

    The occurrences are relabelled left to right with `labels`. Nothing is
    changed unless there are at least `len(labels)` occurrences (exactly that
    many when `exact` is true), so a short table is left alone instead of
    raising IndexError.
    """
    positions = [i for i, col in enumerate(columns) if col == name]
    if exact:
        enough = len(positions) == len(labels)
    else:
        enough = len(positions) >= len(labels)
    if enough:
        for position, label in zip(positions, labels):
            columns[position] = label


def _rows_to_frame(stats_table, headers):
    """Build a frame from the body rows whose cell count matches `headers`."""
    parsed = []
    for row in stats_table.find('tbody').find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
        if len(cells) == len(headers):
            parsed.append(cells)
    return pd.DataFrame(parsed, columns=headers)


def _scrape_combine(soup, player_url):
    """Return the combine measurements for an 'OL' player."""
    combine_div = soup.find('div', {'id': 'div_combine'})
    if combine_div is None:
        # The div may be wrapped in an HTML comment; parse the first comment
        # that mentions it.
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for comment in comments:
            if 'div_combine' in comment:
                combine_div = BeautifulSoup(comment, 'html.parser').find('div', {'id': 'div_combine'})
                break

    stats_table = combine_div.find('table', {'id': 'combine'}) if combine_div is not None else None
    if stats_table is None:
        print(f'OL table unavaialble for {player_url} so setting to 0')
        return pd.DataFrame([{col: 0 for col in _COMBINE_HEADERS}])

    stats_df = _rows_to_frame(stats_table, _COMBINE_HEADERS)
    # Keep rows whose Season is all digits; copy before assigning so the
    # cast does not write into a slice of the parsed frame.
    stats_df = stats_df[stats_df['Season'].str.isdigit()].copy()
    stats_df['Season'] = stats_df['Season'].astype(int)
    return stats_df


def scrape_player_stats(player_url, player_position, timeout=30):
    """Download a player page and return its per-season stats table.

    Parameters
    ----------
    player_url : str
        URL of the player page.
    player_position : str
        Position code; selects which table id is read ('OL' reads the
        combine table instead of a stats table).
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    pandas.DataFrame
        Rows of cell text with duplicated header names disambiguated
        (e.g. 'Int Yds'/'FR Yds', 'Rec Yds'/'Rush Yds', '0-19 FGA', ...).
        When the table is missing, defense, snap-count, punting, kicking and
        OL players get a single all-zero placeholder row; every other case,
        including an unknown position, returns an empty frame.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(player_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Offensive Line
    if player_position == 'OL':
        return _scrape_combine(soup, player_url)

    table_id = _table_id_for_position(player_position)
    if table_id is None:
        print('Unknown Position')
        return pd.DataFrame()

    # `layout_id` is the table actually found; it decides the Rush/Rec order.
    layout_id = table_id
    stats_table = soup.find('table', {'id': table_id})

    # Fixing case where rushing/receving order in table is flipped
    if stats_table is None and player_position in ('WR', 'TE'):
        print(f'receiving_and_rushing table unavailable for {player_url}. Flipping check')
        layout_id = 'rushing_and_receiving'
        stats_table = soup.find('table', {'id': layout_id})

    if stats_table is None and player_position in ('RB', 'FB'):
        print(f'rushing_an_receiving table unavailable for {player_url}. Flipping check')
        layout_id = 'receiving_and_rushing'
        stats_table = soup.find('table', {'id': layout_id})

    # Case where player did not play a snap and doesn't have any statistics
    if stats_table is None:
        print('Player has never played a snap and has no statistics. Giving 0 value')
        default_headers = _PLACEHOLDER_HEADERS.get(table_id)
        if default_headers is None:
            return pd.DataFrame()
        empty_data = {col: 0 if col != 'Awards' else '' for col in default_headers}
        return pd.DataFrame([empty_data])

    # Pulling headers and rows; with two header rows the second one holds
    # the column names.
    header_rows = stats_table.find('thead').find_all('tr')
    headers_row = header_rows[1] if len(header_rows) > 1 else header_rows[0]
    headers = [th.get_text(strip=True) for th in headers_row.find_all('th')]
    stats_df = _rows_to_frame(stats_table, headers)

    # Relabel on a plain list and assign it back once, instead of writing
    # into the Index's underlying array.
    columns = list(stats_df.columns)

    if table_id == 'defense':
        _relabel_duplicates(columns, 'Yds', ['Int Yds', 'FR Yds'])

    # Fixing duplicate column names for offensive positions. The prefix order
    # follows the table that was found, so a flipped fallback table is
    # labelled in its own column order.
    if layout_id in ('receiving_and_rushing', 'rushing_and_receiving'):
        if layout_id == 'receiving_and_rushing':
            first, second = 'Rec', 'Rush'
        else:
            first, second = 'Rush', 'Rec'
        for key in ('Yds', 'TD', '1D', 'Lng', 'Succ%', 'Y/G'):
            _relabel_duplicates(columns, key, [f'{first} {key}', f'{second} {key}'])

    # fixing duplicate column names for kicker
    if table_id == 'kicking':
        ranges = ['0-19', '20-29', '30-39', '40-49', '50+', 'Total']
        _relabel_duplicates(columns, 'FGA', [f'{label} FGA' for label in ranges])
        _relabel_duplicates(columns, 'FGM', [f'{label} FGM' for label in ranges])

    if table_id == 'snap_counts':
        columns = ['Season' if col == 'Year' else col for col in columns]
        _relabel_duplicates(
            columns, 'Num',
            ['Offense Snaps', 'Defense Snaps', 'Special Teams Snaps'], exact=True
        )
        _relabel_duplicates(
            columns, 'Pct',
            ['Offense Pct', 'Defense Pct', 'Special Teams Pct'], exact=True
        )

    stats_df.columns = columns
    return stats_df

In [None]:
# One player at a time
player_index = 929  # positional row in final_df to spot-check
row = final_df.iloc[player_index]

player_url = row['Player Link']
player_position = row['Position']

if player_url:
    print(f"Scraping stats for {row['Player Name']} ({player_position}) from {player_url}...")
    player_data = scrape_player_stats(player_url, player_position)
else:
    # Without this branch `player_data` would be undefined on a fresh kernel,
    # or silently reuse the value left over from a previous run.
    print(f"No player link for {row['Player Name']}; nothing to scrape.")
    player_data = pd.DataFrame()

stat = player_data
print(stat)

In [None]:
def process_qb_data(qb_stats_df, start_season):
    """Collapse a quarterback's per-season rows into one summary row.

    The four lowest-numbered seasons are kept (one row per season) and padded
    with all-zero seasons when fewer than four exist. Counting stats are then
    summed and rate stats averaged over those four rows, so padded seasons
    pull the averages down.

    Parameters
    ----------
    qb_stats_df : pandas.DataFrame
        Per-season rows with a 'Season' column. A repeated 'Yds' column is
        relabelled 'Pass Yds' / 'Sack Yds' (first / second occurrence) and
        'QBrec' values such as '7-9-0' are split into Wins/Losses/Draws.
        The frame is not modified.
    start_season
        Stored unchanged in the 'Start Season' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row: 'Start Season', the summed columns, the averaged
        columns, then 'Awards' (season award strings joined with ', ').
        An empty frame when the input has no rows.
    """
    if qb_stats_df.empty:
        print(f'No data available')
        return pd.DataFrame()

    rate_columns = ['Cmp%', 'TD%', 'Succ%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'Sk%', 'NY/A', 'ANY/A']
    numeric_columns = [
        'G', 'GS', 'Wins', 'Losses', 'Draws', 'Cmp', 'Att', 'Cmp%', 'Pass Yds', 'TD', 'TD%', 'Int',
        '1D', 'Succ%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'Sk', 'Sack Yds', 'Sk%', 'NY/A', 'ANY/A',
        '4QC', 'GWD', 'AV'
    ]

    # Work on a copy so the caller's frame keeps its original values.
    seasons_df = qb_stats_df.copy()

    # renaming duplicate columns
    column_names = list(seasons_df.columns)
    yds_positions = [i for i, col in enumerate(column_names) if col == 'Yds']
    if len(yds_positions) > 1:
        column_names[yds_positions[0]] = 'Pass Yds'
        column_names[yds_positions[1]] = 'Sack Yds'
        seasons_df.columns = column_names

    seasons_df['Season'] = pd.to_numeric(seasons_df['Season'], errors='coerce')

    # first 4 seasons, one row per season
    filtered_df = seasons_df.drop_duplicates(subset=['Season']).nsmallest(4, 'Season').copy()

    if filtered_df.empty:
        print(f'No data available for first 4 seasons')
        return pd.DataFrame()

    # Breaking record for Wins-Losses-Draws to Win, Losses, Draw
    if 'QBrec' in filtered_df.columns:
        # reindex guarantees three parts even when no record has all of them
        record_parts = (
            filtered_df['QBrec'].astype(str).str.split('-', expand=True).reindex(columns=[0, 1, 2])
        )
        for part, label in enumerate(['Wins', 'Losses', 'Draws']):
            filtered_df[label] = pd.to_numeric(record_parts[part], errors='coerce').fillna(0).astype(float)

    for col in numeric_columns:
        if col not in filtered_df.columns:
            filtered_df[col] = 0.0
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce").fillna(0).astype(float)

    filtered_df = filtered_df.drop(columns=['Lng'], errors='ignore')

    # A table without an Awards column still yields an (empty) Awards value.
    if 'Awards' not in filtered_df.columns:
        filtered_df['Awards'] = ''

    # making sure we have at least 4 seasons and if not filling values with 0
    shortfall = 4 - len(filtered_df)
    if shortfall > 0:
        last_season = filtered_df['Season'].max()
        blank_row = {col: 0.0 if col != 'Awards' else '' for col in filtered_df.columns}
        padding = pd.DataFrame(
            [{**blank_row, 'Season': last_season + offset} for offset in range(1, shortfall + 1)]
        )
        filtered_df = pd.concat([filtered_df, padding], ignore_index=True)

    # Aggreagating stats: sums first, then means, then awards
    summary = {col: filtered_df[col].sum() for col in numeric_columns if col not in rate_columns}
    summary.update({col: filtered_df[col].mean() for col in rate_columns})
    summary['Awards'] = ', '.join(map(str, filtered_df['Awards'].dropna()))

    aggregated_stats = pd.DataFrame([summary])
    aggregated_stats.insert(0, 'Start Season', start_season)

    return aggregated_stats


In [None]:
def process_offense_data(offense_stats_df, start_season):
    """Collapse a skill-position player's per-season rows into one summary row.

    The four lowest-numbered seasons are kept (one row per season) and padded
    with all-zero seasons when fewer than four exist. Counting stats are then
    summed and rate stats averaged over those four rows, so padded seasons
    pull the averages down.

    Parameters
    ----------
    offense_stats_df : pandas.DataFrame
        Per-season rows with a 'Season' column; stat columns that are
        missing are treated as 0. The frame is not modified.
    start_season
        Stored unchanged in the 'Start Season' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row: 'Start Season', the summed columns, the averaged
        columns, then 'Awards' (distinct season award strings joined with
        ', '). An empty frame when the input has no rows.
    """
    if offense_stats_df.empty:
        print(f'No data available')
        return pd.DataFrame()

    rate_columns = [
        'Rush Succ%', 'Y/A', 'Rush Y/G', 'A/G', 'Y/R', 'Rec Succ%', 'R/G',
        'Rec Y/G', 'Ctch%', 'Y/Tgt', 'Y/Tch'
    ]
    numeric_columns = [
        'G', 'GS', 'Att', 'Rush Yds', 'Rush TD', 'Rush 1D', 'Tgt', 'Rec', 'Rec Yds',
        'Rec TD', 'Rec 1D', 'Touch', 'YScm', 'RRTD', 'Fmb', 'AV',
        'Rush Succ%', 'Y/A', 'Rush Y/G', 'A/G', 'Y/R', 'Rec Succ%', 'R/G',
        'Rec Y/G', 'Ctch%', 'Y/Tgt', 'Y/Tch'
    ]

    # Work on a copy so the caller's frame keeps its original values.
    seasons_df = offense_stats_df.copy()
    seasons_df['Season'] = pd.to_numeric(seasons_df['Season'], errors='coerce')

    # first 4 seasons, one row per season
    filtered_df = seasons_df.drop_duplicates(subset=['Season']).nsmallest(4, 'Season').copy()

    if filtered_df.empty:
        print(f'No available data for the first 4 seasons')
        return pd.DataFrame()

    for col in numeric_columns:
        if col not in filtered_df.columns:
            filtered_df[col] = 0.0
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce').fillna(0).astype(float)

    filtered_df = filtered_df.drop(columns=['Rush Lng', 'Rec Lng'], errors='ignore')

    # Aggregating a missing 'Awards' column would raise KeyError, so make
    # sure it exists.
    if 'Awards' not in filtered_df.columns:
        filtered_df['Awards'] = ''

    # making sure we have at least 4 seasons and if not filling values with 0
    shortfall = 4 - len(filtered_df)
    if shortfall > 0:
        last_season = filtered_df['Season'].max()
        blank_row = {col: 0.0 if col != 'Awards' else '' for col in filtered_df.columns}
        padding = pd.DataFrame(
            [{**blank_row, 'Season': last_season + offset} for offset in range(1, shortfall + 1)]
        )
        filtered_df = pd.concat([filtered_df, padding], ignore_index=True)

    # Aggreagating stats: sums first, then means, then awards
    summary = {col: filtered_df[col].sum() for col in numeric_columns if col not in rate_columns}
    summary.update({col: filtered_df[col].mean() for col in rate_columns})
    summary['Awards'] = ', '.join(map(str, filtered_df['Awards'].dropna().unique()))

    aggregated_stats = pd.DataFrame([summary])
    aggregated_stats.insert(0, 'Start Season', start_season)

    return aggregated_stats



In [None]:
def process_defense_data(defense_stats_df, start_season):
    """Collapse a defender's per-season rows into one summary row.

    The four lowest-numbered seasons are kept (one row per season), padded
    with all-zero seasons when fewer than four exist, and every stat column
    is summed over those four rows.

    Parameters
    ----------
    defense_stats_df : pandas.DataFrame
        Per-season rows with a 'Season' column; stat columns that are
        missing are treated as 0. The frame is not modified.
    start_season
        Stored unchanged in the 'Start Season' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row: 'Start Season', the summed columns, then 'Awards'
        (distinct season award strings joined with ', '). An empty frame
        when the input has no rows.
    """
    if defense_stats_df.empty:
        print(f'No data available')
        return pd.DataFrame()

    numeric_columns = [
        'G', 'GS', 'Int', 'Int Yds', 'IntTD', 'PD', 'FF', 'Fmb',
        'FR', 'FR Yds', 'FRTD', 'Sk', 'Comb', 'Solo', 'Ast', 'TFL',
        'QBHits', 'Sfty', 'AV'
    ]

    # Work on a copy so the caller's frame keeps its original values.
    seasons_df = defense_stats_df.copy()
    seasons_df['Season'] = pd.to_numeric(seasons_df['Season'], errors="coerce")

    # first 4 seasons, one row per season
    filtered_df = seasons_df.drop_duplicates(subset=['Season']).nsmallest(4, 'Season').copy()

    if filtered_df.empty:
        print(f'No available data for the first 4 seasons')
        return pd.DataFrame()

    for col in numeric_columns:
        if col not in filtered_df.columns:
            filtered_df[col] = 0.0
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce").fillna(0).astype(float)

    filtered_df = filtered_df.drop(columns=['Lng'], errors='ignore')

    # Aggregating a missing 'Awards' column would raise KeyError, so make
    # sure it exists.
    if 'Awards' not in filtered_df.columns:
        filtered_df['Awards'] = ''

    # making sure we have at least 4 seasons and if not filling values with 0
    shortfall = 4 - len(filtered_df)
    if shortfall > 0:
        last_season = filtered_df['Season'].max()
        blank_row = {col: 0.0 if col != 'Awards' else '' for col in filtered_df.columns}
        padding = pd.DataFrame(
            [{**blank_row, 'Season': last_season + offset} for offset in range(1, shortfall + 1)]
        )
        filtered_df = pd.concat([filtered_df, padding], ignore_index=True)

    # Aggreagating stats
    summary = {col: filtered_df[col].sum() for col in numeric_columns}
    summary['Awards'] = ', '.join(map(str, filtered_df['Awards'].dropna().unique()))

    aggregated_stats = pd.DataFrame([summary])
    aggregated_stats.insert(0, 'Start Season', start_season)

    return aggregated_stats


In [None]:
def process_center_data(center_stats_df, start_season):
    """Collapse a lineman's per-season snap counts into one summary row.

    The four lowest-numbered seasons are kept (one row per season) and padded
    with all-zero seasons when fewer than four exist. Games and snap counts
    are summed; snap percentages are averaged over those four rows, so padded
    seasons pull the averages down.

    Parameters
    ----------
    center_stats_df : pandas.DataFrame
        Per-season rows with a 'Season' column. Percentage columns may be
        strings with a trailing '%'. Stat columns that are missing are
        treated as 0. The frame is not modified.
    start_season
        Stored unchanged in the 'Start Season' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row: 'Start Season', 'G', 'GS', the three snap totals, then
        the three snap percentages. An empty frame when the input has no rows.
    """
    if center_stats_df.empty:
        print(f'No data available')
        return pd.DataFrame()

    snap_columns = ['Offense Snaps', 'Defense Snaps', 'Special Teams Snaps']
    pct_columns = ['Offense Pct', 'Defense Pct', 'Special Teams Pct']
    summed_columns = ['G', 'GS'] + snap_columns

    # Work on a copy so the caller's frame keeps its original values.
    seasons_df = center_stats_df.copy()
    seasons_df['Season'] = pd.to_numeric(seasons_df['Season'], errors='coerce')

    # first 4 seasons, one row per season
    filtered_df = seasons_df.drop_duplicates(subset=['Season']).nsmallest(4, 'Season').copy()

    if filtered_df.empty:
        print(f'No available data for the first 4 seasons')
        return pd.DataFrame()

    # removing % from string columns to turn to floats
    for col in pct_columns:
        if col in filtered_df.columns:
            filtered_df[col] = filtered_df[col].astype(str).str.replace('%', '', regex=False)

    # Create any missing stat column as 0 so the aggregation below cannot
    # fail with KeyError, and coerce unparsable cells to 0 instead of raising.
    for col in summed_columns + pct_columns:
        if col not in filtered_df.columns:
            filtered_df[col] = 0.0
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce').fillna(0).astype(float)

    # making sure we have at least 4 seasons and if not filling values with 0
    shortfall = 4 - len(filtered_df)
    if shortfall > 0:
        last_season = filtered_df['Season'].max()
        blank_row = {col: 0.0 if col != 'Awards' else '' for col in filtered_df.columns}
        padding = pd.DataFrame(
            [{**blank_row, 'Season': last_season + offset} for offset in range(1, shortfall + 1)]
        )
        filtered_df = pd.concat([filtered_df, padding], ignore_index=True)

    # Aggreagating stats: sums first, then means
    summary = {col: filtered_df[col].sum() for col in summed_columns}
    summary.update({col: filtered_df[col].mean() for col in pct_columns})

    aggregated_stats = pd.DataFrame([summary])
    aggregated_stats.insert(0, 'Start Season', start_season)

    return aggregated_stats


In [None]:
def process_punting_data(punting_stats_df, start_season):
    """Collapse a punter's per-season rows into one summary row.

    The four lowest-numbered seasons are kept (one row per season) and padded
    with all-zero seasons when fewer than four exist. Counting stats are then
    summed and rate stats averaged over those four rows, so padded seasons
    pull the averages down.

    Parameters
    ----------
    punting_stats_df : pandas.DataFrame
        Per-season rows with a 'Season' column; stat columns that are
        missing are treated as 0. The frame is not modified.
    start_season
        Stored unchanged in the 'Start Season' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row: 'Start Season', the summed columns, the averaged
        columns and, only when the input has an 'Awards' column, 'Awards'
        (distinct season award strings joined with ', '). An empty frame
        when the input has no rows.
    """
    if punting_stats_df.empty:
        print(f'No data available')
        return pd.DataFrame()

    rate_columns = ["Y/P", "NY/P", "TB%", "In20%"]
    numeric_columns = [
        'G', 'GS', 'Pnt', 'Yds', 'Y/P', 'RetYds', 'NetYds', 'NY/P',
        'Lng', 'TB', 'TB%', 'Pnt20', 'In20%', 'Blck', 'AV'
    ]

    # Work on a copy so the caller's frame keeps its original values.
    seasons_df = punting_stats_df.copy()
    seasons_df['Season'] = pd.to_numeric(seasons_df['Season'], errors='coerce')

    # first 4 seasons, one row per season
    filtered_df = seasons_df.drop_duplicates(subset=['Season']).nsmallest(4, "Season").copy()

    if filtered_df.empty:
        print(f'No available data for the first 4 seasons')
        return pd.DataFrame()

    for col in numeric_columns:
        if col not in filtered_df.columns:
            filtered_df[col] = 0.0
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce").fillna(0).astype(float)

    has_awards = 'Awards' in filtered_df.columns

    # making sure we have at least 4 seasons and if not filling values with 0
    shortfall = 4 - len(filtered_df)
    if shortfall > 0:
        last_season = filtered_df['Season'].max()
        blank_row = {col: 0.0 for col in numeric_columns}
        if has_awards:
            blank_row['Awards'] = ''
        padding = pd.DataFrame(
            [{**blank_row, 'Season': last_season + offset} for offset in range(1, shortfall + 1)]
        )
        filtered_df = pd.concat([filtered_df, padding], ignore_index=True)

    # Aggreagating stats: sums first, then means, then awards if available
    summary = {col: filtered_df[col].sum() for col in numeric_columns if col not in rate_columns}
    summary.update({col: filtered_df[col].mean() for col in rate_columns})
    if has_awards:
        summary['Awards'] = ', '.join(map(str, filtered_df['Awards'].dropna().unique()))

    aggregated_stats = pd.DataFrame([summary])
    aggregated_stats.insert(0, 'Start Season', start_season)

    return aggregated_stats


In [None]:
def process_kicking_data(kicking_stats_df, start_season):
    """Collapse a kicker's per-season rows into one summary row.

    The four lowest-numbered seasons are kept (one row per season) and padded
    with all-zero seasons when fewer than four exist. Counting stats are then
    summed and rate stats averaged over those four rows, so padded seasons
    pull the averages down.

    Parameters
    ----------
    kicking_stats_df : pandas.DataFrame
        Per-season rows with a 'Season' column; stat columns that are
        missing are treated as 0. The frame is not modified.
    start_season
        Stored unchanged in the 'Start Season' column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row: 'Start Season', the summed columns, the averaged
        columns, then 'Awards' (distinct season award strings joined with
        ', '). An empty frame when the input has no rows.
    """
    if kicking_stats_df.empty:
        print(f'No data available')
        return pd.DataFrame()

    rate_columns = ['FG%', 'XP%', 'TB%', 'KOAvg']
    numeric_columns = [
        'G', 'GS', '0-19 FGA', '0-19 FGM', '20-29 FGA', '20-29 FGM',
        '30-39 FGA', '30-39 FGM', '40-49 FGA', '40-49 FGM', '50+ FGA', '50+ FGM',
        'Total FGA', 'Total FGM', 'Lng', 'FG%', 'XPA', 'XPM', 'XP%',
        'KO', 'KOYds', 'TB', 'TB%', 'KOAvg', 'AV'
    ]

    # Work on a copy so the caller's frame keeps its original values.
    seasons_df = kicking_stats_df.copy()
    seasons_df['Season'] = pd.to_numeric(seasons_df['Season'], errors='coerce')

    # first 4 seasons, one row per season
    filtered_df = seasons_df.drop_duplicates(subset=['Season']).nsmallest(4, "Season").copy()

    if filtered_df.empty:
        print(f'No available data for the first 4 seasons')
        return pd.DataFrame()

    for col in numeric_columns:
        if col not in filtered_df.columns:
            filtered_df[col] = 0.0
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce").fillna(0.0).astype(float)

    # With four real seasons no padding row adds 'Awards', and aggregating a
    # missing column would raise KeyError, so make sure it exists.
    if 'Awards' not in filtered_df.columns:
        filtered_df['Awards'] = ''

    # making sure we have at least 4 seasons and if not filling values with 0
    shortfall = 4 - len(filtered_df)
    if shortfall > 0:
        last_season = filtered_df['Season'].max()
        blank_row = {col: 0.0 for col in numeric_columns}
        blank_row['Awards'] = ''
        padding = pd.DataFrame(
            [{**blank_row, 'Season': last_season + offset} for offset in range(1, shortfall + 1)]
        )
        filtered_df = pd.concat([filtered_df, padding], ignore_index=True)

    # Aggreagating stats: sums first, then means, then awards
    summary = {col: filtered_df[col].sum() for col in numeric_columns if col not in rate_columns}
    summary.update({col: filtered_df[col].mean() for col in rate_columns})
    summary['Awards'] = ', '.join(map(str, filtered_df['Awards'].dropna().unique()))

    aggregated_stats = pd.DataFrame([summary])
    aggregated_stats.insert(0, 'Start Season', start_season)

    return aggregated_stats

In [None]:
def process_player_data(stats_df, player_position):
    """Route a scraped stats frame to the processor for the player's position.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Per-season rows; its first 'Season' value is used as the start season.
    player_position : str
        Position code that selects the processor.

    Returns
    -------
    pandas.DataFrame
        The processor's summary row. 'OL' frames are returned unchanged.
        An empty frame is returned for an unknown position or when
        `stats_df` has no rows or no 'Season' column.
    """
    position = player_position

    # Combine measurements are passed through without aggregation.
    if position == 'OL':
        return stats_df

    # The scraper can hand back an empty frame; reading the first Season
    # from it would raise instead of reporting "no data".
    if stats_df.empty or 'Season' not in stats_df.columns:
        print('No data available')
        return pd.DataFrame()

    start_season = stats_df['Season'].iloc[0]
    if position == 'QB':
        return process_qb_data(stats_df, start_season)
    elif position in ['RB', 'WR', 'TE', 'FB']:
        return process_offense_data(stats_df, start_season)
    elif position in ['DB', 'LB', 'DE', 'DT', 'NT', 'CB', 'OLB', 'ILB', 'S', 'DL']:
        return process_defense_data(stats_df, start_season)
    elif position in ['C', 'T', 'G', 'LS']:
        return process_center_data(stats_df, start_season)
    elif position == 'P':
        return process_punting_data(stats_df, start_season)
    elif position == 'K':
        return process_kicking_data(stats_df, start_season)
    else:
        print(f'Unknown position')
        return pd.DataFrame()


In [None]:
# Summarise the single player scraped above and show the resulting row.
processed = process_player_data(stats_df=stat, player_position=player_position)
print(processed)

In [None]:
def fetch_with_adaptive_delay(url, base_delay_range=(3, 7), request_counter=[0], adaptive_delay=[3],
                              max_attempts=5, timeout=30):
    """GET `url` politely, backing off when the server pushes back.

    Parameters
    ----------
    url : str
        Address to download.
    base_delay_range : tuple of (float, float)
        Bounds of the random pause added before every request.
    request_counter : list of one int
        Running count of successful requests; an extra 10-20 second pause is
        taken after every tenth one. The list default is deliberately shared
        between calls so the count persists when no list is passed.
    adaptive_delay : list of one number
        Current backoff in seconds; doubled on a 429 response or a
        connection-level error. Shared between calls like `request_counter`.
    max_attempts : int or None
        Failed attempts allowed before giving up. None retries forever.
    timeout : float
        Seconds to wait for each HTTP response.

    Returns
    -------
    requests.Response or None
        The 200 response, or None once `max_attempts` failures have occurred.
    """
    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        # waiting before immediately sending request
        current_delay = adaptive_delay[0] + random.uniform(*base_delay_range)
        time.sleep(current_delay)

        try:
            response = requests.get(url, timeout=timeout)
        except (SSLError, ConnectionError, requests.exceptions.Timeout):
            print(f'Retrying with next attempt having a longer delay')
            adaptive_delay[0] *= 2
            time.sleep(adaptive_delay[0])
            attempt += 1
            continue

        if response.status_code == 200:
            request_counter[0] += 1
            if request_counter[0] % 10 == 0:
                # delay
                time.sleep(random.uniform(10, 20))
            return response

        if response.status_code == 429:
            fallback_wait = adaptive_delay[0] * 2
            try:
                retry_after = int(response.headers.get('Retry-After', fallback_wait))
            except (TypeError, ValueError):
                # Retry-After may be an HTTP-date rather than a second count.
                retry_after = fallback_wait
            print(f'Blocked by rate limit. Retrying in {retry_after} seconds')
            adaptive_delay[0] *= 2
            time.sleep(retry_after)
        else:
            print(f"Request failed. Attempting again")

        attempt += 1

    print(f'Giving up on {url} after {attempt} failed attempts')
    return None



In [None]:
# Shared throttle state handed to fetch_with_adaptive_delay on every call.
request_counter = [0]
adaptive_delay = [3]

# One list of single-player frames per position group.
qb_data = []
offense_data = []
defense_data = []
center_data = []
ol_data = []
punting_data = []
kicking_data = []

for index, row in final_df.iterrows():
    player_url = row['Player Link']
    player_pick = row['Pick']
    player_position = row['Position']
    player_name = row['Player Name']
    player_team = row['Team']

    start_season = row["Draft Year"]

    if not player_url:
        continue

    # Pick the processor and destination list first, so positions that would
    # be discarded anyway do not cost a request.
    if player_position == 'QB':
        processor, bucket = process_qb_data, qb_data
    elif player_position in ['RB', 'WR', 'TE', 'FB']:
        processor, bucket = process_offense_data, offense_data
    elif player_position in ['DB', 'LB', 'DE', 'DT', 'NT', 'CB', 'OLB', 'ILB', 'S', 'DL']:
        processor, bucket = process_defense_data, defense_data
    elif player_position in ['C', 'T', 'G', 'LS']:
        processor, bucket = process_center_data, center_data
    elif player_position == 'OL':
        processor, bucket = None, ol_data  # No processing function, just storing raw data
    elif player_position == 'P':
        processor, bucket = process_punting_data, punting_data
    elif player_position == 'K':
        processor, bucket = process_kicking_data, kicking_data
    else:
        print(f'Skipping {player_name}: unhandled position {player_position}')
        continue

    print(f'Scraping stats for {player_name} ({player_position}) from {player_url}')

    # NOTE: scrape_player_stats downloads the page again by itself, so this
    # response only acts as the throttle and an availability check.
    response = fetch_with_adaptive_delay(player_url, request_counter=request_counter, adaptive_delay=adaptive_delay)
    if response is None:
        continue

    player_data = scrape_player_stats(player_url, player_position)
    if player_data.empty:
        continue

    processed_row = processor(player_data, start_season) if processor is not None else player_data
    if processed_row.empty:
        continue

    # The processors already return a DataFrame, so the player details are
    # inserted directly (a DataFrame has no to_frame()).
    processed_row.insert(0, 'Player Name', player_name)
    processed_row.insert(1, 'Pick', player_pick)
    processed_row.insert(2, 'Position', player_position)
    processed_row.insert(3, 'Team', player_team)
    processed_row.insert(4, 'Draft Year', start_season)

    bucket.append(processed_row)

qb_df = pd.concat(qb_data, ignore_index=True) if qb_data else pd.DataFrame()
offense_df = pd.concat(offense_data, ignore_index=True) if offense_data else pd.DataFrame()
defense_df = pd.concat(defense_data, ignore_index=True) if defense_data else pd.DataFrame()
center_df = pd.concat(center_data, ignore_index=True) if center_data else pd.DataFrame()
ol_df = pd.concat(ol_data, ignore_index=True) if ol_data else pd.DataFrame()
punting_df = pd.concat(punting_data, ignore_index=True) if punting_data else pd.DataFrame()
kicking_df = pd.concat(kicking_data, ignore_index=True) if kicking_data else pd.DataFrame()

In [None]:
# Distinct position codes that ended up in the snap-count table.
print(pd.unique(center_df['Position']))

In [None]:
def process_awards(df):
    """Expand the comma-separated 'Awards' column into one count column per award.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'Awards' cells are strings such as 'PB, AP-1'. Cells that
        are not strings (for example NaN) count as "no awards".

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with one extra integer column per distinct award
        name, holding how many times that name occurs in the row's 'Awards'
        string. Award columns are added in sorted order. A frame without an
        'Awards' column is returned as an unchanged copy.
    """
    df = df.copy()

    # The per-position frames fall back to an empty DataFrame when nothing
    # was scraped; there is nothing to expand in that case.
    if 'Awards' not in df.columns:
        return df

    def split_awards(value):
        """Return the non-empty, stripped award names in one cell."""
        if not isinstance(value, str):
            return []
        return [name for name in (part.strip() for part in value.split(',')) if name]

    awards_per_row = [split_awards(value) for value in df['Awards']]

    all_awards = set()
    for awards in awards_per_row:
        all_awards.update(awards)

    # Sorted so the column order is the same on every run.
    for award in sorted(all_awards):
        df[award] = [awards.count(award) for awards in awards_per_row]

    return df

In [None]:
# Expand the Awards column of every table that carries one.
qb_df, offense_df, defense_df, punting_df, kicking_df = map(
    process_awards,
    (qb_df, offense_df, defense_df, punting_df, kicking_df),
)

In [None]:
# Write each position table to its own CSV in the working directory.
career_tables = {
    'qb_career.csv': qb_df,
    'offense_career.csv': offense_df,
    'defense_career.csv': defense_df,
    'center_career.csv': center_df,
    'ol_career.csv': ol_df,
    'punting_career.csv': punting_df,
    'kicking_career.csv': kicking_df,
}
for csv_name, career_df in career_tables.items():
    career_df.to_csv(csv_name)