In [2]:
# imports
import getpass
import os
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tabulate import tabulate
from webdriver_manager.chrome import ChromeDriverManager


# Start a Chrome session that runs without a visible window.
# ChromeDriverManager().install() supplies the chromedriver path for the Service.
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no browser window
driver = webdriver.Chrome(options=options, service=service)

In [3]:
# Reuse the headless `driver` created in the setup cell above.
# Constructing a second webdriver.Chrome here would rebind `driver`, leaving
# the first browser process running with nothing left to quit it, and would
# silently discard the --headless option configured on the first one.

# go to the ESPN NFL scoreboard page
driver.get("https://www.espn.com/nfl/scoreboard")
time.sleep(5)  # give the page time to load before it is scraped


In [4]:


wait = WebDriverWait(driver, timeout=10)  # explicit-wait helper bound to the active driver

In [None]:
# Walk the week carousel backwards (left arrow) and collect every
# regular-season ("/seasontype/2") week link that appears along the way.
wait = WebDriverWait(driver, 15)  # explicit-wait helper, reused by later cells

LEFT_ARROW_SELECTOR = '.Arrow.flex.justify-center.items-center.Arrow--left'
WEEK_LINK_SELECTOR = 'div.Week.currentWeek div.Week__wrapper div.custom--week a'
MAX_CAROUSEL_CLICKS = 50      # safety bound: never loop forever if the arrow is never disabled
CAROUSEL_SETTLE_SECONDS = 1   # pause after each click so the carousel DOM can update

all_links = []

for _ in range(MAX_CAROUSEL_CLICKS):
    # using JavaScript to locate the "previous" arrow(s)
    left_arrows = driver.execute_script(
        "return [...document.querySelectorAll(arguments[0])]",
        LEFT_ARROW_SELECTOR,
    )

    # check if any elements were found
    if not left_arrows:
        print("No elements found.")
        break

    # get the hrefs of the week links currently shown in the carousel
    week_hrefs = driver.execute_script(
        "return [...document.querySelectorAll(arguments[0])].map(e => e.href);",
        WEEK_LINK_SELECTOR,
    )

    # keep only regular-season links; skip ones already collected so the
    # list stays duplicate-free and in discovery order
    for href in week_hrefs:
        if "/seasontype/2" in href and href not in all_links:
            all_links.append(href)

    # the first matching arrow is the one that gets clicked, so test that
    # same element for the 'disabled' class
    is_disabled = driver.execute_script(
        "return arguments[0].classList.contains('disabled');",
        left_arrows[0],
    )

    # if the button is disabled, stop the loop
    if is_disabled:
        print("Reached the beginning of the carousel. Stopping.")
        break

    # go to the previous carousel page, then let the DOM settle before the
    # next read; without the pause a page can be re-read stale or skipped
    left_arrows[0].click()
    time.sleep(CAROUSEL_SETTLE_SECONDS)
else:
    print(f"Stopped after {MAX_CAROUSEL_CLICKS} carousel clicks without reaching a disabled arrow.")

# Print all the collected links after the loop finishes
print("All links collected:")
for link in all_links:
    print(link)


In [None]:

# Visit every collected week page and gather the unique boxscore URLs on it.

# list to store boxscore links (unique, in discovery order)
boxscore_links = []
seen_boxscore_links = set()

# Optional debugging limit: set to an int to stop after that many weeks.
# None processes every week. Both names were previously undefined, which
# raised a NameError inside the try block on every iteration.
max_weeks_to_process = None
weeks_processed = 0

# loop through each week link
for link in all_links:
    if max_weeks_to_process is not None and weeks_processed >= max_weeks_to_process:
        break

    try:
        print(f"\nNavigating to: {link}")
        driver.get(link)  # Open the page in the browser

        # Wait until the page container is present before scraping
        wait.until(EC.presence_of_element_located((By.ID, "fittPageContainer")))

        # Extract boxscore links
        anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/boxscore/")]')

        for anchor in anchors:
            href = anchor.get_attribute('href')
            # a page can contain several anchors for the same game; keep one
            if href and '/boxscore/' in href and href not in seen_boxscore_links:
                seen_boxscore_links.add(href)
                boxscore_links.append(href)
                print(f"Found boxscore link: {href}")

        weeks_processed += 1

    except Exception as e:
        print(f"Error processing {link}: {e}")

# Derive the game IDs once. Anything following the ID in the URL (a further
# path segment or a query string) is dropped so only the ID itself remains.
game_ids = [
    link.split("gameId/")[-1].split("/")[0].split("?")[0]
    for link in boxscore_links
    if "gameId/" in link
]

# Print collected game IDs (once, not once per link)
print("\nAll boxscore links collected:")
print(game_ids)

In [7]:
# Scraping is finished: end the WebDriver session.
driver.quit()
# TODO: I need to restructure this into three tables, then union them:
# passing, rushing, receiving

In [None]:
import requests
import pandas as pd
from sqlalchemy import create_engine
import time
from tqdm import tqdm  # for progress bar (install with pip install tqdm)

# MySQL connection settings.
# Values come from environment variables so that no secret is stored in the
# notebook; the non-secret settings fall back to the local defaults. If no
# password is set in the environment, it is prompted for interactively.
DB_USER = os.environ.get("NFL_DB_USER", "root")
DB_PASSWORD = os.environ.get("NFL_DB_PASSWORD") or getpass.getpass(
    f"MySQL password for {DB_USER}: "
)
DB_HOST = os.environ.get("NFL_DB_HOST", "localhost")
DB_NAME = os.environ.get("NFL_DB_NAME", "NFL_Project2")

# URL-escape user and password: characters such as '@', ':' or '/' in a
# credential would otherwise be parsed as part of the URL structure.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}/{DB_NAME}"
)

def _collect_player_rows(data, game_id, season_type):
    """Flatten one summary payload into ``(offense_rows, defense_rows)``.

    Each row is a dict of identifying fields plus the stat values of one
    player in one stat group. Groups named passing/rushing/receiving go to
    the offense list, "defensive" goes to the defense list, and any other
    group is ignored. Missing or ``null`` sections are treated as empty.
    """
    offense_rows = []
    defense_rows = []
    week = (data.get("header") or {}).get("week", "na")

    for team in (data.get("boxscore") or {}).get("players") or []:
        team_info = team.get("team") or {}
        team_name = team_info.get("displayName", "Unknown Team")
        team_id = team_info.get("id", "na")

        for stat_group in team.get("statistics") or []:
            stat_type = (stat_group.get("name") or "Unknown Stats").lower()
            if stat_type in ("passing", "rushing", "receiving"):
                target = offense_rows
            elif stat_type == "defensive":
                target = defense_rows
            else:
                continue  # other stat groups are not exported

            labels = stat_group.get("keys") or []

            for player in stat_group.get("athletes") or []:
                athlete = player.get("athlete") or {}
                row = {
                    "Player_ID": athlete.get("id", "Unknown ID"),
                    "Game_ID": game_id,
                    "Team": team_name,
                    "season_type": season_type,
                    "Player": athlete.get("displayName", "Unknown Player"),
                    "Team_Id": team_id,
                    "Week": week,
                }
                # pair each stat key with the value at the same position
                row.update(zip(labels, player.get("stats") or []))
                target.append(row)

    return offense_rows, defense_rows


def get_player_stats(game_id, retries=3):
    """Fetch player stats for one game and append them to MySQL.

    Requests the ESPN summary and scoreboard endpoints for ``game_id``,
    flattens the boxscore player statistics, and appends the rows to the
    ``offense_stats`` and ``defense_stats`` tables through the module-level
    ``engine``.

    Args:
        game_id: ESPN event/game identifier, interpolated into both URLs.
        retries: Maximum number of attempts when a request fails with a
            ``requests`` exception.

    Returns:
        True if the game was fetched and its rows were written (or there
        was nothing to write); False if every attempt failed or a
        non-request error occurred. Errors are printed, never raised.
    """
    url = f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/summary?event={game_id}"
    scoreboard_url = f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard/{game_id}"

    for attempt in range(retries):
        try:
            # Add delay between requests to avoid rate limiting
            time.sleep(0.5)

            response = requests.get(url, timeout=10)
            s_response = requests.get(scoreboard_url, timeout=10)

            response.raise_for_status()
            s_response.raise_for_status()

            score = s_response.json()
            data = response.json()

            season_type = (score.get("season") or {}).get("slug", "na")
            offense_rows, defense_rows = _collect_player_rows(data, game_id, season_type)

            # Export to MySQL (only tables that actually have rows)
            if offense_rows:
                pd.DataFrame(offense_rows).to_sql(
                    "offense_stats", con=engine, if_exists="append", index=False
                )
                print(f"Offensive stats for Game ID {game_id} inserted into MySQL ✅")

            if defense_rows:
                pd.DataFrame(defense_rows).to_sql(
                    "defense_stats", con=engine, if_exists="append", index=False
                )
                print(f"Defensive stats for Game ID {game_id} inserted into MySQL ✅")

            return True

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed for Game ID {game_id}: {e}")
            if attempt == retries - 1:
                print(f"Failed to process Game ID {game_id} after {retries} attempts")
                break  # no further attempt follows, so do not sleep again
            time.sleep(2)  # Wait longer between retries
        except Exception as e:
            # Not a transient network problem: retrying would not help
            print(f"Unexpected error processing Game ID {game_id}: {e}")
            return False

    return False
        
if __name__ == "__main__":
    # Extract the game ID from each boxscore link.
    # - Only the segment right after "gameId/" is kept, so a further path
    #   segment or query string cannot leak into the API URL.
    # - dict.fromkeys drops repeated IDs while preserving order, so a game
    #   linked more than once is not inserted into the database twice.
    game_ids = list(dict.fromkeys(
        link.split("gameId/")[-1].split("/")[0].split("?")[0]
        for link in boxscore_links
        if "gameId/" in link
    ))

    # Process each game ID with progress bar
    for game_id in tqdm(game_ids, desc="Processing Games"):
        print(f"\nProcessing Game ID: {game_id}")
        get_player_stats(game_id)