# NCAA.com Play-by-play Data Scraper
-

In [1]:
# example schedule URL
## First Day of season
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2024/10/04/all-conf

# Last Regular Season Day
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2025/03/08/all-conf



In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def fetch_schedule_page(date):
    """
    Fetches the NCAA men's ice hockey schedule page for a given date.
    """
    base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"
    url = f"{base_url}/{date.strftime('%Y/%m/%d')}/all-conf"
    response = requests.get(url)
    response.raise_for_status()
    return response.text

def extract_game_ids(html_content):
    """
    Extracts game IDs from the schedule page HTML content.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    game_links = soup.find_all('a', href=True)
    game_ids = set()
    for link in game_links:
        href = link['href']
        if '/game/' in href:
            parts = href.split('/')
            if len(parts) > 2 and parts[2].isdigit():
                game_ids.add(parts[2])
    return list(game_ids)

def construct_play_by_play_url(game_id):
    """
    Constructs the play-by-play URL for a given game ID.
    """
    return f"https://www.ncaa.com/game/{game_id}/play-by-play"

def fetch_play_by_play_data(game_id):
    """
    Fetches and processes the play-by-play data for a given game ID.
    """
    url = construct_play_by_play_url(game_id)
    response = requests.get(url)
    response.raise_for_status()
    # Process the play-by-play data as needed
    return response.text

def main(start_date, end_date):
    """
    Main function to scrape play-by-play data for all games between start_date and end_date.
    """
    current_date = start_date
    while current_date <= end_date:
        try:
            html_content = fetch_schedule_page(current_date)
            game_ids = extract_game_ids(html_content)
            for game_id in game_ids:
                play_by_play_data = fetch_play_by_play_data(game_id)
                # Process or store the play-by-play data as needed
                print(f"Processed play-by-play data for game ID: {game_id}")
        except requests.HTTPError as e:
            print(f"Failed to fetch data for {current_date.strftime('%Y-%m-%d')}: {e}")
        current_date += timedelta(days=1)

if __name__ == "__main__":
    # Define the date range for the season
    start_date = datetime.strptime("2024-10-04", "%Y-%m-%d")
    end_date = datetime.strptime("2025-03-08", "%Y-%m-%d")
    main(start_date, end_date)


Processed play-by-play data for game ID: 6344249
Processed play-by-play data for game ID: 6344335
Processed play-by-play data for game ID: 6344272
Processed play-by-play data for game ID: 6344337
Processed play-by-play data for game ID: 6344354
Processed play-by-play data for game ID: 6344336
Processed play-by-play data for game ID: 6344343
Processed play-by-play data for game ID: 6344351
Processed play-by-play data for game ID: 6344341
Processed play-by-play data for game ID: 6344348
Processed play-by-play data for game ID: 6344352
Processed play-by-play data for game ID: 6344179
Processed play-by-play data for game ID: 6344346
Processed play-by-play data for game ID: 6344350
Processed play-by-play data for game ID: 6344250
Processed play-by-play data for game ID: 6344347
Processed play-by-play data for game ID: 6344368
Processed play-by-play data for game ID: 6344349
Processed play-by-play data for game ID: 6344353
Processed play-by-play data for game ID: 6344273
Processed play-by-pl