# NCAA.com Play-by-play Data Scraper
-

In [1]:
# example schedule URL
## First Day of season
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2024/10/04/all-conf

# Last Regular Season Day
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2025/03/08/all-conf



In [10]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def fetch_schedule_page(date):
    """
    Fetches the NCAA men's ice hockey schedule page for a given date.
    """
    base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"
    url = f"{base_url}/{date.strftime('%Y/%m/%d')}/all-conf"
    response = requests.get(url)
    response.raise_for_status()
    return response.text

# Test the fetch_schedule_page function
date = datetime.strptime("2024-10-04", "%Y-%m-%d")
html_content = fetch_schedule_page(date)
print(html_content)


def extract_game_ids(html_content):
    """
    Extracts game IDs from the schedule page HTML content.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    game_links = soup.find_all('a', href=True)
    game_ids = set()
    for link in game_links:
        href = link['href']
        if '/game/' in href:
            parts = href.split('/')
            if len(parts) > 2 and parts[2].isdigit():
                game_ids.add(parts[2])
    return list(game_ids)

def construct_play_by_play_url(game_id):
    """
    Constructs the play-by-play URL for a given game ID.
    """
    return f"https://www.ncaa.com/game/{game_id}/play-by-play"

### Construct a dataframe of all games for the season Date, GameID, URL, Home Team, Away Team
def main(start_date, end_date):
    """
    Main function to scrape schedule data for all games between start_date and end_date.
    """
    current_date = start_date
    while current_date <= end_date:
        try:
            html_content = fetch_schedule_page(current_date)
            game_ids = extract_game_ids(html_content)
            for game_id in game_ids:
                play_by_play_url = construct_play_by_play_url(game_id)
                # Process or store the play-by-play URL as needed
                print(f"Game ID: {game_id}, URL: {play_by_play_url}")
        except requests.HTTPError as e:
            print(f"Failed to fetch data for {current_date.strftime('%Y-%m-%d')}: {e}")
        current_date += timedelta(days=1)

if __name__ == "__main__":
    # Define the date range for the season
    start_date = datetime.strptime("2024-10-04", "%Y-%m-%d")
    end_date = datetime.strptime("2024-10-31", "%Y-%m-%d")
    main(start_date, end_date)

# Display the first 5 games in the data
print(game_ids[:5])


## Prints simpl
# def main(start_date, end_date):
#     """
#     Main function to scrape schedule data for all games between start_date and end_date.
#     """
#     current_date = start_date
#     while current_date <= end_date:
#         try:
#             html_content = fetch_schedule_page(current_date)
#             game_ids = extract_game_ids(html_content)
#             for game_id in game_ids:
#                 play_by_play_url = construct_play_by_play_url(game_id)
#                 # Process or store the play-by-play URL as needed
#                 print(f"Game ID: {game_id}, URL: {play_by_play_url}")
#         except requests.HTTPError as e:
#             print(f"Failed to fetch data for {current_date.strftime('%Y-%m-%d')}: {e}")
#         current_date += timedelta(days=1)

# if __name__ == "__main__":
#     # Define the date range for the season
#     start_date = datetime.strptime("2024-10-04", "%Y-%m-%d")
#     end_date = datetime.strptime("2025-03-08", "%Y-%m-%d")
#     main(start_date, end_date)

# Display the first 5 games in the data





### TAAKEN OUT 1/26/25 - Unessary for this project - found a better way to fetch data
# def fetch_play_by_play_data(game_id):
#     """
#     Fetches and processes the play-by-play data for a given game ID.
#     """
#     url = construct_play_by_play_url(game_id)
#     response = requests.get(url)
#     response.raise_for_status()
#     # Process the play-by-play data as needed
#     return response.text

# def main(start_date, end_date):
#     """
#     Main function to scrape play-by-play data for all games between start_date and end_date.
#     """
#     current_date = start_date
#     while current_date <= end_date:
#         try:
#             html_content = fetch_schedule_page(current_date)
#             game_ids = extract_game_ids(html_content)
#             for game_id in game_ids:
#                 play_by_play_data = fetch_play_by_play_data(game_id)
#                 # Process or store the play-by-play data as needed
#                 print(f"Processed play-by-play data for game ID: {game_id}")
#         except requests.HTTPError as e:
#             print(f"Failed to fetch data for {current_date.strftime('%Y-%m-%d')}: {e}")
#         current_date += timedelta(days=1)

# if __name__ == "__main__":
#     # Define the date range for the season
#     start_date = datetime.strptime("2024-10-04", "%Y-%m-%d")
#     end_date = datetime.strptime("2025-03-08", "%Y-%m-%d")
#     main(start_date, end_date)


<!DOCTYPE html>
<html lang="en" dir="ltr" prefix="og: https://ogp.me/ns#">
  <head>
    <meta charset="utf-8" />
<script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={xpid:"UgYCUlJACQMDVVZVDgIF",licenseKey:"b4c08c65aa",applicationID:"103177932"};;/*! For license information please see nr-loader-full-1.278.3.min.js.LICENSE.txt */
<script type="text/javascript">var turner_metadata = {"page_name":"ncaa college men's ice hockey scores, schedule","keywords":"scores, scoreboard, college, all-conf, 10\/04\/2024, october 4th, 2024, d1, di, division i, icehockey, ice hockey, icehockey-men, men's ice hockey","url":"https:\/\/www.ncaa.com\/scoreboard\/icehockey-men\/d1\/2024\/10\/04","division":"d1","sport":"icehockey","sport_div":"icehockey-men","content_type":"scoreboard","section":"icehockey-men-d1","subsection":"icehockey-men","gender":"men","date_created":"2018-08-21T15:00:00Z","date_published":"2025-01

KeyboardInterrupt: 

In [None]:
# Store the data in a dataframe
# Display the first 5 rows of the dataframe



    


## Using Custom API to Call NCAA.com
- project developed by henrygd - https://github.com/henrygd/ncaa-api

Uses his custom built API to get JSON response from NCAA.com
- can host own server for large projects for now I am using his public link

In [4]:
## TEST GAME MSU vs MINN - 1/26/25
msu_minn_ass_whoppin = "6344241"

game_id = msu_minn_ass_whoppin


### Use HENRYGD's public API to fetch data


base_url = "https://ncaa-api.henrygd.me/"
url = f"{base_url}game/{game_id}/play-by-play"



print(url)

https://ncaa-api.henrygd.me/game/6344241/play-by-play


In [8]:
# Request the url and examine the response
import requests

response = request.get(url)

print(response)

NameError: name 'request' is not defined