# NCAA.com Play-by-play Data Scraper
-

In [57]:
# example schedule URL
## First Day of season
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2024/10/04/all-conf

# Last Regular Season Day
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2025/03/08/all-conf



In [58]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import os
import re

from config import recent_clean_db, last_game_date

# File paths
data_folder = os.path.join('..', 'data/') # Data Folder Path
temp_folder = os.path.join('..', 'TEMP/',) # Temp Folder Path



In [59]:
# schedule_df.head()

## Scrape the NCAA.com schedule section
- Creates a dataframe with Data - Teams and Game_ID_Number

- Turned off because it takes 6-7 minutes to run and we can use a previously scraped and locally stored schedule

In [60]:


# # Base URL for NCAA schedule
# base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"

# # Function to scrape a single day's schedule with rate limiting
# def scrape_schedule(date):
#     url = f"{base_url}/{date}/all-conf"
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to fetch data for {date}: {response.status_code}")
#         return []

#     soup = BeautifulSoup(response.text, 'html.parser')
#     games = []

#     # Locate game containers based on the provided HTML structure
#     game_containers = soup.select('#scoreboardGames .gamePod')
#     for game in game_containers:
#         try:
#             game_id = game.select_one('a.gamePod-link')['href'].split('/')[-1]
#             teams = game.select('ul.gamePod-game-teams li')
            
#             home_team = teams[0].select_one('span.gamePod-game-team-name').text.strip()
#             away_team = teams[1].select_one('span.gamePod-game-team-name').text.strip()
            
#             games.append({
#                 'Date': date,
#                 'Home_Team': home_team,
#                 'Away_Team': away_team,            

#                 'game_id_number': game_id
#             })
#         except Exception as e:
#             print(f"Error processing game: {e}")

#     return games

# # Function to scrape a range of dates with rate limiting and progress bar
# def scrape_schedule_range(start_date, end_date):
#     date_range = pd.date_range(start=start_date, end=end_date).strftime('%Y/%m/%d')
#     all_games = []
    
#     # Progress bar setup
#     for date in tqdm(date_range, desc="Scraping schedule", unit="day"):
#         games = scrape_schedule(date)
#         all_games.extend(games)
#         time.sleep(1)  # Rate limiter: 1-second delay between requests

#     return pd.DataFrame(all_games)

# # Example usage
# start_date = "2024-10-04"  # First day of the season
# end_date = "2025-03-08"    # Last regular season day
# schedule_df = scrape_schedule_range(start_date, end_date)

# # Display the resulting dataframe
# schedule_df

### Save / Load Local Copy of Schedule

In [61]:
### Save the schedule to a CSV file for later use
# schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'), index=False)

# Load the locally stored schedule to avoid having to scrape again
schedule_df = pd.read_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'))


In [62]:
schedule_df.head(20)

Unnamed: 0,Date,Home_Team_Away_Team,game_id_number
0,2024/10/04,Michigan St. vs Lake Superior St.,6344272
1,2024/10/04,Minnesota St. vs Michigan,6344249
2,2024/10/04,Bowling Green vs Mercyhurst,6344336
3,2024/10/04,Colgate vs UConn,6344337
4,2024/10/04,Miami (OH) vs Ferris St.,6344354
5,2024/10/04,Arizona St. vs Air Force,6344335
6,2024/10/05,Michigan St. vs Lake Superior St.,6344273
7,2024/10/05,Minnesota St. vs Michigan,6344250
8,2024/10/05,Bemidji St. vs Minn. Duluth,6344341
9,2024/10/05,Massachusetts vs Bentley,6344346


### Data Transgformation
- NOT NESS IF WORKING WITH NEW SCRAPE 
    - Seperate team column into Home_Team, Away_Team

In [63]:
# Seperate team column into Home_Team, Away_Team

def handle_home_away(schedule_df):
    # Split Home_Team_Away_Team into Home_Team and Away_Team
    schedule_df[['Away_Team', 'Home_Team']] = schedule_df['Home_Team_Away_Team'].str.split(' vs ', expand=True)
    
    # Remove punctuation and strip whitespace
    schedule_df['Home_Team'] = schedule_df['Home_Team'].apply(lambda x: re.sub(r'[^\w\s]', '', x).strip())
    schedule_df['Away_Team'] = schedule_df['Away_Team'].apply(lambda x: re.sub(r'[^\w\s]', '', x).strip())

    # Drop the original column
    schedule_df = schedule_df.drop(columns=['Home_Team_Away_Team'])
    return schedule_df

# call the function
schedule_df = handle_home_away(schedule_df)
schedule_df.head()

Unnamed: 0,Date,game_id_number,Away_Team,Home_Team
0,2024/10/04,6344272,Michigan St,Lake Superior St
1,2024/10/04,6344249,Minnesota St,Michigan
2,2024/10/04,6344336,Bowling Green,Mercyhurst
3,2024/10/04,6344337,Colgate,UConn
4,2024/10/04,6344354,Miami OH,Ferris St


### Load School info and replace ncaa_names with standard Team names from existing data

In [64]:
# Load School info 
school_info_path = os.path.join(data_folder, 'arena_school_info.csv')
school_info_df = pd.read_csv(school_info_path)
# school_info_df.head() # Check data

# Function to map team names to standardized names
def map_team_names(schedule_df, school_info_df):
    # Create a mapping dictionary from school_info_df
    team_mapping = {
        re.sub(r'[^\w\s]', '', row['ncaa_name']).strip(): row['Team']
        for _, row in school_info_df.iterrows()
    }

    # Map Home_Team and Away_Team to standardized names
    schedule_df['Home_Team'] = schedule_df['Home_Team'].apply(lambda x: team_mapping.get(re.sub(r'[^\w\s]', '', x).strip(), x))
    schedule_df['Away_Team'] = schedule_df['Away_Team'].apply(lambda x: team_mapping.get(re.sub(r'[^\w\s]', '', x).strip(), x))

    return schedule_df

# Call the function
schedule_df = map_team_names(schedule_df, school_info_df)

# Check the data
schedule_df.head(20)


Unnamed: 0,Date,game_id_number,Away_Team,Home_Team
0,2024/10/04,6344272,Michigan State,Lake Superior
1,2024/10/04,6344249,Minnesota State,Michigan
2,2024/10/04,6344336,Bowling Green,Mercyhurst
3,2024/10/04,6344337,Colgate,Connecticut
4,2024/10/04,6344354,Miami,Ferris State
5,2024/10/04,6344335,Arizona State,Air Force
6,2024/10/05,6344273,Michigan State,Lake Superior
7,2024/10/05,6344250,Minnesota State,Michigan
8,2024/10/05,6344341,Bemidji State,Minnesota Duluth
9,2024/10/05,6344346,Massachusetts,Bentley


### Create a New Column with Game_ID to match with the rest of the Database

In [65]:
# Function to create a unique Game_ID
def create_game_id(schedule_df):
    schedule_df['Game_ID'] = schedule_df.apply(
        lambda row: f"{row['Date'].replace('/', '-')}-{row['Away_Team']}-{row['Home_Team']}", axis=1
    )
    return schedule_df

# Call the function
schedule_df = create_game_id(schedule_df)

In [66]:
schedule_df.head(20)

Unnamed: 0,Date,game_id_number,Away_Team,Home_Team,Game_ID
0,2024/10/04,6344272,Michigan State,Lake Superior,2024-10-04-Michigan State-Lake Superior
1,2024/10/04,6344249,Minnesota State,Michigan,2024-10-04-Minnesota State-Michigan
2,2024/10/04,6344336,Bowling Green,Mercyhurst,2024-10-04-Bowling Green-Mercyhurst
3,2024/10/04,6344337,Colgate,Connecticut,2024-10-04-Colgate-Connecticut
4,2024/10/04,6344354,Miami,Ferris State,2024-10-04-Miami-Ferris State
5,2024/10/04,6344335,Arizona State,Air Force,2024-10-04-Arizona State-Air Force
6,2024/10/05,6344273,Michigan State,Lake Superior,2024-10-05-Michigan State-Lake Superior
7,2024/10/05,6344250,Minnesota State,Michigan,2024-10-05-Minnesota State-Michigan
8,2024/10/05,6344341,Bemidji State,Minnesota Duluth,2024-10-05-Bemidji State-Minnesota Duluth
9,2024/10/05,6344346,Massachusetts,Bentley,2024-10-05-Massachusetts-Bentley


In [67]:
# # Create a list of unique teams from both Home and Away columns
# teams = schedule_df['Home_Team'].unique().tolist() + schedule_df['Away_Team'].unique().tolist()
# # Only keep unique values
# teams = list(set(teams))
# # sort alphabetically
# teams = sorted(teams)

# print(teams)

In [68]:
breakpoint()

In [69]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Original Attempt, not really useful in current form
# def fetch_schedule_page(date):
#     """
#     Fetches the NCAA men's ice hockey schedule page for a given date.
#     """
#     base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"
#     url = f"{base_url}/{date.strftime('%Y/%m/%d')}/all-conf"
#     response = requests.get(url)
#     response.raise_for_status()
#     return response.text

# # Test the fetch_schedule_page function
# date = datetime.strptime("2024-10-04", "%Y-%m-%d")
# html_content = fetch_schedule_page(date)
# print(html_content)


# def extract_game_ids(html_content):
#     """
#     Extracts game IDs from the schedule page HTML content.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')
#     game_links = soup.find_all('a', href=True)
#     game_ids = set()
#     for link in game_links:
#         href = link['href']
#         if '/game/' in href:
#             parts = href.split('/')
#             if len(parts) > 2 and parts[2].isdigit():
#                 game_ids.add(parts[2])
#     return list(game_ids)

# def construct_play_by_play_url(game_id):
#     """
#     Constructs the play-by-play URL for a given game ID.
#     """
#     return f"https://www.ncaa.com/game/{game_id}/play-by-play"

# ### Construct a dataframe of all games for the season Date, GameID, URL, Home Team, Away Team
# def main(start_date, end_date):
#     """
#     Main function to scrape schedule data for all games between start_date and end_date.
#     """
#     current_date = start_date
#     while current_date <= end_date:
#         try:
#             html_content = fetch_schedule_page(current_date)
#             game_ids = extract_game_ids(html_content)
#             for game_id in game_ids:
#                 play_by_play_url = construct_play_by_play_url(game_id)
#                 # Process or store the play-by-play URL as needed
#                 print(f"Game ID: {game_id}, URL: {play_by_play_url}")
#         except requests.HTTPError as e:
#             print(f"Failed to fetch data for {current_date.strftime('%Y-%m-%d')}: {e}")
#         current_date += timedelta(days=1)

# if __name__ == "__main__":
#     # Define the date range for the season
#     start_date = datetime.strptime("2024-10-04", "%Y-%m-%d")
#     end_date = datetime.strptime("2024-10-31", "%Y-%m-%d")
#     main(start_date, end_date)

# # Display the first 5 games in the data
# print(game_ids[:5])



In [70]:
# Store the data in a dataframe
# Display the first 5 rows of the dataframe



    


## Using Custom API to Call NCAA.com
- project developed by henrygd - https://github.com/henrygd/ncaa-api

Uses his custom built API to get JSON response from NCAA.com
- can host own server for large projects for now I am using his public link

In [71]:
## TEST GAME MSU vs MINN - 1/26/25
msu_minn_ass_whoppin = "6344241"

game_id = msu_minn_ass_whoppin


### Use HENRYGD's public API to fetch data


base_url = "https://ncaa-api.henrygd.me/"
url = f"{base_url}game/{game_id}/play-by-play"



print(url)

https://ncaa-api.henrygd.me/game/6344241/play-by-play


In [72]:
# Request the url and examine the response
import requests

response = request.get(url)

print(response)

NameError: name 'request' is not defined