# NCAA.com Play-by-play Data Scraper
-

In [104]:
# example schedule URL
## First Day of season
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2024/10/04/all-conf

# Last Regular Season Day
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2025/03/08/all-conf



In [105]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import os
import re

from config import recent_clean_db, last_game_date

# File paths
data_folder = os.path.join('..', 'data/') # Data Folder Path
temp_folder = os.path.join('..', 'TEMP/',) # Temp Folder Path



In [106]:
# schedule_df.head()

## Scrape the NCAA.com schedule section
- Creates a dataframe with Data - Teams and Game_ID_Number

- Turned off because it takes 6-7 minutes to run and we can use a previously scraped and locally stored schedule

In [107]:


# # Base URL for NCAA schedule
# base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"

# # Function to scrape a single day's schedule with rate limiting
# def scrape_schedule(date):
#     url = f"{base_url}/{date}/all-conf"
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to fetch data for {date}: {response.status_code}")
#         return []

#     soup = BeautifulSoup(response.text, 'html.parser')
#     games = []

#     # Locate game containers based on the provided HTML structure
#     game_containers = soup.select('#scoreboardGames .gamePod')
#     for game in game_containers:
#         try:
#             game_id = game.select_one('a.gamePod-link')['href'].split('/')[-1]
#             teams = game.select('ul.gamePod-game-teams li')
            
#             home_team = teams[0].select_one('span.gamePod-game-team-name').text.strip()
#             away_team = teams[1].select_one('span.gamePod-game-team-name').text.strip()
            
#             games.append({
#                 'Date': date,
#                 'Home_Team': home_team,
#                 'Away_Team': away_team,            

#                 'game_id_number': game_id
#             })
#         except Exception as e:
#             print(f"Error processing game: {e}")

#     return games

# # Function to scrape a range of dates with rate limiting and progress bar
# def scrape_schedule_range(start_date, end_date):
#     date_range = pd.date_range(start=start_date, end=end_date).strftime('%Y/%m/%d')
#     all_games = []
    
#     # Progress bar setup
#     for date in tqdm(date_range, desc="Scraping schedule", unit="day"):
#         games = scrape_schedule(date)
#         all_games.extend(games)
#         time.sleep(1)  # Rate limiter: 1-second delay between requests

#     return pd.DataFrame(all_games)

# # Example usage
# start_date = "2024-10-04"  # First day of the season
# end_date = "2025-03-08"    # Last regular season day
# schedule_df = scrape_schedule_range(start_date, end_date)

# # Display the resulting dataframe
# schedule_df

### Save / Load Local Copy of Schedule

In [108]:
### Save the schedule to a CSV file for later use
# schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'), index=False)

# Load the locally stored schedule to avoid having to scrape again
schedule_df = pd.read_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'))


In [109]:
schedule_df.head(20)

Unnamed: 0,Date,Home_Team_Away_Team,game_id_number
0,2024/10/04,Michigan St. vs Lake Superior St.,6344272
1,2024/10/04,Minnesota St. vs Michigan,6344249
2,2024/10/04,Bowling Green vs Mercyhurst,6344336
3,2024/10/04,Colgate vs UConn,6344337
4,2024/10/04,Miami (OH) vs Ferris St.,6344354
5,2024/10/04,Arizona St. vs Air Force,6344335
6,2024/10/05,Michigan St. vs Lake Superior St.,6344273
7,2024/10/05,Minnesota St. vs Michigan,6344250
8,2024/10/05,Bemidji St. vs Minn. Duluth,6344341
9,2024/10/05,Massachusetts vs Bentley,6344346


### Data Transgformation
- NOT NESS IF WORKING WITH NEW SCRAPE 
    - Seperate team column into Home_Team, Away_Team

In [110]:
# Seperate team column into Home_Team, Away_Team

def handle_home_away(schedule_df):
    # Split Home_Team_Away_Team into Home_Team and Away_Team
    schedule_df[['Away_Team', 'Home_Team']] = schedule_df['Home_Team_Away_Team'].str.split(' vs ', expand=True)
    
    # Remove punctuation and strip whitespace
    schedule_df['Home_Team'] = schedule_df['Home_Team'].apply(lambda x: re.sub(r'[^\w\s]', '', x).strip())
    schedule_df['Away_Team'] = schedule_df['Away_Team'].apply(lambda x: re.sub(r'[^\w\s]', '', x).strip())

    # Drop the original column
    schedule_df = schedule_df.drop(columns=['Home_Team_Away_Team'])
    return schedule_df

# call the function
schedule_df = handle_home_away(schedule_df)
schedule_df.head()

Unnamed: 0,Date,game_id_number,Away_Team,Home_Team
0,2024/10/04,6344272,Michigan St,Lake Superior St
1,2024/10/04,6344249,Minnesota St,Michigan
2,2024/10/04,6344336,Bowling Green,Mercyhurst
3,2024/10/04,6344337,Colgate,UConn
4,2024/10/04,6344354,Miami OH,Ferris St


### Load School info and replace ncaa_names with standard Team names from existing data

In [111]:
# Load School info 
school_info_path = os.path.join(data_folder, 'arena_school_info.csv')
school_info_df = pd.read_csv(school_info_path)
# school_info_df.head() # Check data

# Function to map team names to standardized names
def map_team_names(schedule_df, school_info_df):
    # Create a mapping dictionary from school_info_df
    team_mapping = {
        re.sub(r'[^\w\s]', '', row['ncaa_name']).strip(): row['Team']
        for _, row in school_info_df.iterrows()
    }

    # Map Home_Team and Away_Team to standardized names
    schedule_df['Home_Team'] = schedule_df['Home_Team'].apply(lambda x: team_mapping.get(re.sub(r'[^\w\s]', '', x).strip(), x))
    schedule_df['Away_Team'] = schedule_df['Away_Team'].apply(lambda x: team_mapping.get(re.sub(r'[^\w\s]', '', x).strip(), x))

    return schedule_df

# Call the function
schedule_df = map_team_names(schedule_df, school_info_df)

# Check the data
schedule_df.head(20)


Unnamed: 0,Date,game_id_number,Away_Team,Home_Team
0,2024/10/04,6344272,Michigan State,Lake Superior
1,2024/10/04,6344249,Minnesota State,Michigan
2,2024/10/04,6344336,Bowling Green,Mercyhurst
3,2024/10/04,6344337,Colgate,Connecticut
4,2024/10/04,6344354,Miami,Ferris State
5,2024/10/04,6344335,Arizona State,Air Force
6,2024/10/05,6344273,Michigan State,Lake Superior
7,2024/10/05,6344250,Minnesota State,Michigan
8,2024/10/05,6344341,Bemidji State,Minnesota Duluth
9,2024/10/05,6344346,Massachusetts,Bentley


### Create a New Column with Game_ID to match with the rest of the Database

In [112]:
# Function to create a unique Game_ID
def create_game_id(schedule_df):
    schedule_df['Game_ID'] = schedule_df.apply(
        lambda row: f"{row['Date'].replace('/', '-')}-{row['Away_Team']}-{row['Home_Team']}", axis=1
    )
    return schedule_df

# Call the function
schedule_df = create_game_id(schedule_df)

In [113]:
# schedule_df.head(20)

## Get Play By Play JSONs
- 

In [114]:
# breakpoint()

## Using Custom API to Call NCAA.com
- project developed by henrygd - https://github.com/henrygd/ncaa-api

Uses his custom built API to get JSON response from NCAA.com
- can host own server for large projects for now I am using his public link

In [115]:
# import requests
# from datetime import datetime

# # Base URL for the custom API
# base_url = "https://ncaa-api.henrygd.me/game"

# # Function to get play-by-play JSON for a single game
# def get_play_by_play(game_id_number):
#     url = f"{base_url}/{game_id_number}/play-by-play"
#     try:
#         response = requests.get(url)
#         response.raise_for_status()  # Raise HTTPError for bad responses
#         return response.json()
#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching data for Game ID {game_id_number}: {e}")
#         return None

# # Function to fetch JSON data for all completed games
# def fetch_play_by_play_data(schedule_df):
#     # Filter for games that have already taken place
#     today = datetime.now().strftime('%Y-%m-%d')
#     completed_games = schedule_df[schedule_df['Date'] < today].copy()

#     # Initialize a new column for play-by-play JSON
#     completed_games['Play_By_Play_JSON'] = None

#     for index, row in completed_games.iterrows():
#         game_id_number = row['game_id_number']
#         json_data = get_play_by_play(game_id_number)
#         completed_games.at[index, 'Play_By_Play_JSON'] = json_data

#     return completed_games

# # Fetch and update the dataframe with play-by-play JSONs
# updated_schedule_df = fetch_play_by_play_data(schedule_df)

# # Check the updated dataframe
# updated_schedule_df.head()


In [116]:
## Save the dataframe in a way that doesn't cut off the json data - CSV cuts off the json data
## Use Pickle
# Save the updated dataframe to a pickle file
# updated_schedule_df.to_pickle(os.path.join(temp_folder, 'schedule_with_play_by_play.pkl'))

## Load pickle file to avoid having to scrape again
# Load the updated dataframe from a pickle file
updated_schedule_df = pd.read_pickle(os.path.join(temp_folder, 'schedule_with_play_by_play.pkl'))



### Transform Play by Play JSONs

#### Extract all decriptions into a text file to study
- Use this to make an abbr and alternate name dictionary for a find and replace

In [117]:
### SERVED IT'S PURPOSE - UNIQUE NAMES ADDED TO school_info_df

# # Function to extract all descriptions from the JSON data
# def extract_all_descriptions(schedule_df):
#     descriptions = []

#     for _, row in schedule_df.iterrows():
#         json_data = row['Play_By_Play_JSON']
#         if json_data:
#             for period in json_data['periods']:
#                 for play in period['playStats']:
#                     description = play.get('visitorText') or play.get('homeText')
#                     if description:
#                         descriptions.append(description.strip())
#     return descriptions

# # Extract all descriptions
# all_descriptions = extract_all_descriptions(updated_schedule_df)

# # Save descriptions to a text file
# output_file_path = "../TEMP/all_descriptions.txt"
# with open(output_file_path, "w", encoding="utf-8") as file:
#     for description in all_descriptions:
#         file.write(description + "\n")

# print(f"Descriptions have been saved to {output_file_path}")


In [118]:
school_info_df.head()
# Create a dictionary mapping the alternate names (ncaa_data_alts) to the standardized names


Unnamed: 0,Team,Arena,Capacity,Sheet_length,Sheet_width,School,Latitude,Longitude,hex1,hex2,hex3,simp_color,logo_abv,abv,ncaa_name,ncaa_data_alts
0,Air Force,Cadet Ice Arena,2470,200,85,Air Force,39.013739,-104.883727,3087,8a8d8f,,,afa,Air Force,Air Force,"AIRFOR, Air Force"
1,Alaska,Carlson Center,4595,200,100,Alaska,64.842124,-147.763841,236192,ffcd00,,,akf,Alaska,Alas Anchorage,"AK FBK, Alas. Fairbanks"
2,Alaska Anchorage,Avis Alaska Sports Complex,800,200,85,Alaska-Anchorage,61.205536,-149.872737,00583d,ffc425,,,aka,UAA,Alas Fairbanks,"AK ANC, Alas. Anchorage"
3,American Int'l,MassMutual Center,6866,200,85,American Int'l,42.118003,-72.554326,0,ffb60f,,,aic,AIC,American Intl,"AM INT, American Int'l"
4,Arizona State,Mullett Arena,5000,200,85,Arizona State,33.447156,-111.910867,8c1d40,ffc627,,,asu,ASU,Arizona St,"AZ ST, Arizona St."


In [119]:
import pandas as pd
import re
import unicodedata

# Function to convert period and time to continuous time
def convert_to_continuous_time(period, time):
    period_offsets = {'1': 0, '2': 20, '3': 40, 'OT': 60}
    minutes, seconds = map(int, time.split(':'))
    elapsed_time = (20 - minutes) * 60 + -seconds
    offset = period_offsets.get(period, 0) * 60
    return offset + elapsed_time

# Function to normalize names to handle accents and special characters
def normalize_name(name):
    if not name:
        return None
    # Normalize Unicode accents and remove non-ASCII characters
    normalized = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')
    return normalized

# Enhanced player name formatting function
def clean_player_name(name):
    """
    Converts a name from "Last, First" to "First Last", handling punctuation and normalization.
    Example: "Hughes, T.J." -> "T.J. Hughes"
    """
    if not name:
        return None
    name = normalize_name(name)
    parts = [p.strip() for p in name.split(',')]
    if len(parts) == 2:
        last, first = parts
        return f"{first} {last}".strip()
    return name

# Function to parse play-by-play descriptions
# Improved to handle team abbreviations and player names with issues
def parse_description(description):
    """
    Parse a single play-by-play description into structured fields.
    """
    desc_lower = description.lower().strip()
    parsed = {
        "Event_type": "Other",
        "Primary_player": None,
        "Primary_team": None,
        "Secondary_player": None,
        "Secondary_team": None,
        "Outcome": None,
    }

    # Normalize known team abbreviations
    team_map = {
        'michst': 'MICHST',
        'lake sup': 'LK SUP',
        'lk sup': 'LK SUP',
        'michigan state': 'MICHST',
        'lake superior': 'LK SUP'
    }

    for key, value in team_map.items():
        desc_lower = desc_lower.replace(key, value.lower())

    # --- Faceoff ---
    if "faceoff" in desc_lower:
        parsed["Event_type"] = "Faceoff"
        faceoff_pattern = (
            r"Faceoff\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+"
            r"vs\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+"
            r"won by\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\."
        )
        match = re.search(faceoff_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Secondary_player"] = clean_player_name(match.group(2))
            parsed["Primary_team"] = match.group(3).strip()
            parsed["Outcome"] = "won"
        return parsed

    # --- Goal ---
    if "goal by" in desc_lower:
        parsed["Event_type"] = "Goal"
        goal_scorer_pattern = r"Goal by\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)"
        match = re.search(goal_scorer_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
        return parsed

    # --- Penalty ---
    if desc_lower.startswith("penalty on"):
        parsed["Event_type"] = "Penalty"
        penalty_pattern = (
            r"Penalty on\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-ÿ'\.\- ]+)\s+"
            r"([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+(\d+) minutes for (.+)"
        )
        match = re.search(penalty_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Primary_team"] = match.group(2).strip()
            parsed["Penalty_duration"] = match.group(3).strip()
            parsed["Penalty_type"] = match.group(4).strip()
        return parsed

    # --- Shot ---
    if "shot by" in desc_lower:
        parsed["Event_type"] = "Shot"
        shot_pattern = r"Shot by\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+(.+)"
        match = re.search(shot_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Primary_team"] = match.group(2).strip()
        return parsed

    return parsed

# Function to transform a single game's JSON data into a dataframe
def transform_single_game(json_data, game_id):
    rows = []

    for period in json_data['periods']:
        period_number = period['periodNumber']
        for play in period['playStats']:
            row = {
                'Game_ID': game_id,
                'Period': period_number,
                'Time': play['time'],
                'Description': play['visitorText'] or play['homeText'],
                'Score': play['score']
            }
            rows.append(row)

    game_df = pd.DataFrame(rows)

    # Convert period and time to continuous time
    game_df['Period'] = game_df['Period'].replace({'1st': '1', '2nd': '2', '3rd': '3', 'OT': 'OT'})
    game_df['Time'] = game_df.apply(lambda row: convert_to_continuous_time(row['Period'], row['Time']), axis=1)

    # Parse descriptions
    parsed_descriptions = game_df['Description'].apply(parse_description)
    parsed_df = pd.DataFrame(parsed_descriptions.tolist())

    # Combine with original game_df
    return pd.concat([game_df, parsed_df], axis=1)

# Function to process all games and combine into a single dataframe
def process_all_games(schedule_df):
    all_games = []

    for _, row in schedule_df.iterrows():
        game_id = row['Game_ID']
        json_data = row['Play_By_Play_JSON']

        if json_data:
            game_df = transform_single_game(json_data, game_id)
            all_games.append(game_df)

    return pd.concat(all_games, ignore_index=True)

# Example usage
# Assuming `updated_schedule_df` is the dataframe containing the JSON play-by-play data
final_pbp_df = process_all_games(updated_schedule_df)

# Display the resulting dataframe
final_pbp_df.head()

Unnamed: 0,Game_ID,Period,Time,Description,Score,Event_type,Primary_player,Primary_team,Secondary_player,Secondary_team,Outcome,Penalty_duration,Penalty_type
0,2024-10-04-Michigan State-Lake Superior,1,0,"Faceoff Shoudy, Tiernan vs Herrington, John wo...",,Faceoff,Tiernan Shoudy,MICHST,John Herrington,,won,,
1,2024-10-04-Michigan State-Lake Superior,1,0,Rorke Applebee at goalie for Lake Superior St.,,Other,,,,,,,
2,2024-10-04-Michigan State-Lake Superior,1,0,Trey Augustine at goalie for Michigan St.,,Other,,,,,,,
3,2024-10-04-Michigan State-Lake Superior,1,46,"Shot by MICHST Shoudy, Tiernan WIDE.",,Shot,MICHST,"Shoudy, Tiernan WIDE.",,,,,
4,2024-10-04-Michigan State-Lake Superior,1,67,"Shot by MICHST Lukashevich, Vladislav BLOCKED ...",,Shot,MICHST,"Lukashevich, Vladislav BLOCKED by Bakos, Timo.",,,,,


In [120]:
# Examine the data
final_pbp_df.info()
# Value counts
final_pbp_df['Event_type'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108093 entries, 0 to 108092
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Game_ID           108093 non-null  object
 1   Period            108093 non-null  object
 2   Time              108093 non-null  int64 
 3   Description       108093 non-null  object
 4   Score             108093 non-null  object
 5   Event_type        108093 non-null  object
 6   Primary_player    93478 non-null   object
 7   Primary_team      90638 non-null   object
 8   Secondary_player  31625 non-null   object
 9   Secondary_team    0 non-null       object
 10  Outcome           31625 non-null   object
 11  Penalty_duration  3833 non-null    object
 12  Penalty_type      3833 non-null    object
dtypes: int64(1), object(12)
memory usage: 10.7+ MB


Event_type
Shot       55180
Faceoff    32112
Other      13946
Penalty     3992
Goal        2863
Name: count, dtype: int64

In [121]:
## Save the dataframe to a CSV file
final_pbp_df.to_csv(os.path.join(temp_folder, 'pbp_data_test_1.csv'), index=False)

In [123]:
### Save Updated Schedule DF (With PbP JSONs) to csv to avoid scraping for new tests

updated_schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_PbP_JSON.csv'), index=False)
