# NCAA.com Play-by-play Data Scraper
-

In [1]:
# example schedule URL
## First Day of season
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2024/10/04/all-conf

# Last Regular Season Day
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2025/03/08/all-conf



In [2]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import os
import re
import json

from config import recent_clean_db, last_game_date

# File paths
data_folder = os.path.join('..', 'data/') # Data Folder Path
temp_folder = os.path.join('..', 'TEMP/',) # Temp Folder Path



In [3]:
# schedule_df.head()

## Scrape the NCAA.com schedule section
- Creates a dataframe with Date, Home_Team, Away_Team and game_id_number

- Turned off because it takes 6-7 minutes to run and we can use a previously scraped and locally stored schedule

In [4]:


# # Base URL for NCAA schedule
# base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"

# # Function to scrape a single day's schedule with rate limiting
# def scrape_schedule(date):
#     url = f"{base_url}/{date}/all-conf"
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to fetch data for {date}: {response.status_code}")
#         return []

#     soup = BeautifulSoup(response.text, 'html.parser')
#     games = []

#     # Locate game containers based on the provided HTML structure
#     game_containers = soup.select('#scoreboardGames .gamePod')
#     for game in game_containers:
#         try:
#             game_id = game.select_one('a.gamePod-link')['href'].split('/')[-1]
#             teams = game.select('ul.gamePod-game-teams li')
            
#             home_team = teams[0].select_one('span.gamePod-game-team-name').text.strip()
#             away_team = teams[1].select_one('span.gamePod-game-team-name').text.strip()
            
#             games.append({
#                 'Date': date,
#                 'Home_Team': home_team,
#                 'Away_Team': away_team,            

#                 'game_id_number': game_id
#             })
#         except Exception as e:
#             print(f"Error processing game: {e}")

#     return games

# # Function to scrape a range of dates with rate limiting and progress bar
# def scrape_schedule_range(start_date, end_date):
#     date_range = pd.date_range(start=start_date, end=end_date).strftime('%Y/%m/%d')
#     all_games = []
    
#     # Progress bar setup
#     for date in tqdm(date_range, desc="Scraping schedule", unit="day"):
#         games = scrape_schedule(date)
#         all_games.extend(games)
#         time.sleep(1)  # Rate limiter: 1-second delay between requests

#     return pd.DataFrame(all_games)

# # Example usage
# start_date = "2024-10-04"  # First day of the season
# end_date = "2025-03-08"    # Last regular season day
# schedule_df = scrape_schedule_range(start_date, end_date)

# # Display the resulting dataframe
# schedule_df

### Save / Load Local Copy of Schedule

In [5]:
### Save the schedule to a CSV file for later use
# schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'), index=False)

# Load the locally stored schedule to avoid having to scrape again
schedule_df = pd.read_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'))


In [None]:
schedule_df.head(20)

### Data Transformation
- NOT NECESSARY IF WORKING WITH A NEW SCRAPE
    - Separate the combined team column into Home_Team and Away_Team

In [None]:
# Separate the combined team column into Home_Team and Away_Team

def handle_home_away(schedule_df):
    """
    Split the combined 'Home_Team_Away_Team' column into 'Away_Team' and 'Home_Team'.

    The combined value is read as "<away> vs <home>": the text before the first
    ' vs ' becomes 'Away_Team' and the text after it becomes 'Home_Team'.
    Punctuation is removed from both names and surrounding whitespace is stripped.

    Args:
        schedule_df (pd.DataFrame): Schedule data. If it has no
            'Home_Team_Away_Team' column (for example the cell was already run,
            or the schedule already carries separate team columns) it is
            returned unchanged.

    Returns:
        pd.DataFrame: A new dataframe with 'Away_Team' and 'Home_Team' columns and
        without 'Home_Team_Away_Team'. The input dataframe is not modified.
    """
    combined_col = 'Home_Team_Away_Team'

    # Nothing to split: keeps the cell safe to re-run and safe on a fresh scrape
    if combined_col not in schedule_df.columns:
        return schedule_df

    result = schedule_df.copy()

    # n=1 splits on the first ' vs ' only, so there are never more than two parts;
    # reindex guarantees both parts exist even when no row contains ' vs '
    halves = (
        result[combined_col]
        .str.split(' vs ', n=1, expand=True)
        .reindex(columns=[0, 1])
    )

    for target, part in (('Away_Team', 0), ('Home_Team', 1)):
        # The .str methods pass missing values through instead of raising like re.sub
        result[target] = (
            halves[part]
            .astype(object)
            .str.replace(r'[^\w\s]', '', regex=True)
            .str.strip()
        )

    return result.drop(columns=[combined_col])

# Rebind schedule_df to the frame returned by handle_home_away
schedule_df = handle_home_away(schedule_df)
# Last expression of the cell: shows the first rows as the cell output
schedule_df.head()

### Load School info and replace ncaa_names with standard Team names from existing data

In [None]:
# Load School info 
school_info_path = os.path.join(data_folder, 'arena_school_info.csv')
school_info_df = pd.read_csv(school_info_path)
# school_info_df.head() # Check data

# Function to map team names to standardized names
def map_team_names(schedule_df, school_info_df):
    """
    Replace NCAA team spellings in the schedule with the project's standard names.

    Names are compared after removing punctuation and trimming whitespace. A
    schedule name with no entry in the lookup is left as it was.

    Args:
        schedule_df (pd.DataFrame): Has 'Home_Team' and 'Away_Team' columns;
            both columns are overwritten in place.
        school_info_df (pd.DataFrame): Has 'ncaa_name' (spelling to match) and
            'Team' (standard name) columns.

    Returns:
        pd.DataFrame: The same `schedule_df` object, with both team columns mapped.
    """
    def _lookup_key(label):
        # Punctuation-free, trimmed form used on both sides of the comparison
        return re.sub(r'[^\w\s]', '', label).strip()

    standard_by_key = {}
    for _, info in school_info_df.iterrows():
        standard_by_key[_lookup_key(info['ncaa_name'])] = info['Team']

    def _standardize(raw_name):
        # Fall back to the original spelling when the school is not in the lookup
        return standard_by_key.get(_lookup_key(raw_name), raw_name)

    for side in ('Home_Team', 'Away_Team'):
        schedule_df[side] = schedule_df[side].apply(_standardize)

    return schedule_df

# Map both team columns to the standard names found in school_info_df
schedule_df = map_team_names(schedule_df, school_info_df)

# Check the data: names without a match in school_info_df keep their original spelling
schedule_df.head(20)


### Create a New Column with Game_ID to match with the rest of the Database

In [9]:
# Function to create a unique Game_ID
def create_game_id(schedule_df):
    """
    Add a 'Game_ID' column of the form "<date>-<away team>-<home team>".

    The date part is the 'Date' value with every '/' replaced by '-'.

    Args:
        schedule_df (pd.DataFrame): Has 'Date' (string), 'Away_Team' and
            'Home_Team' columns. The 'Game_ID' column is added in place.

    Returns:
        pd.DataFrame: The same `schedule_df` object, with 'Game_ID' filled in.
    """
    # One pass over the three columns replaces the row-wise DataFrame.apply;
    # it yields the same strings and also works when the schedule has no rows
    schedule_df['Game_ID'] = [
        f"{date.replace('/', '-')}-{away}-{home}"
        for date, away, home in zip(
            schedule_df['Date'], schedule_df['Away_Team'], schedule_df['Home_Team']
        )
    ]
    return schedule_df

# Add the Game_ID column built from Date, Away_Team and Home_Team
schedule_df = create_game_id(schedule_df)

In [10]:
# schedule_df.head(20)

## Get Play By Play JSONs
- 

In [11]:
# breakpoint()

## Using Custom API to Call NCAA.com
- project developed by henrygd - https://github.com/henrygd/ncaa-api

Uses his custom built API to get JSON response from NCAA.com
- You can host your own server for large projects; for now I am using his public link

In [12]:
# import requests
# from datetime import datetime

# # Base URL for the custom API
# base_url = "https://ncaa-api.henrygd.me/game"

# # Function to get play-by-play JSON for a single game
# def get_play_by_play(game_id_number):
#     url = f"{base_url}/{game_id_number}/play-by-play"
#     try:
#         response = requests.get(url)
#         response.raise_for_status()  # Raise HTTPError for bad responses
#         return response.json()
#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching data for Game ID {game_id_number}: {e}")
#         return None

# # Function to fetch JSON data for all completed games
# def fetch_play_by_play_data(schedule_df):
#     # Filter for games that have already taken place
#     today = datetime.now().strftime('%Y-%m-%d')
#     completed_games = schedule_df[schedule_df['Date'] < today].copy()

#     # Initialize a new column for play-by-play JSON
#     completed_games['Play_By_Play_JSON'] = None

#     for index, row in completed_games.iterrows():
#         game_id_number = row['game_id_number']
#         json_data = get_play_by_play(game_id_number)
#         completed_games.at[index, 'Play_By_Play_JSON'] = json_data

#     return completed_games

# # Fetch and update the dataframe with play-by-play JSONs
# updated_schedule_df = fetch_play_by_play_data(schedule_df)

# # Check the updated dataframe
# updated_schedule_df.head()


In [13]:
## Save the dataframe in a way that doesn't cut off the json data - CSV cuts off the json data
## Use Pickle
# Save the updated dataframe to a pickle file
# updated_schedule_df.to_pickle(os.path.join(temp_folder, 'schedule_with_play_by_play.pkl'))

## Load pickle file to avoid having to scrape again
# Load the updated dataframe from a pickle file
# NOTE: unpickling can execute arbitrary code; only load pickle files written by this notebook.
# NOTE(review): process_all_games below reads row['Game_ID'] from this frame, so the pickle
# presumably was saved after create_game_id had run — confirm before regenerating it.
updated_schedule_df = pd.read_pickle(os.path.join(temp_folder, 'schedule_with_play_by_play.pkl'))



### Transform Play by Play JSONs

#### Extract all descriptions into a text file to study
- Use this to make an abbr and alternate name dictionary for a find and replace

##### Deep

###### Team Standardization

In [14]:
# Build a lookup from each lower-cased alternate spelling in 'ncaa_data_alts'
# (a comma-separated string) to the school's standard 'Team' name
team_mapping = {}
for _, row in school_info_df.iterrows():
    standard_name = row['Team']
    raw_alts = row['ncaa_data_alts']
    # A school with no alternates has a missing value here, which has no .split()
    if pd.isnull(raw_alts):
        continue
    alternatives = [a.strip() for a in str(raw_alts).split(',')]
    for alt in alternatives:
        # Skip blanks left by trailing or doubled commas so '' never becomes a key
        if alt:
            team_mapping[alt.lower()] = standard_name

# Second name bound to the same dict
team_map = team_mapping


In [15]:

# def clean_player_name(name):
#     """Handle special characters and formatting in player names"""
#     if pd.isna(name) or not isinstance(name, str):
#         return None
#     try:
#         decoded = name.encode('latin-1').decode('utf-8')
#     except:
#         decoded = name
#     return normalize('NFKD', decoded.strip()).encode('ascii', 'ignore').decode()

In [16]:
# def standardize_teams(text):
#     """Replace team name variations with standardized abbreviations"""
#     text_lower = text.lower()
#     for variation, standard in team_mapping.items():
#         text_lower = re.sub(r'\b' + re.escape(variation) + r'\b', standard, text_lower, flags=re.IGNORECASE)
#     return text_lower.upper()

In [17]:
# def parse_description(description):
#     """Improved parsing function with team standardization"""
#     parsed = {
#         "Event_type": "Other",
#         "Primary_player": None,
#         "Primary_team": None,
#         "Secondary_player": None,
#         "Secondary_team": None,
#         "Outcome": None,
#         "Penalty_duration": None,
#         "Penalty_type": None
#     }

#     if not isinstance(description, str):
#         return parsed

#     # Standardize team names first
#     desc = standardize_teams(description)
    
#     # Faceoff parsing
#     faceoff_match = re.search(
#         r"Faceoff\s+([A-Za-zÀ-ÖØ-öø-ÿ' ,.-]+)\s+vs\s+([A-Za-zÀ-ÖØ-öø-ÿ' ,.-]+)\s+won by\s+([A-Z]{3,5})",
#         desc
#     )
#     if faceoff_match:
#         parsed.update({
#             "Event_type": "Faceoff",
#             "Primary_player": clean_player_name(faceoff_match.group(1).replace(',', '').title()),
#             "Secondary_player": clean_player_name(faceoff_match.group(2).replace(',', '').title()),
#             "Primary_team": faceoff_match.group(3),
#             "Outcome": "won"
#         })
#         return parsed

#     # Goal parsing
#     goal_match = re.search(r"Goal by\s+([A-Za-zÀ-ÖØ-öø-ÿ' ,.-]+)\s*\(", desc)
#     if goal_match:
#         parsed.update({
#             "Event_type": "Goal",
#             "Primary_player": clean_player_name(goal_match.group(1).replace(',', '').title())
#         })
#         return parsed

#     # Penalty parsing
#     penalty_match = re.search(
#         r"Penalty on\s+([A-Za-zÀ-ÖØ-öø-ÿ' ,.-]+)\s+([A-Z]{3,5})\s+(\d+)\s+minutes? for\s+(.+)",
#         desc
#     )
#     if penalty_match:
#         parsed.update({
#             "Event_type": "Penalty",
#             "Primary_player": clean_player_name(penalty_match.group(1).replace(',', '').title()),
#             "Primary_team": penalty_match.group(2),
#             "Penalty_duration": penalty_match.group(3),
#             "Penalty_type": penalty_match.group(4).strip()
#         })
#         return parsed

#     # Shot parsing
#     shot_match = re.search(
#         r"Shot by\s+([A-Z]{3,5})\s+([A-Za-zÀ-ÖØ-öø-ÿ' ,.-]+)\s+(MISSED|WIDE|BLOCKED|SAVE)",
#         desc
#     )
#     if shot_match:
#         parsed.update({
#             "Event_type": "Shot",
#             "Primary_team": shot_match.group(1),
#             "Primary_player": clean_player_name(shot_match.group(2).replace(',', '').title())
#         })
#         return parsed

#     return parsed

# def convert_to_continuous_time(period, time_str):
#     """Convert period and game clock to continuous seconds"""
#     period = int(period) if str(period).isdigit() else 4  # Handle OT as period 4
#     minutes, seconds = map(int, time_str.split(':'))
#     return (20 * 60 * (period - 1)) + ((20 - minutes) * 60) - seconds

# def process_game_data(json_path):
#     """Full processing pipeline"""
#     with open(json_path) as f:
#         data = json.load(f)
    
#     rows = []
#     for period in data['periods']:
#         period_num = period['periodNumber']
#         for play in period['playStats']:
#             description = play['visitorText'] or play['homeText'] or ''
#             rows.append({
#                 'Period': period_num,
#                 'Time': play['time'],
#                 'Description': description,
#                 'Score': play['score']
#             })
    
#     df = pd.DataFrame(rows)
    
#     # Convert time
#     df['Period'] = df['Period'].replace({'1st': '1', '2nd': '2', '3rd': '3', 'OT': '4'})
#     df['Continuous_Time'] = df.apply(
#         lambda x: convert_to_continuous_time(x['Period'], x['Time']), axis=1
#     )
    
#     # Parse descriptions
#     parsed_data = df['Description'].apply(parse_description)
#     parsed_df = pd.DataFrame(parsed_data.tolist())
    
#     # Combine data
#     final_df = pd.concat([df, parsed_df], axis=1)
#     return final_df.reset_index(drop=True)

###### Parsing

In [18]:
# def parse_description(description, team_mapping):
#     """Improved parser with team standardization and better pattern matching"""
#     # Standardize team names first
#     desc_clean = standardize_teams(description, team_mapping)
    
#     parsed = {
#         "Event_type": "Other",
#         "Primary_player": None,
#         "Primary_team": None,
#         "Secondary_player": None,
#         "Secondary_team": None,
#         "Outcome": None,
#         "Penalty_duration": None,
#         "Penalty_type": None
#     }

#     # Faceoff pattern (handles standardized team names)
#     if "faceoff" in desc_clean.lower():
#         faceoff_pattern = r"Faceoff (.+?) vs (.+?) won by (.+?)\."
#         if (match := re.search(faceoff_pattern, desc_clean, re.IGNORECASE)):
#             parsed.update({
#                 "Event_type": "Faceoff",
#                 "Primary_player": clean_player_name(match.group(1)),
#                 "Secondary_player": clean_player_name(match.group(2)),
#                 "Primary_team": match.group(3).strip(),
#                 "Outcome": "won"
#             })
#         return parsed

#     # Goal pattern (handles special characters)
#     if "goal by" in desc_clean.lower():
#         goal_pattern = r"Goal by (.+?) \("
#         if (match := re.search(goal_pattern, desc_clean)):
#             parsed.update({
#                 "Event_type": "Goal",
#                 "Primary_player": clean_player_name(match.group(1))
#             })
#         return parsed

#     # Penalty pattern (handles different formats)
#     if "penalty on" in desc_clean.lower():
#         penalty_pattern = r"Penalty on (.+?) (\w+) (\d+) minutes for (.+)"
#         if (match := re.search(penalty_pattern, desc_clean, re.IGNORECASE)):
#             parsed.update({
#                 "Event_type": "Penalty",
#                 "Primary_player": clean_player_name(match.group(1)),
#                 "Primary_team": match.group(2),
#                 "Penalty_duration": match.group(3),
#                 "Penalty_type": match.group(4).strip()
#             })
#         return parsed

#     # Shot pattern (handles shot types and goalie info)
#     if "shot by" in desc_clean.lower():
#         shot_pattern = r"Shot by (\w+) (.+?)(?:, save|$)"
#         if (match := re.search(shot_pattern, desc_clean, re.IGNORECASE)):
#             parsed.update({
#                 "Event_type": "Shot",
#                 "Primary_team": match.group(1),
#                 "Primary_player": clean_player_name(match.group(2))
#             })
#         return parsed

#     return parsed


In [19]:
# from unicodedata import normalize

# def clean_player_name(name):
#     """Handle international characters and formatting"""
#     if pd.isna(name): return None
#     return normalize('NFKD', name.strip()).encode('ascii', 'ignore').decode()


# # # Function to convert period and time to continuous time
# def convert_to_continuous_time(period, time):
#     period_offsets = {'1': 0, '2': 20, '3': 40, 'OT': 60}
#     minutes, seconds = map(int, time.split(':'))
#     elapsed_time = (20 - minutes) * 60 + -seconds
#     offset = period_offsets.get(period, 0) * 60
#     return offset + elapsed_time


In [20]:
# # Transform pipeline
# def transform_game_data(json_data, game_id, team_mapping):
#     # Initial dataframe creation
#     game_df = pd.DataFrame([
#         {
#             'Game_ID': game_id,
#             'Period': period['periodNumber'],
#             'Time': play['time'],
#             'Description': play['visitorText'] or play['homeText'],
#             'Score': play['score']
#         }
#         for period in json_data['periods']
#         for play in period['playStats']
#     ])
    
#     # Add continuous time
#     game_df['Continuous_Time'] = game_df.apply(
#         lambda r: convert_to_continuous_time(r['Period'], r['Time']), 
#         axis=1
#     )
    
#     # Parse descriptions with team standardization
#     parsed_data = game_df['Description'].apply(
#         lambda d: parse_description(d, team_mapping)
#     )
#     return pd.concat([game_df, pd.DataFrame(parsed_data.tolist())], axis=1)


In [21]:
# # Load your JSON data
# with open('../TEMP/play_by_play.json') as f:
#     game_data = json.load(f)

# # Process the data
# df = transform_game_data(game_data, "2024_LSSU_vs_MICHST", team_mapping)


In [22]:
# school_info_df.head()
# Create a dictionary mapping the alternate names (ncaa_data_alts) to the standardized names


In [23]:
updated_schedule_df

# Output Play by Play JSON to a raw text file
# Function to save JSON data to a text file
def save_json_to_file(json_data, file_path):
    """
    Write `json_data` to `file_path` as JSON text, replacing any existing file.

    Args:
        json_data: Any JSON-serializable object.
        file_path (str): Destination path; opened in text write mode.
    """
    with open(file_path, 'w') as out_file:
        out_file.write(json.dumps(json_data))

## Call the function for the first row
# Writes the first game's play-by-play payload to play_by_play.json in the temp folder
save_json_to_file(updated_schedule_df.iloc[0]['Play_By_Play_JSON'], os.path.join(temp_folder, 'play_by_play.json'))


#### ChatGPT attempt at parsing

##### Create team map for name substitutions

In [24]:
# # Create a mapping of standardized team names to their alternative names
# team_map = {
#     row['Team']: row['ncaa_data_alts']
#     for _, row in school_info_df.iterrows() if not pd.isnull(row['ncaa_data_alts'])
# }

# # Standardize the format of the mapping to make it usable in regex substitutions
# for key in team_map:
#     team_map[key] = team_map[key].replace(', ', '|')

# # Display the resulting team mapping
# team_map


In [25]:
# # Final correction: Preprocess multi-word team names globally before tokenization
# def standardize_team_names_corrected(description, team_map):
#     """
#     Replace alternate team names in a description with preprocessing for multi-word names.
#     Ensures consistent handling of multi-word abbreviations before tokenized adjustments.
#     """
#     # Preprocess multi-word team names first
#     for std_name, alt_names in team_map.items():
#         description = re.sub(rf'\b({alt_names})\b', std_name, description, flags=re.IGNORECASE)

#     # Tokenized replacement for any remaining cases (if needed)
#     tokens = description.split()
#     for i, token in enumerate(tokens):
#         for std_name, alt_names in team_map.items():
#             if re.fullmatch(rf'({alt_names})', token.strip('.,'), flags=re.IGNORECASE):
#                 tokens[i] = std_name
#                 break
#     return " ".join(tokens)

# # Apply the corrected function to the problematic test case
# # Test on a 

In [26]:
import pandas as pd
import re
import unicodedata

# Function to convert period and time to continuous time
def convert_to_continuous_time(period, time):
    """
    Convert a period label and a countdown game clock into seconds since the opening faceoff.

    Args:
        period: Period label. Accepts '1', '2', '3', 'OT' (any letter case), the
            ordinal spellings '1st', '2nd', '3rd', and the integers 1-3 (or the
            equivalent whole-number floats). Any other label gets offset 0.
        time (str): Clock reading as 'MM:SS', taken as time remaining in a
            20-minute period.

    Returns:
        int: Period offset (20 minutes per preceding period, 60 minutes for 'OT')
        plus the seconds elapsed within the period.

    NOTE(review): elapsed time is always measured from 20:00. If the overtime
    clock in the feed starts lower (for example 5:00), overtime events will be
    placed too late — confirm against the data.
    """
    period_offsets = {'1': 0, '2': 20, '3': 40, 'OT': 60}

    # A whole-number float (e.g. 2.0 from a column upcast by NaN) names the same period as the int
    if isinstance(period, float) and period.is_integer():
        period = int(period)

    label = str(period).strip().upper()
    # Accept ordinal spellings such as '1st' / '2ND' alongside the bare digit
    if len(label) > 2 and label[:-2].isdigit() and label[-2:] in ('ST', 'ND', 'RD', 'TH'):
        label = label[:-2]

    minutes, seconds = map(int, time.split(':'))
    elapsed_time = (20 - minutes) * 60 - seconds
    offset = period_offsets.get(label, 0) * 60
    return offset + elapsed_time

# Function to normalize names to handle accents and special characters
def normalize_name(name):
    """
    Reduce a name to plain ASCII.

    Accented letters are decomposed (NFKD) so the base letter survives; any
    character that still has no ASCII form is dropped.

    Args:
        name (str): Name to convert. Empty or None input yields None.

    Returns:
        str or None: The ASCII-only name, or None for empty input.
    """
    if name:
        decomposed = unicodedata.normalize('NFKD', name)
        return decomposed.encode('ascii', 'ignore').decode('utf-8')
    return None

# Enhanced player name formatting function
def clean_player_name(name):
    """
    Turn a "Last, First" name into "First Last" after ASCII normalization.

    Example: "Hughes, T.J." -> "T.J. Hughes"

    Only names with exactly one comma are rearranged; anything else is
    returned in its normalized form without reordering.

    Args:
        name (str): Raw player name. Empty or None input yields None.

    Returns:
        str or None: The cleaned name, or None for empty input.
    """
    if not name:
        return None

    ascii_name = normalize_name(name)
    pieces = ascii_name.split(',')
    if len(pieces) != 2:
        return ascii_name

    surname, given = (piece.strip() for piece in pieces)
    return f"{given} {surname}".strip()

# Function to parse play-by-play descriptions
# Improved to handle team abbreviations and player names with issues
def parse_description(description):
    """
    Parse a single play-by-play description into structured fields.

    The event type is chosen by keyword, checked in this order: "faceoff",
    "goal by", a leading "penalty on", then "shot by". Once a keyword is found
    the event type is set even if the detailed pattern does not match, in which
    case the remaining fields stay None.

    For shots, 'Primary_player' receives the text right after "Shot by" (the
    team abbreviation in this feed) and 'Primary_team' receives the rest of the
    line; later cells in this notebook clean that text and swap the two fields.

    Team names are returned as they appear in the description; no team-name
    standardization is done here.

    Args:
        description: Description text. Anything that is not a string (None, NaN)
            is treated as having no event.

    Returns:
        dict: Always has the keys 'Event_type', 'Primary_player', 'Primary_team',
        'Secondary_player', 'Secondary_team', 'Outcome', 'Penalty_duration' and
        'Penalty_type'. 'Event_type' is one of "Faceoff", "Goal", "Penalty",
        "Shot" or "Other".
    """
    parsed = {
        "Event_type": "Other",
        "Primary_player": None,
        "Primary_team": None,
        "Secondary_player": None,
        "Secondary_team": None,
        "Outcome": None,
        # Present on every row so all parsed events share one schema
        "Penalty_duration": None,
        "Penalty_type": None,
    }

    # A play with neither visitor nor home text arrives as None; nothing to parse
    if not isinstance(description, str):
        return parsed

    desc_lower = description.lower().strip()

    # Shared building blocks: one run of name characters, and a "Last, First" pair
    name_chars = r"[A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+"
    full_name = rf"{name_chars}, {name_chars}"

    # --- Faceoff ---
    if "faceoff" in desc_lower:
        parsed["Event_type"] = "Faceoff"
        faceoff_pattern = (
            rf"Faceoff\s+({full_name})\s+"
            rf"vs\s+({full_name})\s+"
            rf"won by\s+({name_chars})\."
        )
        match = re.search(faceoff_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Secondary_player"] = clean_player_name(match.group(2))
            parsed["Primary_team"] = match.group(3).strip()
            parsed["Outcome"] = "won"
        return parsed

    # --- Goal ---
    if "goal by" in desc_lower:
        parsed["Event_type"] = "Goal"
        goal_scorer_pattern = rf"Goal by\s+({full_name})"
        match = re.search(goal_scorer_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
        return parsed

    # --- Penalty ---
    if desc_lower.startswith("penalty on"):
        parsed["Event_type"] = "Penalty"
        penalty_pattern = (
            rf"Penalty on\s+({full_name})\s+"
            rf"({name_chars})\s+(\d+) minutes for (.+)"
        )
        match = re.search(penalty_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Primary_team"] = match.group(2).strip()
            parsed["Penalty_duration"] = match.group(3).strip()
            parsed["Penalty_type"] = match.group(4).strip()
        return parsed

    # --- Shot ---
    if "shot by" in desc_lower:
        parsed["Event_type"] = "Shot"
        shot_pattern = rf"Shot by\s+({name_chars})\s+(.+)"
        match = re.search(shot_pattern, description, re.IGNORECASE)
        if match:
            # Deliberately "swapped" here; see the docstring
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Primary_team"] = match.group(2).strip()
        return parsed

    return parsed

# Function to transform a single game's JSON data into a dataframe
def transform_single_game(json_data, game_id):
    """
    Flatten one game's play-by-play JSON into a dataframe with one row per play.

    Args:
        json_data (dict): Play-by-play payload. Must have a 'periods' list whose
            items carry 'periodNumber' and a 'playStats' list; each play carries
            'time', 'visitorText', 'homeText' and 'score'.
        game_id: Identifier stored in the 'Game_ID' column of every row.

    Returns:
        pd.DataFrame: Columns 'Game_ID', 'Period', 'Time', 'Description' and
        'Score', followed by the fields returned by parse_description. 'Time'
        holds the continuous time computed by convert_to_continuous_time, not
        the original clock string. A game with no plays yields an empty
        dataframe with the first five columns.
    """
    base_columns = ['Game_ID', 'Period', 'Time', 'Description', 'Score']

    rows = [
        {
            'Game_ID': game_id,
            'Period': period['periodNumber'],
            'Time': play['time'],
            # Prefer the visitor text; fall back to the home text when it is empty
            'Description': play['visitorText'] or play['homeText'],
            'Score': play['score'],
        }
        for period in json_data['periods']
        for play in period['playStats']
    ]

    # With no plays the frame would have no columns and the lookups below would raise
    if not rows:
        return pd.DataFrame(columns=base_columns)

    game_df = pd.DataFrame(rows)

    # Convert period and time to continuous time
    game_df['Period'] = game_df['Period'].replace({'1st': '1', '2nd': '2', '3rd': '3', 'OT': 'OT'})
    game_df['Time'] = game_df.apply(lambda row: convert_to_continuous_time(row['Period'], row['Time']), axis=1)

    # Parse descriptions into one dict per play, then expand the dicts into columns
    parsed_descriptions = game_df['Description'].apply(parse_description)
    parsed_df = pd.DataFrame(parsed_descriptions.tolist())

    # Both frames share the default 0..n-1 index, so a column-wise concat lines rows up
    return pd.concat([game_df, parsed_df], axis=1)

# Function to process all games and combine into a single dataframe
def process_all_games(schedule_df):
    """
    Transform every game's play-by-play JSON and stack the results into one dataframe.

    Args:
        schedule_df (pd.DataFrame): Has 'Game_ID' and 'Play_By_Play_JSON'
            columns. Rows whose 'Play_By_Play_JSON' is not a non-empty dict
            (None, NaN, {}) are skipped.

    Returns:
        pd.DataFrame: All games' plays concatenated with a fresh 0..n-1 index,
        or an empty dataframe when no row has play-by-play data.
    """
    all_games = []

    for _, row in schedule_df.iterrows():
        json_data = row['Play_By_Play_JSON']

        # NaN is truthy, so a plain truthiness test would pass missing cells on as payloads
        if not isinstance(json_data, dict) or not json_data:
            continue

        all_games.append(transform_single_game(json_data, row['Game_ID']))

    # pd.concat raises on an empty list; hand back an empty frame instead
    if not all_games:
        return pd.DataFrame()

    return pd.concat(all_games, ignore_index=True)

# Example usage
# Assuming `updated_schedule_df` is the dataframe containing the JSON play-by-play data
# (it must also carry a 'Game_ID' column, which process_all_games reads for every row)
final_pbp_df = process_all_games(updated_schedule_df)

# Display the resulting dataframe
# final_pbp_df.head(12)

In [27]:
## look at tail
# final_pbp_df.tail(12)

In [28]:
# Function to handle 'SAVE' case
def move_save(row):
    """
    Split a ', save <name>' suffix out of 'Primary_team'.

    When 'Primary_team' contains ', save', the text before the first
    occurrence stays in 'Primary_team' and the text right after it goes to
    'Secondary_player'; both are stripped. Rows without the marker, or with a
    missing 'Primary_team', are returned untouched.

    Args:
        row: A dataframe row (or any mapping) with 'Primary_team' and
            'Secondary_player' entries. Modified in place.

    Returns:
        The same `row` object.
    """
    shot_text = row['Primary_team']
    if pd.isnull(shot_text) or ', save' not in shot_text:
        return row

    pieces = shot_text.split(', save')
    row['Secondary_player'] = pieces[1].strip()
    row['Primary_team'] = pieces[0].strip()
    return row

# Function to handle 'BLOCKED' case
def move_blocked(row):
    """
    Move the name after 'BLOCKED by' from 'Primary_team' to 'Secondary_player'.

    The word 'BLOCKED' itself is kept in 'Primary_team'; only the
    ' by <name>' tail is removed. Rows without 'BLOCKED by <name>' are
    returned untouched.

    Args:
        row: A dataframe row (or any mapping) with 'Primary_team' and
            'Secondary_player' entries. Modified in place.

    Returns:
        The same `row` object.
    """
    shot_text = row['Primary_team']
    if pd.isnull(shot_text) or 'BLOCKED' not in shot_text:
        return row

    blocker = re.search(r'BLOCKED by (.+)', shot_text)
    if blocker is None:
        return row

    row['Secondary_player'] = blocker.group(1).strip()
    row['Primary_team'] = re.sub(r'BLOCKED by .+', 'BLOCKED', shot_text).strip()
    return row

# Function to extract and move the outcome to 'Outcome' column
def move_outcome(row):
    """
    Move the shot outcome keyword from 'Primary_team' to 'Outcome'.

    The first whole-word MISSED, WIDE, BLOCKED or SAVE (upper case only) found
    in 'Primary_team' is stored in 'Outcome'; every such keyword is then
    removed from 'Primary_team' and the result is stripped.

    Args:
        row: A dataframe row (or any mapping) with 'Primary_team' and
            'Outcome' entries. Modified in place.

    Returns:
        The same `row` object.
    """
    outcome_pattern = r'\b(MISSED|WIDE|BLOCKED|SAVE)\b'

    shot_text = row['Primary_team']
    if pd.isnull(shot_text):
        return row

    found = re.search(outcome_pattern, shot_text)
    if found:
        row['Outcome'] = found.group(1)
        row['Primary_team'] = re.sub(outcome_pattern, '', shot_text).strip()
    return row

# Apply transformations sequentially
# Order matters: move_blocked leaves the word 'BLOCKED' in Primary_team so that
# move_outcome, which runs last, can still find it and copy it into Outcome.
final_pbp_df = final_pbp_df.apply(move_save, axis=1)
final_pbp_df = final_pbp_df.apply(move_blocked, axis=1)
final_pbp_df = final_pbp_df.apply(move_outcome, axis=1)

# # Display the first few rows of the cleaned dataframe


# # # Notes for second step of transformation
# # # Faceoff Seem to be working as intended
# # # Goal - Primary_player is working as intended
# #     # - Primary Team is not being captured - probably because the team name is used and not the abbreviation
# # # Shots - Primary_player actually contains the team abbreviation
# #     # - Primary_team includes the player name and still includes the shot outcome WIDE, BLOCKED, MISSED in the
# #     #  - UPPER CASE - Need to remove the outcome from the team name and move to outcome column
# #     #  - MISSED IS THE SAME AS SAVED - also includes the goalie name after the outcome
# #     #  - BLOCKED also includes the secondary player name after the outcome

In [29]:
### Next Steps 
# For all Event_type: Shot swap the Primary_player and Primary_team values

# Function to swap 'Primary_player' and 'Primary_team' for 'Shot' events
def swap_shot_columns(row):
    """
    Exchange 'Primary_player' and 'Primary_team' on rows whose 'Event_type' is 'Shot'.

    Args:
        row: A dataframe row (or any mapping) with 'Event_type',
            'Primary_player' and 'Primary_team' entries. Modified in place.

    Returns:
        The same `row` object; non-shot rows are returned unchanged.
    """
    if row['Event_type'] != 'Shot':
        return row

    former_team = row['Primary_team']
    former_player = row['Primary_player']
    row['Primary_player'] = former_team
    row['Primary_team'] = former_player
    return row

# Apply the swap row by row; only rows with Event_type 'Shot' are changed
final_pbp_df = final_pbp_df.apply(swap_shot_columns, axis=1)





In [30]:
## Need to deal with foreign names like Tommi Mannisto (which has accents and appears like MÃ£Â„nnistÃ£Â–, Tommi. in the data)

def fix_encoding_issues(df, columns):
    """
    Repair text that was decoded with the wrong character set.

    Each value is encoded as latin1 and decoded again as UTF-8, which reverses
    the case where UTF-8 bytes were read as latin1. A value that cannot make
    that round trip (or is not a string) is kept exactly as it was.

    Args:
        df (pd.DataFrame): The dataframe containing columns with text issues.
            The listed columns are overwritten in place.
        columns (list): List of column names to fix.

    Returns:
        pd.DataFrame: The same dataframe object with the listed columns repaired.
    """
    def _repair(value):
        try:
            raw_bytes = value.encode('latin1')
            return raw_bytes.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError, AttributeError):
            # Not mis-decoded text (or not text at all): leave the value alone
            return value

    for column in columns:
        df[column] = df[column].apply(_repair)

    return df

# Apply the function to the dataframe
# columns_to_fix is reused by the name-standardization cell below
columns_to_fix = ['Primary_player', 'Secondary_player']

final_pbp_df = fix_encoding_issues(final_pbp_df, columns_to_fix)
# Display the first few rows of the cleaned dataframe
# final_pbp_df.head(12)



In [None]:
def standardize_names(df, columns):
    """
    Standardize player names in the specified columns to 'First Last' format.

    Periods are removed, and a 'Last, First' value is rearranged to
    'First Last'. Values that are not strings (NaN, None, numbers) are passed
    through unchanged instead of raising.

    Args:
        df (pd.DataFrame): The dataframe containing player name columns.
            The listed columns are overwritten in place.
        columns (list): List of column names to standardize.

    Returns:
        pd.DataFrame: The same dataframe with standardized player names.
    """
    def fix_name_format(name):
        # Missing values and any other non-text value are returned as they are
        if not isinstance(name, str):
            return name

        cleaned = name.replace(".", "")  # Remove periods
        if "," not in cleaned:
            return cleaned.strip()  # Already in 'First Last' format

        # 'Last, First' -> 'First Last'. Only the first two comma-separated
        # pieces are used. The outer strip() keeps a lone 'Last,' or ', First'
        # from leaving a stray leading/trailing space.
        parts = cleaned.split(",")
        return f"{parts[1].strip()} {parts[0].strip()}".strip()

    for col in columns:
        df[col] = df[col].apply(fix_name_format)

    return df

# Convert the player-name columns listed in columns_to_fix to 'First Last'
final_pbp_df = standardize_names(final_pbp_df, columns_to_fix)
# Display the first 12 rows of the cleaned dataframe
final_pbp_df.head(12)

In [32]:
# def parse_goal_events(df):
#     """
#     Parse 'Goal' event descriptions to extract Primary_player (goal scorer)
#     and Primary_team (team that scored) based on the "On ice for" section.
    
#     Args:
#         df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', 'Primary_player', and 'Primary_team' columns.
    
#     Returns:
#         pd.DataFrame: The updated dataframe with parsed Primary_player and Primary_team.
#     """
#     def extract_goal_info(row):
#         if row['Event_type'] == 'Goal':  # Only process rows with Event_type 'Goal'
#             description = row['Description']
            
#             # Extract the goal scorer (Primary_player) in "Last, First" format
#             goal_match = re.search(r"Goal by (.+?) \(", description)
#             if goal_match:
#                 last_first_name = goal_match.group(1).strip()
#                 # Convert "Last, First" to "First Last"
#                 parts = last_first_name.split(", ")
#                 first_last_name = f"{parts[1]} {parts[0]}" if len(parts) == 2 else last_first_name
#                 row['Primary_player'] = first_last_name
            
#             # Extract teams and players under "On ice for"
#             on_ice_sections = re.findall(r"On ice for ([A-Z ]+): (.+?)(?=(?:[A-Z ]+:|$))", description)
#             if on_ice_sections:
#                 for team, players in on_ice_sections:
#                     player_list = [player.strip() for player in players.split(";")]
#                     if row['Primary_player'] in player_list:
#                         row['Primary_team'] = team.strip()  # Assign team where goal scorer is listed
#                         return row  # Exit once the correct team is found
            
#             # If no match is found, leave Primary_team as blank
#             row['Primary_team'] = None
#         return row

#     # Apply the extraction function row-wise
#     df = df.apply(extract_goal_info, axis=1)
#     return df

# # Apply the function to parse goal events
# final_pbp_df = parse_goal_events(final_pbp_df)



In [33]:
# import pandas as pd
# import re

# def parse_goal_events_with_inferred_away_team(df):
#     """
#     Parse 'Goal' event descriptions to extract Primary_player (goal scorer)
#     and Primary_team (team that scored), inferring the Away_team from the Game_ID.
    
#     Args:
#         df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', 'Primary_player', 
#                            'Primary_team', and 'Game_ID' columns.
    
#     Returns:
#         pd.DataFrame: The updated dataframe with parsed Primary_player and Primary_team.
#     """
#     def extract_goal_info(row):
#         if row['Event_type'] == 'Goal':  # Only process rows with Event_type 'Goal'
#             description = row['Description']
            
#             # Extract the goal scorer (Primary_player) in "Last, First" format
#             goal_match = re.search(r"Goal by (.+?) \(", description)
#             if goal_match:
#                 last_first_name = goal_match.group(1).strip()
#                 # Convert "Last, First" to "First Last"
#                 parts = last_first_name.split(", ")
#                 first_last_name = f"{parts[1]} {parts[0]}" if len(parts) == 2 else last_first_name
#                 row['Primary_player'] = first_last_name
            
#             # Extract teams and players under "On ice for"
#             on_ice_sections = re.findall(r"On ice for ([A-Z ]+): (.+?)(?=(?:[A-Z ]+:|$))", description)
#             if on_ice_sections:
#                 for team, players in on_ice_sections:
#                     player_list = [player.strip() for player in players.split(";")]
#                     if row['Primary_player'] in player_list:
#                         row['Primary_team'] = team.strip()  # Assign team where goal scorer is listed
#                         break
        
#         return row

#     # Apply the extraction function row-wise to parse goals
#     df = df.apply(extract_goal_info, axis=1)

#     # Infer Away_team from Game_ID
#     def infer_away_team(game_id):
#         try:
#             return game_id.split("-")[3]
#         except IndexError:
#             return None

#     df['Away_team'] = df['Game_ID'].apply(infer_away_team)

#     # Fill missing Primary_team values with the inferred Away_team
#     df['Primary_team'] = df['Primary_team'].fillna(df['Away_team'])
    
#     # Drop the temporary 'Away_team' column if it’s no longer needed
#     df = df.drop(columns=['Away_team'])
    
#     return df

# # Apply the function to parse goal events and infer away team
# final_pbp_df = parse_goal_events_with_inferred_away_team(final_pbp_df)


In [34]:
# import pandas as pd
# import re

# def parse_goal_events_with_fill(df):
#     """
#     Parse 'Goal' event descriptions to extract Primary_player (goal scorer)
#     and Primary_team (team that scored), filling missing Primary_team values with the away team.
    
#     Args:
#         df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', 'Primary_player', 
#                            'Primary_team', and 'Home_team'/'Away_team' columns.
    
#     Returns:
#         pd.DataFrame: The updated dataframe with parsed Primary_player and Primary_team.
#     """
#     def extract_goal_info(row):
#         if row['Event_type'] == 'Goal':  # Only process rows with Event_type 'Goal'
#             description = row['Description']
            
#             # Extract the goal scorer (Primary_player) in "Last, First" format
#             goal_match = re.search(r"Goal by (.+?) \(", description)
#             if goal_match:
#                 last_first_name = goal_match.group(1).strip()
#                 # Convert "Last, First" to "First Last"
#                 parts = last_first_name.split(", ")
#                 first_last_name = f"{parts[1]} {parts[0]}" if len(parts) == 2 else last_first_name
#                 row['Primary_player'] = first_last_name
            
#             # Extract teams and players under "On ice for"
#             on_ice_sections = re.findall(r"On ice for ([A-Z ]+): (.+?)(?=(?:[A-Z ]+:|$))", description)
#             if on_ice_sections:
#                 for team, players in on_ice_sections:
#                     player_list = [player.strip() for player in players.split(";")]
#                     if row['Primary_player'] in player_list:
#                         row['Primary_team'] = team.strip()  # Assign team where goal scorer is listed
#                         break
#         return row

#     # Apply the extraction function row-wise to parse goals
#     df = df.apply(extract_goal_info, axis=1)

#     # Fill missing Primary_team values with the Away_team
#     df['Primary_team'] = df['Primary_team'].fillna(df['Away_team'])
    
#     return df

# # Apply the function to parse goal events and fill missing values
# final_pbp_df = parse_goal_events_with_fill(final_pbp_df)

In [35]:
import pandas as pd
import re

def classify_power_play_events(df):
    """
    Classify 'Other' Event_type as 'PP - Start' or 'PP - End' based on the Description,
    and extract the team abbreviation to the Primary_team column.

    The team is the text that follows the 'Start power play for' /
    'End power play for' marker, with surrounding whitespace and trailing
    periods removed. Rows whose Description is not a string (e.g. NaN) are
    left unchanged.

    Args:
        df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', and 'Primary_team' columns.

    Returns:
        pd.DataFrame: The updated dataframe with classified 'Event_type' and filled 'Primary_team'.
    """
    # (new Event_type, marker text) pairs, checked in this order
    markers = (
        ('PP - Start', 'Start power play for'),
        ('PP - End', 'End power play for'),
    )

    def classify_event(row):
        if row['Event_type'] != 'Other':
            return row

        description = row['Description']
        if not isinstance(description, str):
            return row

        for event_label, marker in markers:
            # Split on the full marker, not on the bare word 'for', so a team
            # name that itself contains 'for' is kept whole.
            _, found, team_text = description.partition(marker)
            if found:
                row['Event_type'] = event_label
                row['Primary_team'] = team_text.strip().rstrip('.')
                break
        return row

    # Apply the classification function row-wise
    return df.apply(classify_event, axis=1)

# Relabel 'Other' rows whose description marks a power play starting or ending
final_pbp_df = classify_power_play_events(final_pbp_df)

# Optional check (disabled): display the first rows of the updated dataframe
# final_pbp_df.head(22)



In [None]:
# Print the dataframe summary to review the columns after the cleaning steps so far
final_pbp_df.info()
# Optional check (disabled): number of rows per Event_type
# final_pbp_df['Event_type'].value_counts()

##### Deal with Goalie change / info rows

In [37]:
import pandas as pd
import re

def classify_goalie_moves(df):
    """
    Parse goalie moves from the Description column and classify them as 'Goalie Move'.
    Extract the goalie name as Primary_player and the team name as Primary_team.

    Only rows whose Event_type is 'Other' are examined. Rows whose Description
    is not a string (e.g. NaN) are left unchanged.

    Args:
        df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', 'Primary_player', and 'Primary_team' columns.

    Returns:
        pd.DataFrame: The updated dataframe with classified 'Event_type', 'Primary_player', and 'Primary_team'.
    """
    # '<goalie> at goalie for <team>' with an optional closing period.
    # The team group runs to the end of the text, so a period inside the team
    # name ('St. Cloud State') no longer cuts the name short.
    # NOTE(review): assumes nothing follows the team name in these descriptions - confirm against the feed.
    goalie_pattern = re.compile(r"(.+?) at goalie for (.+?)\.?\s*$")

    def parse_goalie_move(row):
        if row['Event_type'] != 'Other':  # Only process rows marked as 'Other'
            return row

        description = row['Description']
        if not isinstance(description, str):
            return row

        match = goalie_pattern.match(description)
        if match:
            row['Event_type'] = 'Goalie Move'
            row['Primary_player'] = match.group(1).strip()  # Goalie's name
            row['Primary_team'] = match.group(2).strip()  # Team name
        return row

    # Apply the parsing function row-wise
    return df.apply(parse_goalie_move, axis=1)

# Relabel '<name> at goalie for <team>' rows as 'Goalie Move' and fill in player and team
final_pbp_df = classify_goalie_moves(final_pbp_df)



In [38]:
## In the Outcome column relabel MISSED to SAVED for clarity
def relabel_missed_to_saved(df):
    """
    Rename the 'MISSED' outcome to 'SAVED'.

    Only values that are exactly 'MISSED' are changed; every other value in
    the Outcome column is kept.

    Args:
        df (pd.DataFrame): The dataframe containing the Outcome column
            (overwritten in place).

    Returns:
        pd.DataFrame: The same dataframe with relabeled outcomes.
    """
    outcome_relabels = {'MISSED': 'SAVED'}
    relabeled = df['Outcome'].replace(outcome_relabels)
    df['Outcome'] = relabeled
    return df

# Rename the 'MISSED' outcome to 'SAVED' in the Outcome column
final_pbp_df = relabel_missed_to_saved(final_pbp_df)

In [39]:
# Clean the Penalty_type column to remove periods and any leading/trailing whitespace
def clean_penalty_type(df):
    """
    Tidy the Penalty_type text: drop every period, then trim surrounding whitespace.

    Args:
        df (pd.DataFrame): The dataframe containing the Penalty_type column
            (overwritten in place).

    Returns:
        pd.DataFrame: The same dataframe with cleaned Penalty_type.
    """
    penalty_text = df['Penalty_type']
    # regex=False: '.' is a literal period here, not "any character"
    without_periods = penalty_text.str.replace('.', '', regex=False)
    df['Penalty_type'] = without_periods.str.strip()
    return df

# Strip periods and surrounding whitespace from the Penalty_type column
final_pbp_df = clean_penalty_type(final_pbp_df)

In [None]:
## Show the first 5 'Goal' rows to inspect how goal events currently look
final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(5)


## Start Wed 1-29
- clean the goal events

In [41]:
### Grab the Goal Conditions within the parenthesis in the definition
### Put in a new column called Goal_Conditions
def extract_goal_conditions(df):
    """
    Extract goal conditions from the Description column and add them to a new column called Goal_Conditions.

    The value is the text inside the first pair of parentheses in the
    description. Rows without parentheses, and rows whose Description is not
    a string (e.g. NaN / None), get None.

    NOTE(review): every row is scanned, not only 'Goal' events, so any other
    description that contains parentheses also populates Goal_Conditions -
    confirm this is intended.

    Args:
        df (pd.DataFrame): The dataframe containing the Description column.
            The Goal_Conditions column is added in place.

    Returns:
        pd.DataFrame: The same dataframe with the Goal_Conditions column.
    """
    # Non-greedy, so only the first parenthesized group is captured
    parenthesized = re.compile(r'\((.*?)\)')

    def extract_conditions(description):
        # A missing description has nothing to search; avoid a TypeError from re
        if not isinstance(description, str):
            return None
        match = parenthesized.search(description)
        return match.group(1) if match else None

    df['Goal_Conditions'] = df['Description'].apply(extract_conditions)
    return df

# Add the Goal_Conditions column (text inside the first parentheses of each Description)
final_pbp_df = extract_goal_conditions(final_pbp_df)

In [None]:
# Show the first 12 'Goal' rows to check the Goal_Conditions column
final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(12)




In [43]:
def assign_primary_team(df):
    """
    Assigns the correct 'Primary_team' to goal events by analyzing score changes.

    Goal rows are ordered by 'Game_ID', 'Period' and 'Time'. Each row's 'Score'
    is read as '<away>-<home>', and the away/home team names are taken from the
    4th and 5th hyphen-separated pieces of 'Game_ID'. The side whose score is
    higher than on the previous goal row of the same game is credited. On the
    first scored row of a game, the side that is ahead is credited.

    A row keeps its existing 'Primary_team' (instead of raising or guessing) when:
      * 'Score' is missing or is not two integers separated by one hyphen,
      * 'Game_ID' has fewer than five hyphen-separated pieces,
      * neither score went up since the previous goal row, or
      * it is the first scored row of its game and the score is tied, which
        cannot identify the scorer.

    NOTE(review): assumes sorting by 'Period' then 'Time' yields chronological
    order and that team names inside 'Game_ID' contain no hyphens - confirm.

    Args:
        df (pd.DataFrame): Must contain 'Event_type', 'Game_ID', 'Period',
            'Time', 'Score' and 'Primary_team'. Updated in place.

    Returns:
        pd.DataFrame: The same dataframe with 'Primary_team' set on goal rows.
    """
    def extract_teams(game_id):
        """Return (away_team, home_team) from Game_ID, or None if it is too short."""
        parts = str(game_id).split('-')
        if len(parts) < 5:
            return None
        return parts[3], parts[4]

    def parse_score(score_str):
        """Return (away_score, home_score) as ints, or None if the text is malformed."""
        try:
            away_text, home_text = str(score_str).split('-')
            return int(away_text), int(home_text)
        except ValueError:
            # Wrong number of pieces, or a piece that is not an integer
            return None

    # Work on a sorted copy of the goal rows; the original row labels are kept
    goal_df = df[df['Event_type'] == 'Goal'].copy()
    goal_df = goal_df.sort_values(by=['Game_ID', 'Period', 'Time'])

    # Last parsed (away, home) score seen for each game
    prev_scores = {}

    for idx, row in goal_df.iterrows():
        game_id = row['Game_ID']
        score_str = row['Score']

        if pd.isna(score_str):
            continue

        score = parse_score(score_str)
        teams = extract_teams(game_id)
        if score is None or teams is None:
            # One unreadable row should not abort the whole pass
            continue

        away_score, home_score = score
        away_team, home_team = teams

        scorer = None
        if game_id in prev_scores:
            prev_away, prev_home = prev_scores[game_id]
            if away_score > prev_away:
                scorer = away_team
            elif home_score > prev_home:
                scorer = home_team
        elif away_score > home_score:
            # First scored row of the game: credit the side that is ahead
            scorer = away_team
        elif home_score > away_score:
            scorer = home_team
        # else: tied with no earlier score on record - ambiguous, leave as is

        if scorer is not None:
            goal_df.at[idx, 'Primary_team'] = scorer

        # Update previous score
        prev_scores[game_id] = (away_score, home_score)

    # Merge updated Primary_team back into original dataframe (aligned on index)
    df.update(goal_df[['Primary_team']])
    return df

# Fill Primary_team on 'Goal' rows from the change in Score between consecutive goals
final_pbp_df = assign_primary_team(final_pbp_df)

In [44]:
### Not working correctly, only getting values when home team scores
# def transform_primary_player(name):
#     """Convert 'Last, First' to 'First Last'."""
#     if pd.isna(name) or ',' not in name:
#         return name
#     last, first = name.split(', ', 1)
#     return f"{first} {last}"

# def extract_correct_primary_team_v2(description, primary_player):
#     """
#     Improved method to correctly extract the goal-scoring team abbreviation.
#     - Finds where the primary player's name appears.
#     - Identifies the corresponding team abbreviation.
#     - Ensures it works for both home and away teams.
#     """
#     if pd.isna(description) or pd.isna(primary_player):
#         return None

#     # # Convert primary player format for matching
#     # primary_player = transform_primary_player(primary_player)

#     # Find all team sections in the format "TEAM: player1; player2; ..."
#     team_sections = re.findall(r'([A-Z\s]+):\s([^:]+)', description)

#     # Iterate over both teams
#     for team_abbr, player_list in team_sections:
#         team_abbr = team_abbr.strip()

#         # Check if the goal scorer's name is in this team's player list
#         player_names = [transform_primary_player(name.strip()) for name in player_list.split(';')]
#         if primary_player in player_names:
#             return team_abbr

#     return None

# # Call the function to extract the correct primary team
# final_pbp_df['Primary_team'] = final_pbp_df.apply(
#     lambda row: extract_correct_primary_team_v2(row['Description'], row['Primary_player']), axis=1
# )

# # Display the first few rows of the updated dataframe
# final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(12)


In [None]:
# Show the first 15 'Goal' rows to check the assigned Primary_team values
final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(15)

In [48]:
#### team_map is already in memory (created earlier from school_info_df).
### This cell can be used to add extra lower-case aliases before doing the substitution.
###
### TODO: Minnesota Duluth, American International and St. Lawrence still need attention -
###       the parsing is completely failing in at least some of their games.
### TODO: 'ST' is ambiguous across multiple teams, and 'Alas' shows up for both Alaska teams.
### NOTE(review): 'mia' maps to 'Maine' while 'oh' maps to 'Miami' - confirm 'mia' is intended.
_extra_team_aliases = {
    'michigan st': 'Michigan State',
    'linwod': 'Lindenwood',
    'sup': 'Lake Superior',
    'afa': 'Air Force',
    'anc': 'Alaska Anchorage',
    'asu': 'Arizona State',
    'aug': 'Augustana',
    'ben': 'Bentley',
    'bgsu santa': 'Bowling Green',
    'brown st.': 'Brown',
    'can': 'Canisius',
    'clk': 'Clarkson',
    'col': 'Colgate',
    'dak': 'North Dakota',
    'dame': 'Notre Dame',
    'den': 'Denver',
    'dul': 'Minnesota Duluth',
    'fsu': 'Ferris State',
    'har': 'Harvard',
    'har st.': 'Harvard',
    'int': 'American Intl',
    'lin': 'Lindenwood',
    'lwu': 'Lindenwood',
    'mia': 'Maine',
    'michst a': 'Michigan State',
    'minn pa': 'Minnesota',
    'neu': 'Northeastern',
    'no dak jamernik': 'North Dakota',
    'oh': 'Miami',
    'omaha van': 'Omaha',
    'pri': 'Princeton',
    'pri de la': 'Princeton',
    'prince de la': 'Princeton',
    'qui': 'Quinnipiac',
    'scs': 'St. Cloud State',
    'sd': 'Augustana',
    'shu': 'Sacred Heart',
    'slu': 'St. Lawrence',
    'stc': 'Stonehill',
    'u-m': 'Michigan',
    'uma': 'Massachusetts',
    'umd': 'Minnesota Duluth',
    'und': 'Notre Dame',
    'uni': 'Union',
    'ust': 'St. Thomas',
    'vermnt la': 'Vermont',
    'wis': 'Wisconsin',
    'wmu': 'Western Michigan',
    'yal': 'Yale',
}

# Item assignment (not a bulk update) is used because team_map is built in an earlier cell
for _alias, _standard_name in _extra_team_aliases.items():
    team_map[_alias] = _standard_name











# team_map

In [49]:
def standardize_primary_team(df, team_map):
    """
    Replace 'Primary_team' values with their standardized names from team_map.

    Each value is lower-cased and looked up in team_map; values with no entry
    keep their original text.

    Args:
        df (pd.DataFrame): Dataframe with a 'Primary_team' column (overwritten in place).
        team_map: Lookup from lower-case team alias to standardized team name.

    Returns:
        pd.DataFrame: The same dataframe with 'Primary_team' standardized.
    """
    raw_teams = df['Primary_team']
    lookup_keys = raw_teams.str.lower()
    standardized = lookup_keys.map(team_map)
    # Aliases missing from team_map come back as NaN; fall back to the original text
    df['Primary_team'] = standardized.fillna(raw_teams)

    return df

# Map Primary_team aliases to standardized names; values without a team_map entry stay as they are
final_pbp_df = standardize_primary_team(final_pbp_df, team_map)

In [None]:
# Spot-check 15 randomly chosen rows after the team-name standardization
final_pbp_df.sample(15)

In [None]:
## Save the cleaned play-by-play dataframe to a CSV file in temp_folder (row index not written)
final_pbp_df.to_csv(os.path.join(temp_folder, 'pbp_data_test_3.2.csv'), index=False)

In [47]:
### Save Updated Schedule DF (With PbP JSONs) to csv to avoid scraping for new tests
# NOTE(review): updated_schedule_df is not created in this part of the notebook -
# confirm an earlier cell defines it before running the notebook top-to-bottom.

updated_schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_PbP_JSON.csv'), index=False)
