# NCAA.com Play-by-play Data Scraper
-

In [21]:
## Set File Name and data
output_filename = 'pbp_data_2025_Mar_17.csv'

# example schedule URL
## First Day of season
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2024/10/04/all-conf

# Last Regular Season Day
# https://www.ncaa.com/scoreboard/icehockey-men/d1/2025/03/08/all-conf



In [22]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import os
import re
import json

from config import recent_clean_db, last_game_date

# File paths
data_folder = os.path.join('..', 'data/') # Data Folder Path
temp_folder = os.path.join('..', 'TEMP/',) # Temp Folder Path



In [23]:
# schedule_df.head()

## Scrape the NCAA.com schedule section
- Will Use Same code to extract complete past season's data
    - DATA GEOS BACK TO the 2017 Tournement (March)
    - FULL SEASON DATA AVAILABLE STARTING WITH 2017-18 (Sept 30 is first game of season)

- Creates a dataframe with Data - Teams and Game_ID_Number

- Turned off because it takes 6-7 minutes to run and we can use a previously scraped and locally stored schedule



In [24]:


# Base URL for NCAA schedule
base_url = "https://www.ncaa.com/scoreboard/icehockey-men/d1"

# Function to scrape a single day's schedule with rate limiting
def scrape_schedule(date):
    url = f"{base_url}/{date}/all-conf"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch data for {date}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    games = []

    # Locate game containers based on the provided HTML structure
    game_containers = soup.select('#scoreboardGames .gamePod')
    for game in game_containers:
        try:
            game_id = game.select_one('a.gamePod-link')['href'].split('/')[-1]
            teams = game.select('ul.gamePod-game-teams li')
            
            home_team = teams[0].select_one('span.gamePod-game-team-name').text.strip()
            away_team = teams[1].select_one('span.gamePod-game-team-name').text.strip()
            
            games.append({
                'Date': date,
                'Home_Team': home_team,
                'Away_Team': away_team,            

                'game_id_number': game_id
            })
        except Exception as e:
            print(f"Error processing game: {e}")

    return games

# Function to scrape a range of dates with rate limiting and progress bar
def scrape_schedule_range(start_date, end_date):
    date_range = pd.date_range(start=start_date, end=end_date).strftime('%Y/%m/%d')
    all_games = []
    
    # Progress bar setup
    for date in tqdm(date_range, desc="Scraping schedule", unit="day"):
        games = scrape_schedule(date)
        all_games.extend(games)
        time.sleep(1)  # Rate limiter: 1-second delay between requests

    return pd.DataFrame(all_games)

# Example usage
start_date = "2024-09-20"  # First day of the season
end_date = "2025-03-20"    # Last regular season day
schedule_df = scrape_schedule_range(start_date, end_date)

# Display the resulting dataframe
schedule_df

Scraping schedule:   0%|          | 0/182 [00:00<?, ?day/s]

Failed to fetch data for 2024/09/20: 404


Scraping schedule:   1%|          | 1/182 [00:03<11:22,  3.77s/day]

Failed to fetch data for 2024/09/21: 404


Scraping schedule:   1%|          | 2/182 [00:05<08:07,  2.71s/day]

Failed to fetch data for 2024/09/22: 404


Scraping schedule:   2%|▏         | 3/182 [00:07<07:03,  2.36s/day]

Failed to fetch data for 2024/09/23: 404


Scraping schedule:   2%|▏         | 4/182 [00:10<07:06,  2.40s/day]

Failed to fetch data for 2024/09/24: 404


Scraping schedule:   3%|▎         | 5/182 [00:12<06:46,  2.30s/day]

Failed to fetch data for 2024/09/25: 404


Scraping schedule:   3%|▎         | 6/182 [00:14<06:23,  2.18s/day]

Failed to fetch data for 2024/09/26: 404


Scraping schedule:   4%|▍         | 7/182 [00:16<06:07,  2.10s/day]

Failed to fetch data for 2024/09/27: 404


Scraping schedule:   4%|▍         | 8/182 [00:18<05:58,  2.06s/day]

Failed to fetch data for 2024/09/28: 404


Scraping schedule:   5%|▍         | 9/182 [00:20<05:49,  2.02s/day]

Failed to fetch data for 2024/09/29: 404


Scraping schedule:   5%|▌         | 10/182 [00:22<05:48,  2.03s/day]

Failed to fetch data for 2024/09/30: 404


Scraping schedule: 100%|██████████| 182/182 [07:20<00:00,  2.42s/day]


Unnamed: 0,Date,Home_Team,Away_Team,game_id_number
0,2024/10/04,Michigan St.,Lake Superior St.,6344272
1,2024/10/04,Minnesota St.,Michigan,6344249
2,2024/10/04,Bowling Green,Mercyhurst,6344336
3,2024/10/04,Colgate,UConn,6344337
4,2024/10/04,Miami (OH),Ferris St.,6344354
...,...,...,...,...
1152,2025/03/17,Michigan Tech,Bowling Green,6442975
1153,2025/03/17,Ferris St.,St. Thomas (MN),6442977
1154,2025/03/17,Bemidji St.,Augustana (SD),6442980
1155,2025/03/20,UConn,Boston U.,6443715


### Save / Load Local Copy of Schedule

In [25]:
### Save the schedule to a CSV file for later use
# schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'), index=False)
schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number_2025.csv'), index=False)

# Load the locally stored schedule to avoid having to scrape again
# schedule_df = pd.read_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_game_number.csv'))


In [26]:
schedule_df.tail(20)

Unnamed: 0,Date,Home_Team,Away_Team,game_id_number
1137,2025/03/15,Notre Dame,Michigan St.,6442668
1138,2025/03/15,UMass Lowell,Maine,6443718
1139,2025/03/15,St. Cloud St.,Western Mich.,6444069
1140,2025/03/15,Cornell,Colgate,6443575
1141,2025/03/15,Army West Point,Holy Cross,6443761
1142,2025/03/15,Northeastern,Boston College,6443719
1143,2025/03/15,Bemidji St.,Minnesota St.,6442967
1144,2025/03/15,North Dakota,Omaha,6444072
1145,2025/03/15,Colorado Col.,Denver,6444075
1146,2025/03/15,Penn St.,Ohio St.,6442669


### Data Transgformation
- NOT NESS IF WORKING WITH NEW SCRAPE 
    - Seperate team column into Home_Team, Away_Team

In [27]:
# Seperate team column into Home_Team, Away_Team

def handle_home_away(schedule_df):
    # Split Home_Team_Away_Team into Home_Team and Away_Team
    # schedule_df[['Away_Team', 'Home_Team']] = schedule_df['Home_Team_Away_Team'].str.split(' vs ', expand=True)
    
    # Remove punctuation and strip whitespace
    schedule_df['Home_Team'] = schedule_df['Home_Team'].apply(lambda x: re.sub(r'[^\w\s]', '', x).strip())
    schedule_df['Away_Team'] = schedule_df['Away_Team'].apply(lambda x: re.sub(r'[^\w\s]', '', x).strip())

    # Drop the original column
    # schedule_df = schedule_df.drop(columns=['Home_Team_Away_Team'])
    return schedule_df

# call the function
schedule_df = handle_home_away(schedule_df)
schedule_df.head(20)

Unnamed: 0,Date,Home_Team,Away_Team,game_id_number
0,2024/10/04,Michigan St,Lake Superior St,6344272
1,2024/10/04,Minnesota St,Michigan,6344249
2,2024/10/04,Bowling Green,Mercyhurst,6344336
3,2024/10/04,Colgate,UConn,6344337
4,2024/10/04,Miami OH,Ferris St,6344354
5,2024/10/04,Arizona St,Air Force,6344335
6,2024/10/05,Michigan St,Lake Superior St,6344273
7,2024/10/05,Minnesota St,Michigan,6344250
8,2024/10/05,Bemidji St,Minn Duluth,6344341
9,2024/10/05,Massachusetts,Bentley,6344346


### Load School info and replace ncaa_names with standard Team names from existing data

In [28]:
# Load School info 
school_info_path = os.path.join(data_folder, 'arena_school_info.csv')
school_info_df = pd.read_csv(school_info_path)
# school_info_df.head() # Check data

# Function to map team names to standardized names
def map_team_names(schedule_df, school_info_df):
    # Create a mapping dictionary from school_info_df
    team_mapping = {
        re.sub(r'[^\w\s]', '', row['ncaa_name']).strip(): row['Team']
        for _, row in school_info_df.iterrows()
    }

    # Map Home_Team and Away_Team to standardized names
    schedule_df['Home_Team'] = schedule_df['Home_Team'].apply(lambda x: team_mapping.get(re.sub(r'[^\w\s]', '', x).strip(), x))
    schedule_df['Away_Team'] = schedule_df['Away_Team'].apply(lambda x: team_mapping.get(re.sub(r'[^\w\s]', '', x).strip(), x))

    return schedule_df

# Call the function
schedule_df = map_team_names(schedule_df, school_info_df)

# Check the data
schedule_df.head(20)


Unnamed: 0,Date,Home_Team,Away_Team,game_id_number
0,2024/10/04,Michigan State,Lake Superior,6344272
1,2024/10/04,Minnesota State,Michigan,6344249
2,2024/10/04,Bowling Green,Mercyhurst,6344336
3,2024/10/04,Colgate,Connecticut,6344337
4,2024/10/04,Miami,Ferris State,6344354
5,2024/10/04,Arizona State,Air Force,6344335
6,2024/10/05,Michigan State,Lake Superior,6344273
7,2024/10/05,Minnesota State,Michigan,6344250
8,2024/10/05,Bemidji State,Minnesota Duluth,6344341
9,2024/10/05,Massachusetts,Bentley,6344346


### Create a New Column with Game_ID to match with the rest of the Database

In [29]:
# Function to create a unique Game_ID
def create_game_id(schedule_df):
    schedule_df['Game_ID'] = schedule_df.apply(
        lambda row: f"{row['Date'].replace('/', '-')}-{row['Away_Team']}-{row['Home_Team']}", axis=1
    )
    return schedule_df

# Call the function
schedule_df = create_game_id(schedule_df)

In [30]:
schedule_df.tail(20)

Unnamed: 0,Date,Home_Team,Away_Team,game_id_number,Game_ID
1137,2025/03/15,Notre Dame,Michigan State,6442668,2025-03-15-Michigan State-Notre Dame
1138,2025/03/15,Mass. Lowell,Maine,6443718,2025-03-15-Maine-Mass. Lowell
1139,2025/03/15,St Cloud State,Western Michigan,6444069,2025-03-15-Western Michigan-St Cloud State
1140,2025/03/15,Cornell,Colgate,6443575,2025-03-15-Colgate-Cornell
1141,2025/03/15,Army,Holy Cross,6443761,2025-03-15-Holy Cross-Army
1142,2025/03/15,Northeastern,Boston College,6443719,2025-03-15-Boston College-Northeastern
1143,2025/03/15,Bemidji State,Minnesota State,6442967,2025-03-15-Minnesota State-Bemidji State
1144,2025/03/15,North Dakota,Omaha,6444072,2025-03-15-Omaha-North Dakota
1145,2025/03/15,Colorado College,Denver,6444075,2025-03-15-Denver-Colorado College
1146,2025/03/15,Penn State,Ohio State,6442669,2025-03-15-Ohio State-Penn State


## Get Play By Play JSONs
- 

In [31]:
# breakpoint()

## Using Custom API to Call NCAA.com
- project developed by henrygd - https://github.com/henrygd/ncaa-api

Uses his custom built API to get JSON response from NCAA.com
- can host own server for large projects for now I am using his public link

In [32]:
import requests
from datetime import datetime
# 6344241

# Base URL for the custom API
base_url = "https://ncaa-api.henrygd.me/game"

# Function to get play-by-play JSON for a single game
def get_play_by_play(game_id_number):
    url = f"{base_url}/{game_id_number}/play-by-play"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for Game ID {game_id_number}: {e}")
        return None

# Function to fetch JSON data for all completed games
def fetch_play_by_play_data(schedule_df):
    # Filter for games that have already taken place
    schedule_df['Date'] = pd.to_datetime(schedule_df['Date'])
    today = pd.to_datetime(datetime.now().strftime('%Y-%m-%d'))
    completed_games = schedule_df[schedule_df['Date'] < today].copy()


    # Initialize a new column for play-by-play JSON
    completed_games['Play_By_Play_JSON'] = None

    for index, row in completed_games.iterrows():
        game_id_number = row['game_id_number']
        json_data = get_play_by_play(game_id_number)
        completed_games.at[index, 'Play_By_Play_JSON'] = json_data

    return completed_games


# Fetch and update the dataframe with play-by-play JSONs
updated_schedule_df = fetch_play_by_play_data(schedule_df)

# Check the updated dataframe
updated_schedule_df.tail()


Error fetching data for Game ID 6344537: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6344537/play-by-play
Error fetching data for Game ID 6382719: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6382719/play-by-play
Error fetching data for Game ID 6382720: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6382720/play-by-play
Error fetching data for Game ID 6383573: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6383573/play-by-play
Error fetching data for Game ID 6384205: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6384205/play-by-play
Error fetching data for Game ID 6443765: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6443765/play-by-play
Error fetching data for Game ID 6443762: 404 Client Error: Not Found for url: https://ncaa-api.henrygd.me/game/6443762/play-by-play


Unnamed: 0,Date,Home_Team,Away_Team,game_id_number,Game_ID,Play_By_Play_JSON
1147,2025-03-15,Minnesota Duluth,Arizona State,6444078,2025-03-15-Arizona State-Minnesota Duluth,{'inputMD5Sum': '8452ab2daed6cfeef77f99e2d011e...
1148,2025-03-16,Bentley,Sacred Heart,6443765,2025-03-16-Sacred Heart-Bentley,
1149,2025-03-16,Army,Holy Cross,6443762,2025-03-16-Holy Cross-Army,
1150,2025-03-16,Harvard,Clarkson,6443574,2025-03-16-Clarkson-Harvard,{'inputMD5Sum': '2eb0ed5b1c2f145243f73cda68596...
1151,2025-03-16,Colorado College,Denver,6444076,2025-03-16-Denver-Colorado College,{'inputMD5Sum': '5e54f0b126513461c06dbe2fc8a3a...


In [33]:
## Save the dataframe in a way that doesn't cut off the json data - CSV cuts off the json data
## Use Pickle
# Save the updated dataframe to a pickle file
updated_schedule_df.to_pickle(os.path.join(temp_folder, 'schedule_with_play_by_play.pkl'))

## Load pickle file to avoid having to scrape again
# Load the updated dataframe from a pickle file
# updated_schedule_df = pd.read_pickle(os.path.join(temp_folder, 'schedule_with_play_by_play.pkl'))



### Transform Play by Play JSONs

#### Extract all decriptions into a text file to study
- Use this to make an abbr and alternate name dictionary for a find and replace

###### Team Standardization

##### Create team Map for name subsitutions

In [34]:
# Create team name mapping from school_info_df
team_mapping = {}
for _, row in school_info_df.iterrows():
    standard_name = row['Team']
    alternatives = [a.strip() for a in row['ncaa_data_alts'].split(',')]
    for alt in alternatives:
        team_mapping[alt.lower()] = standard_name

team_mapping
team_map = team_mapping


###### Parsing

In [35]:
# school_info_df.head()
# Create a dictionary mapping the alternate names (ncaa_data_alts) to the standardized names


In [36]:
updated_schedule_df

# Output Play by Play JSON to a raw text file
# Function to save JSON data to a text file
def save_json_to_file(json_data, file_path):
    with open(file_path, 'w') as file:
        json.dump(json_data, file)

## Call the function for the first row
save_json_to_file(updated_schedule_df.iloc[0]['Play_By_Play_JSON'], os.path.join(temp_folder, 'play_by_play.json'))


In [37]:
updated_schedule_df.tail()

Unnamed: 0,Date,Home_Team,Away_Team,game_id_number,Game_ID,Play_By_Play_JSON
1147,2025-03-15,Minnesota Duluth,Arizona State,6444078,2025-03-15-Arizona State-Minnesota Duluth,{'inputMD5Sum': '8452ab2daed6cfeef77f99e2d011e...
1148,2025-03-16,Bentley,Sacred Heart,6443765,2025-03-16-Sacred Heart-Bentley,
1149,2025-03-16,Army,Holy Cross,6443762,2025-03-16-Holy Cross-Army,
1150,2025-03-16,Harvard,Clarkson,6443574,2025-03-16-Clarkson-Harvard,{'inputMD5Sum': '2eb0ed5b1c2f145243f73cda68596...
1151,2025-03-16,Colorado College,Denver,6444076,2025-03-16-Denver-Colorado College,{'inputMD5Sum': '5e54f0b126513461c06dbe2fc8a3a...


#### ChatGPT attempt at parsing

In [38]:
### Decode the Description field in to deal with problems caused by special characters
def clean_description_encoding_safe(df, column='Description'):
    """
    Fixes encoding issues and malformed characters in the specified column safely.
    Ignores characters that cannot be decoded.
    """
    df[column] = df[column].apply(
        lambda x: x.encode('latin1', 'ignore').decode('utf-8', 'ignore') if isinstance(x, str) else x
    )
    return df

#### Take Spaces out of Multi-part Team Abbreviations

In [39]:
import re
import pandas as pd

## Function to remove spaces if they are between two capital letters, used to clean Description column before parsing
def remove_spaces_between_two_cap_pairs(df, column='Description'):
    """
    Removes spaces only if the pattern is two consecutive capital letters, 
    followed by a space, followed by another two consecutive capital letters.
    """
    df[column] = df[column].apply(lambda x: re.sub(r'([A-Z]{2})\s([A-Z]{2})', r'\1\2', x) if isinstance(x, str) else x)
    return df

#### Special Function to Deal with the special cases in the description column
## "Alas. Fairbanks" = "AKFBK"
## "Alas. Anchorage" = "AKANC"
## "St. Lawerence" = "STLAW"
## "St. Cloud St." = "SCS"
## "St. Thomas (MN)" = "STC"
## 

## Function to look for those exact strings and replace them with the correct abbreviation in Description column
def special_cases(df, column='Description'):
    df[column] = df[column].str.replace("Alas. Fairbanks", "AKFBK")
    df[column] = df[column].str.replace("Alas. Anchorage", "AKANC")
    df[column] = df[column].str.replace("St. Lawrence", "STLAW")
    df[column] = df[column].str.replace("St. Cloud St.", "SCS")
    df[column] = df[column].str.replace("St. Thomas (MN)", "STC")

    return df

### Special Player name cases
# "Santa juana" = "Santa-Juana"
# "St. louis' = "St-Louis"
# "Jamernik v" = "Jamernik"
# "Van houtte-cachero" = "Van-Houtte-Cachero"
# "Van why" = "Van-Why"
# "Gustafsson nyberg" = "Gustafsson-Nyberg"
# "La starza" = "La-Starza"
# "De la durantaye" = "De-La-Durantaye"



def special_player_cases(df, column='Description'):
    df[column] = df[column].str.replace("Santa juana", "Santa-Juana")
    df[column] = df[column].str.replace("St. louis", "St-Louis")
    df[column] = df[column].str.replace("St. Louis", "St-Louis")
    df[column] = df[column].str.replace("Jamernik v", "Jamernik")
    df[column] = df[column].str.replace("Van houtte-cachero", "Van-Houtte-Cachero")
    df[column] = df[column].str.replace("Van why", "Van-Why")
    df[column] = df[column].str.replace("Gustafsson nyberg", "Gustafsson-Nyberg")
    df[column] = df[column].str.replace("La starza", "La-Starza")
    df[column] = df[column].str.replace("De la durantaye", "De-La-Durantaye")
    df[column] = df[column].str.replace("de la durantaye", "De-La-Durantaye")

    return df





    

In [40]:
import pandas as pd
import re
import unicodedata

# Function to convert period and time to continuous time
### ORIGINAL FUNCTION WORKS FOR CURRENT (24-25) SEASON
# def convert_to_continuous_time(period, time):
#     period_offsets = {'1': 0, '2': 20, '3': 40, 'OT': 60}
#     minutes, seconds = map(int, time.split(':'))
#     elapsed_time = (20 - minutes) * 60 + -seconds
#     offset = period_offsets.get(period, 0) * 60
#     return offset + elapsed_time

#### HOTFIX FOR OLDER SEASONS BECAUSE THE TIME IS THE INVERSE OF THE OTHER FORMAT
def convert_to_continuous_time(period, time):
    period_offsets = {'1': 0, '2': 20, '3': 40, 'OT': 60}
    minutes, seconds = map(int, time.split(':'))
    elapsed_time = (minutes) * 60 + seconds
    offset = period_offsets.get(period, 0) * 60
    return offset + elapsed_time

# Function to normalize names to handle accents and special characters
def normalize_name(name):
    if not name:
        return None
    # Normalize Unicode accents and remove non-ASCII characters
    normalized = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')
    return normalized

# Enhanced player name formatting function
def clean_player_name(name):
    """
    Converts a name from "Last, First" to "First Last", handling punctuation and normalization.
    Example: "Hughes, T.J." -> "T.J. Hughes"
    """
    if not name:
        return None
    name = normalize_name(name)
    parts = [p.strip() for p in name.split(',')]
    if len(parts) == 2:
        last, first = parts
        return f"{first} {last}".strip()
    return name

# Function to parse play-by-play descriptions
# Improved to handle team abbreviations and player names with issues
def parse_description(description):
    """
    Parse a single play-by-play description into structured fields.
    """
    desc_lower = description.lower().strip()
    parsed = {
        "Event_type": "Other",
        "Primary_player": None,
        "Primary_team": None,
        "Secondary_player": None,
        "Secondary_team": None,
        "Outcome": None,
    }

    # Normalize known team abbreviations
    team_map = {
        'michst': 'MICHST',
        'lake sup': 'LK SUP',
        'lk sup': 'LK SUP',
        'michigan state': 'MICHST',
        'lake superior': 'LK SUP'
    }

    for key, value in team_map.items():
        desc_lower = desc_lower.replace(key, value.lower())

    # --- Faceoff ---
    if "faceoff" in desc_lower:
        parsed["Event_type"] = "Faceoff"
        faceoff_pattern = (
            r"Faceoff\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+"
            r"vs\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+"
            r"won by\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\."
        )
        match = re.search(faceoff_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Secondary_player"] = clean_player_name(match.group(2))
            parsed["Primary_team"] = match.group(3).strip()
            parsed["Outcome"] = "won"
        return parsed

    # --- Goal ---
    if "goal by" in desc_lower:
        parsed["Event_type"] = "Goal"
        goal_scorer_pattern = r"Goal by\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)"
        match = re.search(goal_scorer_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
        return parsed

    # --- Penalty ---
    if desc_lower.startswith("penalty on"):
        parsed["Event_type"] = "Penalty"
        penalty_pattern = (
            r"Penalty on\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+, [A-Za-zÀ-ÖØ-ÿ'\.\- ]+)\s+"
            r"([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+(\d+) minutes for (.+)"
        )
        match = re.search(penalty_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Primary_team"] = match.group(2).strip()
            parsed["Penalty_duration"] = match.group(3).strip()
            parsed["Penalty_type"] = match.group(4).strip()
        return parsed

    # --- Shot ---
    if "shot by" in desc_lower:
        parsed["Event_type"] = "Shot"
        shot_pattern = r"Shot by\s+([A-Za-zÀ-ÖØ-öø-ÿ'\.\- ]+)\s+(.+)"
        match = re.search(shot_pattern, description, re.IGNORECASE)
        if match:
            parsed["Primary_player"] = clean_player_name(match.group(1))
            parsed["Primary_team"] = match.group(2).strip()
        return parsed

    return parsed

# Function to transform a single game's JSON data into a dataframe
def transform_single_game(json_data, game_id):
    rows = []

    for period in json_data['periods']:
        period_number = period['periodNumber']
        for play in period['playStats']:
            row = {
                'Game_ID': game_id,
                'Period': period_number,
                'Time': play['time'],
                'Description': play['visitorText'] or play['homeText'],
                'Score': play['score']
            }
            rows.append(row)
    

    game_df = pd.DataFrame(rows)
    
    # Convert period and time to continuous time
    game_df['Period'] = game_df['Period'].replace({'1st': '1', '2nd': '2', '3rd': '3', 'OT': 'OT'})
    game_df['Time'] = game_df.apply(lambda row: convert_to_continuous_time(row['Period'], row['Time']), axis=1)
    # Apply function to deal with special characters in the Description column
    clean_description_encoding_safe(game_df, 'Description')
    # Apply Function to deal with player name special cases (last names with spaces)
    special_player_cases(game_df, 'Description')
    # Apply the function to make special supstitutions in the Description column
    game_df = special_cases(game_df, 'Description')
    ## Apply the modified function to clean Team names in the 'Description' column
    game_df = remove_spaces_between_two_cap_pairs(game_df, 'Description')

    # Parse descriptions
    parsed_descriptions = game_df['Description'].apply(parse_description)
    parsed_df = pd.DataFrame(parsed_descriptions.tolist())

    # Combine with original game_df
    return pd.concat([game_df, parsed_df], axis=1)

# Function to process all games and combine into a single dataframe
def process_all_games(schedule_df):
    all_games = []

    for _, row in schedule_df.iterrows():
        game_id = row['Game_ID']
        json_data = row['Play_By_Play_JSON']

        if json_data:
            game_df = transform_single_game(json_data, game_id)
            all_games.append(game_df)

    return pd.concat(all_games, ignore_index=True)

# Example usage
# Assuming `updated_schedule_df` is the dataframe containing the JSON play-by-play data
final_pbp_df = process_all_games(updated_schedule_df)

# Display the resulting dataframe
# final_pbp_df.head(12)

In [41]:
## look at tail
# final_pbp_df.tail(12)

In [42]:
# Function to handle 'SAVE' case
def move_save(row):
    if pd.notnull(row['Primary_team']) and ', save' in row['Primary_team']:
        row['Secondary_player'] = row['Primary_team'].split(', save')[1].strip()
        row['Primary_team'] = row['Primary_team'].split(', save')[0].strip()


        row['Primary_team'] = row['Primary_team'].replace(', save', '').strip()
    return row

# Function to handle 'BLOCKED' case
def move_blocked(row):
    if pd.notnull(row['Primary_team']) and 'BLOCKED' in row['Primary_team']:
        blocked_match = re.search(r'BLOCKED by (.+)', row['Primary_team'])
        if blocked_match:
            row['Secondary_player'] = blocked_match.group(1).strip()
            row['Primary_team'] = re.sub(r'BLOCKED by .+', 'BLOCKED', row['Primary_team']).strip()
    return row

# Function to extract and move the outcome to 'Outcome' column
def move_outcome(row):
    if pd.notnull(row['Primary_team']):
        outcome_match = re.search(r'\b(MISSED|WIDE|BLOCKED|SAVE)\b', row['Primary_team'])
        if outcome_match:
            row['Outcome'] = outcome_match.group(1)
            row['Primary_team'] = re.sub(r'\b(MISSED|WIDE|BLOCKED|SAVE)\b', '', row['Primary_team']).strip()
    return row

# Apply transformations sequentially
final_pbp_df = final_pbp_df.apply(move_save, axis=1)
final_pbp_df = final_pbp_df.apply(move_blocked, axis=1)
final_pbp_df = final_pbp_df.apply(move_outcome, axis=1)

# # Display the first few rows of the cleaned dataframe


# # # Notes for second step of transformation
# # # Faceoff Seem to be working as intended
# # # Goal - Primary_player is working as intended
# #     # - Primary Team is not being captured - probably because the team name is used and not the abbreviation
# # # Shots - Primary_player actually contains the team abbreviation
# #     # - Primary_team includes the player name and still includes the shot outcome WIDE, BLOCKED, MISSED in the
# #     #  - UPPER CASE - Need to remove the outcome from the team name and move to outcome column
# #     #  - MISSED IS THE SAME AS SAVED - also includes the goalie name after the outcome
# #     #  - BLOCKED also includes the secondary player name after the outcome

In [43]:
### Next Steps 
# For all Event_type: Shot swap the Primary_player and Primary_team values

# Function to swap 'Primary_player' and 'Primary_team' for 'Shot' events
def swap_shot_columns(row):
    if row['Event_type'] == 'Shot':
        row['Primary_player'], row['Primary_team'] = row['Primary_team'], row['Primary_player']
    return row

# Apply the function to the dataframe
final_pbp_df = final_pbp_df.apply(swap_shot_columns, axis=1)





In [44]:
## Need to deal with foriegn names like Tommi Mannisto (which has accents and appears like MÃ£Â„nnistÃ£Â–, Tommi. in the data

def fix_encoding_issues(df, columns):
    """
    Fix encoding issues in specified columns of a dataframe.
    
    Args:
        df (pd.DataFrame): The dataframe containing columns with text issues.
        columns (list): List of column names to fix.
        
    Returns:
        pd.DataFrame: The dataframe with fixed text in specified columns.
    """
    def decode_text(text):
        try:
            # Decode from 'latin1' and re-encode to 'utf-8'
            return text.encode('latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError, AttributeError):
            # Return text as is if decoding fails
            return text
    
    for col in columns:
        df[col] = df[col].apply(decode_text)
    
    return df

# Apply the function to the dataframe
columns_to_fix = ['Primary_player', 'Secondary_player']

final_pbp_df = fix_encoding_issues(final_pbp_df, columns_to_fix)# Display the first few rows of the cleaned dataframe
# final_pbp_df.head(12)



In [45]:
def standardize_names(df, columns):
    """
    Standardize player names in the specified columns to 'First Last' format.
    
    Args:
        df (pd.DataFrame): The dataframe containing player name columns.
        columns (list): List of column names to standardize.
        
    Returns:
        pd.DataFrame: The dataframe with standardized player names.
    """
    def fix_name_format(name):
        if pd.isnull(name):  # Handle missing values
            return name
        name = name.replace(".", "")  # Remove periods
        if "," in name:  # If the name is in 'Last, First' format
            parts = name.split(",")
            return f"{parts[1].strip()} {parts[0].strip()}"  # Rearrange to 'First Last'
        return name.strip()  # Return as is if already in 'First Last' format

    for col in columns:
        df[col] = df[col].apply(fix_name_format)
    
    return df

# Apply the function to the dataframe
final_pbp_df = standardize_names(final_pbp_df, columns_to_fix)
# Display the first few rows of the cleaned dataframe
# final_pbp_df.head(12)

In [46]:
import pandas as pd
import re

def classify_power_play_events(df):
    """
    Classify 'Other' Event_type as 'PP - Start' or 'PP - End' based on the Description,
    and extract the team abbreviation to the Primary_team column.
    
    Args:
        df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', and 'Primary_team' columns.
        
    Returns:
        pd.DataFrame: The updated dataframe with classified 'Event_type' and filled 'Primary_team'.
    """
    def classify_event(row):
        if row['Event_type'] == 'Other':
            description = row['Description']
            # Check for "Start power play for"
            if re.search(r"Start power play for", description):
                row['Event_type'] = 'PP - Start'
                row['Primary_team'] = description.split('for')[-1].strip().rstrip('.')
            # Check for "End power play for"
            elif re.search(r"End power play for", description):
                row['Event_type'] = 'PP - End'
                row['Primary_team'] = description.split('for')[-1].strip().rstrip('.')
        return row

    # Apply the classification function row-wise
    df = df.apply(classify_event, axis=1)
    return df

# Apply the function to classify power play events
final_pbp_df = classify_power_play_events(final_pbp_df)

# Display the first few rows of the updated dataframe
# final_pbp_df.head(22)



In [47]:
# Examine the data
final_pbp_df.info()
# Value counts
# final_pbp_df['Event_type'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227409 entries, 0 to 227408
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Game_ID           227409 non-null  object
 1   Period            227409 non-null  object
 2   Time              227409 non-null  int64 
 3   Description       227409 non-null  object
 4   Score             227409 non-null  object
 5   Event_type        227409 non-null  object
 6   Primary_player    198085 non-null  object
 7   Primary_team      205776 non-null  object
 8   Secondary_player  156638 non-null  object
 9   Secondary_team    0 non-null       object
 10  Outcome           183990 non-null  object
 11  Penalty_duration  7947 non-null    object
 12  Penalty_type      7947 non-null    object
dtypes: int64(1), object(12)
memory usage: 22.6+ MB


### Classify Media Time out Specifically
Description = 'Media time out.' ==> Event_type = Media TO

In [48]:
### Classify Media Time out Specifically
# If Description = 'Media time out.' ==> Event_type = Media TO

# Function to classify 'Media TO' events

def classify_event(row):
    if row['Event_type'] == 'Other' and row['Description'] == 'Media time out.':
        row['Event_type'] = 'Media TO'
    return row


# Apply the classification function row-wise
final_pbp_df = final_pbp_df.apply(classify_event, axis=1)



In [49]:
# Show Media TO Events
# final_pbp_df[final_pbp_df['Event_type'] == 'Media TO']

##### Deal with Goalie change / info rows

In [50]:
import pandas as pd
import re

def classify_goalie_moves(df):
    """
    Parse goalie moves from the Description column and classify them as 'Goalie Move'.
    Extract the goalie name as Primary_player and the team name as Primary_team.
    
    Args:
        df (pd.DataFrame): The dataframe containing 'Event_type', 'Description', 'Primary_player', and 'Primary_team' columns.
        
    Returns:
        pd.DataFrame: The updated dataframe with classified 'Event_type', 'Primary_player', and 'Primary_team'.
    """
    def parse_goalie_move(row):
        if row['Event_type'] == 'Other':  # Only process rows marked as 'Other'
            description = row['Description']
            # Look for patterns like 'Name at goalie for Team'
            match = re.match(r"(.+?) at goalie for (.+?)\.", description)
            if match:
                row['Event_type'] = 'Goalie Move'
                row['Primary_player'] = match.group(1).strip()  # Extract the goalie's name
                row['Primary_team'] = match.group(2).strip()  # Extract the team name
        return row

    # Apply the parsing function row-wise
    df = df.apply(parse_goalie_move, axis=1)
    return df

# Apply the function to classify goalie moves
final_pbp_df = classify_goalie_moves(final_pbp_df)



In [51]:
## In the Outcome column relabel MISSED to SAVED for clairty
def relabel_missed_to_saved(df):
    """
    Relabel 'MISSED' to 'SAVED' in the Outcome column.
    
    Args:
        df (pd.DataFrame): The dataframe containing the Outcome column.
        
    Returns:
        pd.DataFrame: The updated dataframe with relabeled outcomes.
    """
    df['Outcome'] = df['Outcome'].replace('MISSED', 'SAVED')
    return df

# Apply the relabeling function
final_pbp_df = relabel_missed_to_saved(final_pbp_df)

In [52]:
# Clean the Penalty_type column to remove periods and any leading/trailing whitespace
def clean_penalty_type(df):
    """
    Clean the Penalty_type column by removing periods and extra whitespace.
    
    Args:
        df (pd.DataFrame): The dataframe containing the Penalty_type column.
        
    Returns:
        pd.DataFrame: The updated dataframe with cleaned Penalty_type.
    """
    df['Penalty_type'] = df['Penalty_type'].str.replace('.', '', regex=False).str.strip()
    return df

# Apply the cleaning function
final_pbp_df = clean_penalty_type(final_pbp_df)

In [53]:
## Show 5 goal events
final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(5)


Unnamed: 0,Game_ID,Period,Time,Description,Score,Event_type,Primary_player,Primary_team,Secondary_player,Secondary_team,Outcome,Penalty_duration,Penalty_type
36,2024-10-04-Lake Superior-Michigan State,1,363,"Goal by Howard, Isaac (EVENSTRENGTH, FIRSTGOAL...",1-0,Goal,Isaac Howard,,,,,,
40,2024-10-04-Lake Superior-Michigan State,1,240,"Goal by Milburn, Connor (EVENSTRENGTH) Assist ...",1-1,Goal,Connor Milburn,,,,,,
186,2024-10-04-Lake Superior-Michigan State,4,261,"Goal by Russell, Daniel (EVENSTRENGTH, OVERTIM...",2-1,Goal,Daniel Russell,,,,,,
256,2024-10-04-Michigan-Minnesota State,2,2367,"Goal by Whitelaw, William (POWER-PLAY, FIRSTGO...",0-1,Goal,William Whitelaw,,,,,,
268,2024-10-04-Michigan-Minnesota State,2,2135,"Goal by Carrabes, Brian (EVENSTRENGTH) Assist ...",1-1,Goal,Brian Carrabes,,,,,,


## Start Wed 1-29
- clean the goal events

In [54]:
### Grab the Goal Conditions within the parenthesis in the definition
### Put in a new column called Goal_Conditions
def extract_goal_conditions(df):
    """
    Extract goal conditions from the Description column and add them to a new column called Goal_Conditions.
    
    Args:
        df (pd.DataFrame): The dataframe containing the Description column.
        
    Returns:
        pd.DataFrame: The updated dataframe with the Goal_Conditions column.
    """
    def extract_conditions(description):
        match = re.search(r'\((.*?)\)', description)
        return match.group(1) if match else None

    df['Goal_Conditions'] = df['Description'].apply(extract_conditions)
    return df

# Apply the extraction function
final_pbp_df = extract_goal_conditions(final_pbp_df)

In [55]:
# Filter to just Goal events to check the Goal_Conditions column
final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(5)




Unnamed: 0,Game_ID,Period,Time,Description,Score,Event_type,Primary_player,Primary_team,Secondary_player,Secondary_team,Outcome,Penalty_duration,Penalty_type,Goal_Conditions
36,2024-10-04-Lake Superior-Michigan State,1,363,"Goal by Howard, Isaac (EVENSTRENGTH, FIRSTGOAL...",1-0,Goal,Isaac Howard,,,,,,,"EVENSTRENGTH, FIRSTGOAL"
40,2024-10-04-Lake Superior-Michigan State,1,240,"Goal by Milburn, Connor (EVENSTRENGTH) Assist ...",1-1,Goal,Connor Milburn,,,,,,,EVENSTRENGTH
186,2024-10-04-Lake Superior-Michigan State,4,261,"Goal by Russell, Daniel (EVENSTRENGTH, OVERTIM...",2-1,Goal,Daniel Russell,,,,,,,"EVENSTRENGTH, OVERTIME, SUDDENDEATH, GAMEWINNI..."
256,2024-10-04-Michigan-Minnesota State,2,2367,"Goal by Whitelaw, William (POWER-PLAY, FIRSTGO...",0-1,Goal,William Whitelaw,,,,,,,"POWER-PLAY, FIRSTGOAL"
268,2024-10-04-Michigan-Minnesota State,2,2135,"Goal by Carrabes, Brian (EVENSTRENGTH) Assist ...",1-1,Goal,Brian Carrabes,,,,,,,EVENSTRENGTH


In [56]:
def assign_primary_team(df):
    """
    Assigns the correct 'Primary_team' to goal events by analyzing score changes.
    """
    # Extract away and home team names from Game_ID
    def extract_teams(game_id):
        parts = game_id.split('-')
        away_team = parts[3]
        home_team = parts[4]
        return away_team, home_team

    # Filter only goal events
    goal_df = df[df['Event_type'] == 'Goal'].copy()

    # Sort by game and time sequence
    goal_df.sort_values(by=['Game_ID', 'Period', 'Time'], inplace=True)

    # Initialize previous scores dictionary
    prev_scores = {}

    # Intiaalize away_score and home_score with default values

    # Iterate over goal events
    for idx, row in goal_df.iterrows():
        game_id = row['Game_ID']
        score_str = row['Score']

        # Intiaalize away_score and home_score with default values
        away_score, home_score = 0, 0
        
        if pd.isna(score_str):
            continue

        # Parse score into integers Unless the string is empty
        if score_str:
            away_score, home_score = map(int, score_str.split('-'))
        else:
            None

        # away_score, home_score = map(int, score_str.split('-'))

        # Extract teams
        away_team, home_team = extract_teams(game_id)

        # Check previous score to determine which team scored
        if game_id in prev_scores:
            prev_away, prev_home = prev_scores[game_id]

            if away_score > prev_away:
                goal_df.at[idx, 'Primary_team'] = away_team
            elif home_score > prev_home:
                goal_df.at[idx, 'Primary_team'] = home_team
        else:
            # First goal of the game, determine scorer by score value
            if away_score > home_score:
                goal_df.at[idx, 'Primary_team'] = away_team
            else:
                goal_df.at[idx, 'Primary_team'] = home_team

        # Update previous score
        prev_scores[game_id] = (away_score, home_score)

    # Merge updated Primary_team back into original dataframe
    df.update(goal_df[['Primary_team']])
    return df

# Apply function to the dataframe
final_pbp_df = assign_primary_team(final_pbp_df)

In [57]:
# final_pbp_df[final_pbp_df['Event_type'] == 'Goal'].head(15)

In [58]:
#### TEAM_MAP is in memory (created earlier from school_info_df)
### This block and be used to add additional team mappings before doing the substitution

team_map['lksup'] = 'Lake Superior'
team_map['miaoh'] = 'Miami'
team_map['azst'] = 'Arizona State'
team_map['akanc'] = 'Alaska Anchorage'
team_map['akfbk'] = 'Alaska'
team_map['amint'] = 'American Intl'
team_map['augsd'] = 'Augustana'
team_map['bsu'] = 'Bemidji State'
team_map['cocol'] = 'Colorado College'
team_map['maine'] = 'Maine'
team_map['mndul'] = 'Minnesota Duluth'
team_map['nodak'] = 'North Dakota'
team_map['stlaw'] = 'St. Lawrence'
team_map['psu'] = 'Penn State'








### ORIGINAL MAP BEFORE I ADDED THE STEP TO REMOVE SPACES BETWEEN TWO CAPS
# ### NEED TO ADDRESS MINNESOTA DUTITH, Amerrican International, St Lawerence - THE PARSING IS COMPLETELY FAILING IN AT LEAST SOME OF THEIR GAME
# #### ST is also a mess with mutiple teams . Alas is also showing up for bothe Alaska teams
# # add 'michigan st': 'Michigan State', to the team_map

team_map['michigan st'] = 'Michigan State'
team_map['linwod'] = 'Lindenwood'
team_map['sup'] = 'Lake Superior'
team_map['afa'] = 'Air Force'
team_map['anc'] = 'Alaska Anchorage'
team_map['asu'] = 'Arizona State'
team_map['aug'] = 'Augustana'
team_map['ben'] = 'Bentley'
# team_map['bgsu santa'] = 'Bowling Green'
# team_map['brown st.'] = 'Brown'
team_map['can'] = 'Canisius'
team_map['clk'] = 'Clarkson'
team_map['col'] = 'Colgate'
team_map['dak'] = 'North Dakota'
team_map['dame'] = 'Notre Dame'
team_map['den'] = 'Denver'
team_map['dul'] = 'Minnesota Duluth'
team_map['fsu'] = 'Ferris State'
team_map['har'] = 'Harvard'
# team_map['har st.'] = 'Harvard'
team_map['int'] = 'American Intl'
team_map['lin'] = 'Lindenwood'
team_map['lwu'] = 'Lindenwood'
team_map['mai'] = 'Maine'
team_map['mer'] = 'Mercyhurst'
# team_map['michst a'] = 'Michigan State'
# team_map['minn pa'] = 'Minnesota'
team_map['neu'] = 'Northeastern'
team_map['no dak jamernik'] = 'North Dakota'
team_map['oh'] = 'Miami'
# team_map['omaha van'] = 'Omaha'
team_map['pri'] = 'Princeton'
team_map['prince'] = 'Princeton'
# team_map['pri de la'] = 'Princeton'
# team_map['prince de la'] = 'Princeton'
team_map['qui'] = 'Quinnipiac'
team_map['scs'] = 'St. Cloud State'
team_map['sd'] = 'Augustana'
team_map['shu'] = 'Sacred Heart'
team_map['slu'] = 'St. Lawrence'
team_map['stc'] = 'Stonehill'
team_map['sup'] = 'Lake Superior'
team_map['u-m'] = 'Michigan'
team_map['uma'] = 'Massachusetts'
team_map['umd'] = 'Minnesota Duluth'
team_map['und'] = 'Notre Dame'
team_map['uni'] = 'Union'
team_map['ust'] = 'St. Thomas'
# team_map['vermnt la'] = 'Vermont'
team_map['wis'] = 'Wisconsin'
team_map['wmu'] = 'Western Michigan'
team_map['yal'] = 'Yale'











# team_map

In [59]:
## Standardize Team names in Primary Team column

# Firs by replaceing abbreviations with full names
def standardize_primary_team(df, team_map):
    """
    Standardizes the 'Primary_team' column using the provided team_map.
    """
    # Convert to lowercase and map to standardized names
    df['Primary_team'] = df['Primary_team'].str.lower().map(team_map).fillna(df['Primary_team'])

    return df

# Run the function to standardize the 'Primary_team' column
final_pbp_df = standardize_primary_team(final_pbp_df, team_map)

In [60]:
## Standardize Team names in Primary Team column PART 2
## Deal with teams with two slightly different names and standardize to CHN names
# "Alaska Fairbanks" = "Alaska"
# "Arizona St" = "Arizona State"
# "Bemidji St" = "Bemidji State"
# "Boston U" = "Boston University"
# "Colorado Col" = "Colorado College"
# "Ferris St" = "Ferris State"
# Lake Superior St" = "Lake Superior"
# "Mass. Lowell" = "Mass Lowell"
# "Minnesota St" = "Minnesota State"
# Northern Mich = "Northern Michigan"
# Ohio St = "Ohio State"
# Penn St = "Penn State"
# Western Mich = "Western Michigan"
# St Cloud State = "St. Cloud State"

# Function to make the substitutions
def standardize_team_names(df):
    """
    Standardizes team names in the 'Primary_team' column.
    """
    # Define team name substitutions
    team_substitutions = {
        'Alaska Fairbanks': 'Alaska',
        'Arizona St': 'Arizona State',
        'Bemidji St': 'Bemidji State',
        'Boston U': 'Boston University',
        'Colorado Col': 'Colorado College',
        'Ferris St': 'Ferris State',
        'Lake Superior St': 'Lake Superior',
        'Mass. Lowell': 'Mass Lowell',
        'Minnesota St': 'Minnesota State',
        'Northern Mich': 'Northern Michigan',
        'Ohio St': 'Ohio State',
        'Penn St': 'Penn State',
        'Western Mich': 'Western Michigan',
        'St Cloud State': 'St. Cloud State'
    }

    # Apply the substitutions
    df['Primary_team'] = df['Primary_team'].replace(team_substitutions)

    return df

# Run the function to standardize team names
final_pbp_df = standardize_team_names(final_pbp_df)

    





In [61]:
# final_pbp_df.sample(15)

In [62]:
## Save the dataframe to a CSV file
final_pbp_df.to_csv(os.path.join(temp_folder, output_filename), index=False)
final_pbp_df.to_csv(os.path.join(data_folder, output_filename), index=False)

In [63]:
### Save Updated Schedule DF (With PbP JSONs) to csv to avoid scraping for new tests

updated_schedule_df.to_csv(os.path.join(data_folder, 'schedule_from_ncaa_with_PbP_JSON.csv'), index=False)


In [64]:
### Examine the final dataframe
final_pbp_df.info()

# Value counts
final_pbp_df['Event_type'].value_counts()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227409 entries, 0 to 227408
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Game_ID           227409 non-null  object
 1   Period            227409 non-null  object
 2   Time              227409 non-null  int64 
 3   Description       227409 non-null  object
 4   Score             227409 non-null  object
 5   Event_type        227409 non-null  object
 6   Primary_player    203337 non-null  object
 7   Primary_team      214021 non-null  object
 8   Secondary_player  156638 non-null  object
 9   Secondary_team    0 non-null       object
 10  Outcome           183990 non-null  object
 11  Penalty_duration  7947 non-null    object
 12  Penalty_type      7947 non-null    object
 13  Goal_Conditions   6393 non-null    object
dtypes: int64(1), object(13)
memory usage: 24.3+ MB


Event_type
Shot           116111
Faceoff         67916
Penalty          8194
PP - Start       6972
PP - End         6861
Goal             6145
Other            5955
Goalie Move      5252
Media TO         4003
Name: count, dtype: int64