In [8]:
## New notebook to streamline the process of scraping data from the MHSAA website

# Dependencies

import pandas as pd
import numpy as np
import requests
import json
import re
import time

## Create the 2 functions that perform the parse

# Function to parse game data
def parse_game_data(data, game):
    """Flatten one contest of a team's schedule payload into a single row.

    Args:
        data: Schedule payload for one team. ``data["Record"]`` must contain
            ``TeamName`` and ``SchoolSportTeamId`` and ``data["League"]`` must
            contain ``Name``.
        game: One contest entry. ``Opponents``, ``StartDate``, ``TimeText``,
            ``HomeAwayCode``, ``ContestLocationLink`` and ``ScoreText`` are
            read by subscript (so they must be present); the remaining
            descriptive fields are read with ``.get`` and default to ``None``.

    Returns:
        A dict with one entry per output column. ``teamScore`` /
        ``opponentScore`` are ints and ``notes`` is a stripped string when
        ``ScoreText`` starts with ``<digits>-<digits>``; otherwise all three
        are ``None``.
    """
    # Team / league identity comes from the payload, not from the contest
    record = data["Record"]
    row = {
        "teamName": record["TeamName"],
        "teamId": record["SchoolSportTeamId"],
        "leagueName": data["League"]["Name"],
    }

    # Only the first listed opponent is used; an empty list yields None values
    opponents = game["Opponents"]
    if opponents:
        primary = opponents[0]
        opp_id, opp_name = primary["SportTeamId"], primary["PopularName"]
    else:
        opp_id = opp_name = None
    row["opponentName"] = opp_name
    row["opponentId"] = opp_id

    # Scheduling and venue details are copied through unchanged
    row["gameDate"] = game["StartDate"]
    row["gameTime"] = game["TimeText"]
    row["homeOrAway"] = game["HomeAwayCode"]
    row["location"] = game["ContestLocationLink"]

    # Score text is expected to begin with "<team>-<opponent>"; whatever
    # follows on the same line is kept as free-form notes
    raw_score = game["ScoreText"]
    parsed = re.match(r"(\d+)-(\d+)(.*)", raw_score) if raw_score else None
    if parsed is None:
        row.update(teamScore=None, opponentScore=None, notes=None)
    else:
        row["teamScore"] = int(parsed.group(1))
        row["opponentScore"] = int(parsed.group(2))
        row["notes"] = parsed.group(3).strip()

    # Optional descriptive fields: (output column, key in the contest entry)
    optional_fields = (
        ("contestType", "ContestType"),
        ("seasonType", "SeasonType"),
        ("postSeasonInfo", "PostSeasonInfo"),
        ("tournamentInfo", "TournamentInfo"),
        ("tournamentName", "TournamentName"),
        ("tournamentType", "TournamentType"),
        ("contestName", "ContestName"),
        ("seasonTypeCode", "SeasonTypeCode"),
    )
    for column, source_key in optional_fields:
        row[column] = game.get(source_key)

    return row


def initialize_parsing(base_url, team_id_start, team_id_end, url_end, timeout=30):
    """Fetch the schedule for every team ID in a range and parse its contests.

    For each ID in ``team_id_start..team_id_end`` (inclusive) the URL
    ``base_url + str(id) + url_end`` is requested. Responses that are not
    HTTP 200, are not valid JSON, or carry no non-empty ``Contests`` list are
    skipped. A request that fails outright (connection error, timeout) is
    reported with ``print`` and skipped so that one bad ID does not discard
    the results already collected.

    Args:
        base_url: URL prefix that the team ID is appended to.
        team_id_start: First team ID to request (inclusive).
        team_id_end: Last team ID to request (inclusive).
        url_end: URL suffix appended after the team ID.
        timeout: Seconds to wait on each request before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        A list with one ``parse_game_data`` dict per contest found.
    """
    parsed_data = []

    # One Session reuses the underlying connection across all the requests
    with requests.Session() as session:
        for team_id in range(team_id_start, team_id_end + 1):
            full_url = f"{base_url}{team_id}{url_end}"

            try:
                response = session.get(full_url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Skipping team {team_id}: request failed ({exc})")
                continue

            # Only proceed if the response status code is 200 (HTTP OK)
            if response.status_code != 200:
                continue

            try:
                data = response.json()
            except ValueError:
                # requests raises a ValueError subclass when the body is not JSON
                print(f"Skipping team {team_id}: response was not valid JSON")
                continue

            # The site answers even for IDs without data, so require a JSON
            # object with a non-empty 'Contests' list before parsing
            contests = data.get("Contests") if isinstance(data, dict) else None
            if not contests:
                continue

            parsed_data.extend(parse_game_data(data, game) for game in contests)

    return parsed_data

In [10]:
import pandas as pd
from tqdm import tqdm
import time
from datetime import datetime

import os  # needed only by this cell, to create the output directory

## Target years to scrape
# Each value is sent as the `year` URL parameter; output file names and log
# messages are labelled with year+1.
years = [2022, 2021, 2019, 2018, 2017,  2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009 ]

## url builder
# Full URL = base_url + <team ID> + mid_url + <year> + end_url
base_url = 'https://my.mhsaa.com/DesktopModules/MHSAA-Async-SportTeamSchedule/Endpoint.ashx?&method=schedules&orgID='
mid_url = '&sportTypeCode=BA&gender=M&level=V&year='
end_url = '&userid=-1'

# Start of team IDs
team_id_start = 3000
# End of team IDs
team_id_end = 3999

# Directory the per-year CSV files are written to. Create it before scraping:
# DataFrame.to_csv does not create missing directories, and failing on the
# first save would abort the run only after a full year had been downloaded.
output_dir = "TEMP"
os.makedirs(output_dir, exist_ok=True)

# Initialize a timer for the total execution time
total_start_time = time.time()

# Create a dictionary to store dataframes, keyed by the requested year
df_dict = {}

# Loop over the years
for year in years:
    # Initialize a timer for the year's execution time
    year_start_time = time.time()

    # Scrape every team ID in the range for the given year
    parsed_data = initialize_parsing(base_url, team_id_start, team_id_end, mid_url + str(year) + end_url)

    # Store the parsed data as a DataFrame
    df = pd.DataFrame(parsed_data)

    # Keep an independent copy in the dictionary
    df_dict[year] = df.copy()

    # Timestamp keeps repeated runs from overwriting earlier files
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    # Save the DataFrame to a CSV file, using year+1 in the filename
    df.to_csv(f"{output_dir}/{year+1}_{timestamp}.csv", index=False)

    # Measure elapsed time once so minutes and seconds come from the same reading
    year_minutes, year_seconds = divmod(int(time.time() - year_start_time), 60)
    print(f"Year {year+1} took {year_minutes} minutes and {year_seconds} seconds")

    # Print the total number of contests for the year
    print(f"Total number of contests: {len(df)}")

# Print the total execution time
total_minutes, total_seconds = divmod(int(time.time() - total_start_time), 60)
print(f"Total time taken: {total_minutes} minutes and {total_seconds} seconds")




Year 2023 took 5 minutes and 12 seconds
Total number of contests: 4298
Year 2022 took 4 minutes and 56 seconds
Total number of contests: 4081
Year 2001 took 4 minutes and 30 seconds
Total number of contests: 0
Year 2019 took 4 minutes and 42 seconds
Total number of contests: 3776
Year 2018 took 4 minutes and 52 seconds
Total number of contests: 3500
Year 2017 took 4 minutes and 41 seconds
Total number of contests: 3705
Year 2016 took 6 minutes and 58 seconds
Total number of contests: 3303


In [None]:
## Target years to scrape
# NOTE(review): this list omits 2019 and stops at 2009, while the note below
# mentions going back to 2008 — confirm which years are intended.
years = [2021, 2020, 2018, 2017,  2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009 ]

# The amount of data available seems to shrink the further back the years go.
# Before 2008 the URLs are still valid, but there seems to be little or no data.

# For now, just try to go back to 2008.



In [None]:
## URL builder
# A full request URL is assembled as: base_url + <team ID> + mid_url + <year> + end_url

base_url = 'https://my.mhsaa.com/DesktopModules/MHSAA-Async-SportTeamSchedule/Endpoint.ashx?&method=schedules&orgID='
# Then the team ID

mid_url = '&sportTypeCode=BA&gender=M&level=V&year='

# Then the year (the year corresponds to the start of the school year: 2021 will return the 2021-2022 school year)

end_url = '&userid=-1'


# Most if not all schools in Michigan have IDs from 3000 to 3999

# There are gaps in the IDs
# There may also be some IDs for out-of-state schools outside of that range
# The team ID identifies the same school for all years

# A challenge when testing for valid IDs is that the site will return a response even if there is no data:
# it returns a record, but the record is empty

## Using Mason HS (ID #3969) as a test case, data goes back to the 2010 season (2009 school year)

