In [2]:
## New notebook to streamline the process of scraping data from the MHSAA website.
## Scrapes the MHSAA website using the list of valid school IDs.

# Dependencies

import pandas as pd
import numpy as np
import requests
import json
import re
import time

In [3]:
# Read the CSV that records which school IDs are valid and narrow it down in a
# single chained pipeline (the steps run top to bottom).
df = (
    pd.read_csv('school_data_scrape_READ.csv')
    # Rows with no PopularName are discarded
    .dropna(subset=['PopularName'])
    # Keep only rows whose IsMiddleSchool flag compares equal to False
    .loc[lambda frame: frame['IsMiddleSchool'] == False]
    # The flag column is not needed once the filter has been applied
    .drop(columns=['IsMiddleSchool'])
    # Restrict to rows whose State is MI
    .loc[lambda frame: frame['State'] == 'MI']
    # ClassificationCalculationSteps is exposed as ClassificationNotes
    .rename(columns={'ClassificationCalculationSteps': 'ClassificationNotes'})
)

# Plain Python list of the SchoolId values that survived the filters
valid_school_ids = df.loc[:, 'SchoolId'].tolist()

# How many IDs were kept (only displayed when run as the last expression)
len(valid_school_ids)

# Peek at the first ten IDs
valid_school_ids[:10]

[3834, 3835, 3836, 3837, 3838, 3839, 3840, 3841, 3842, 3843]

In [4]:


## Create the 2 functions that perform the parse

# Function to parse game data
def parse_game_data(data, game):
    """Flatten one contest of a schedule payload into a single flat record.

    Parameters
    ----------
    data : dict
        The decoded schedule payload. ``data["Record"]["TeamName"]``,
        ``data["Record"]["SchoolSportTeamId"]`` and ``data["League"]["Name"]``
        are read.
    game : dict
        One contest entry. ``Opponents``, ``StartDate``, ``TimeText``,
        ``HomeAwayCode``, ``ContestLocationLink`` and ``ScoreText`` must be
        present; the remaining fields are read with ``.get`` and default to
        ``None``.

    Returns
    -------
    dict
        Record with the keys ``teamName``, ``teamId``, ``leagueName``,
        ``opponentName``, ``opponentId``, ``gameDate``, ``gameTime``,
        ``homeOrAway``, ``location``, ``teamScore``, ``opponentScore``,
        ``notes``, ``contestType``, ``seasonType``, ``postSeasonInfo``,
        ``tournamentInfo``, ``tournamentName``, ``tournamentType``,
        ``contestName`` and ``seasonTypeCode`` (in that order).

    Raises
    ------
    KeyError
        If one of the required keys listed above is missing.
    """
    # Team / league details come from the payload, not from the contest
    team_record = data["Record"]
    parsed = {
        "teamName": team_record["TeamName"],
        "teamId": team_record["SchoolSportTeamId"],
        "leagueName": data["League"]["Name"],
    }

    # Only the first listed opponent is used; an empty list yields None/None
    opponents = game["Opponents"]
    if opponents:
        first_opponent = opponents[0]
        opponent_id = first_opponent["SportTeamId"]
        opponent_name = first_opponent["PopularName"]
    else:
        opponent_id = opponent_name = None
    parsed["opponentName"] = opponent_name
    parsed["opponentId"] = opponent_id

    # Required contest fields, copied verbatim under their output names
    required_fields = (
        ("gameDate", "StartDate"),
        ("gameTime", "TimeText"),
        ("homeOrAway", "HomeAwayCode"),
        ("location", "ContestLocationLink"),
    )
    for out_key, src_key in required_fields:
        parsed[out_key] = game[src_key]

    # Scores and notes stay None unless ScoreText starts with "<digits>-<digits>"
    parsed.update(teamScore=None, opponentScore=None, notes=None)
    score_text = game["ScoreText"]
    found = re.match(r"(\d+)-(\d+)(.*)", score_text) if score_text else None
    if found is not None:
        parsed["teamScore"] = int(found.group(1))
        parsed["opponentScore"] = int(found.group(2))
        # Whatever follows the score on the same line, minus surrounding spaces
        parsed["notes"] = found.group(3).strip()

    # Optional contest fields; absent keys become None
    optional_fields = (
        ("contestType", "ContestType"),
        ("seasonType", "SeasonType"),
        ("postSeasonInfo", "PostSeasonInfo"),
        ("tournamentInfo", "TournamentInfo"),
        ("tournamentName", "TournamentName"),
        ("tournamentType", "TournamentType"),
        ("contestName", "ContestName"),
        ("seasonTypeCode", "SeasonTypeCode"),
    )
    for out_key, src_key in optional_fields:
        parsed[out_key] = game.get(src_key)

    return parsed


def initialize_parsing(base_url, school_id, url_end, timeout=30):
    """Fetch one school's schedule and parse every contest in it.

    The request URL is ``base_url + str(school_id) + url_end``.

    Parameters
    ----------
    base_url : str
        URL prefix that ends right before the school ID.
    school_id : int or str
        School ID inserted into the URL.
    url_end : str
        URL suffix appended after the school ID.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a single stalled
        connection would block the whole scrape indefinitely. Defaults to
        30 seconds.

    Returns
    -------
    list of dict
        One ``parse_game_data`` record per entry in the payload's
        ``Contests`` list. Empty when the status code is not 200, when the
        body is not a JSON object, or when ``Contests`` is missing or empty.

    Raises
    ------
    Exception
        Errors raised by ``requests.get`` (including timeouts), by
        ``response.json()`` and by ``parse_game_data`` are not caught here.
    """
    full_url = base_url + str(school_id) + url_end
    response = requests.get(full_url, timeout=timeout)

    # Anything other than HTTP 200 is treated as "no data for this school"
    if response.status_code != 200:
        return []

    data = response.json()

    # A body such as `null` or a bare list has no contests to offer; guard so
    # the membership test below cannot fail on a non-mapping payload.
    if not isinstance(data, dict):
        return []

    contests = data.get('Contests')
    if not contests:
        return []

    return [parse_game_data(data, game) for game in contests]


In [5]:
import os
import time
from datetime import datetime

import pandas as pd
from tqdm import tqdm

## Target years to scrape
# Full list kept for reference when backfilling:
# years = [2022, 2021, 2020, 2019, 2018, 2017,  2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009 ]
years = [2023]

## URL pieces: base_url + <school id> + mid_url + <year> + end_url
base_url = 'https://my.mhsaa.com/DesktopModules/MHSAA-Async-SportTeamSchedule/Endpoint.ashx?&method=schedules&orgID='
mid_url = '&sportTypeCode=BA&gender=M&level=V&year='
end_url = '&userid=-1'

## Directory that receives one CSV per scraped year.
# Created up front so a missing folder cannot throw away a finished scrape
# when to_csv runs at the end of a multi-minute loop.
output_dir = "TEMP/clean_tables"
os.makedirs(output_dir, exist_ok=True)

# Timer for the total execution time
total_start_time = time.time()

# One DataFrame per requested year, keyed by the year sent to the endpoint
df_dict = {}

# valid_school_ids (the list of school IDs) and initialize_parsing are defined
# in earlier cells.
for year in years:
    # Timer for this year's execution time
    year_start_time = time.time()

    parsed_data = []

    # tqdm renders a progress bar over the school IDs
    for school_id in tqdm(valid_school_ids, desc=f'Processing Year: {year}', unit='school'):
        try:
            parsed_data += initialize_parsing(base_url, school_id, mid_url + str(year) + end_url)
        except Exception as e:
            # Best effort: report the failing school and move on to the next
            print(f"An error occurred with school_id: {school_id}. Error details: {str(e)}")
            continue

    # Store the parsed data as a DataFrame
    df = pd.DataFrame(parsed_data)

    # Keep an independent copy in the dictionary
    df_dict[year] = df.copy()

    # NOTE(review): the timestamp is generated but is not part of the file
    # name below, so re-running a year overwrites its CSV — confirm intent.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    # The endpoint is queried with `year`, while the file name and the summary
    # lines use year + 1.
    df.to_csv(f"{output_dir}/_NEW_SCRAPE_{year+1}.csv", index=False)

    # Sample the clock once so minutes and seconds describe the same instant
    year_minutes, year_seconds = divmod(int(time.time() - year_start_time), 60)
    print(f"Year {year+1} took {year_minutes} minutes and {year_seconds} seconds")

    # A year with no contests yields a DataFrame without any columns; guard
    # the lookup so the summary prints 0 instead of raising KeyError.
    if 'teamName' in df.columns:
        valid_record_count = int(df['teamName'].notnull().sum())
    else:
        valid_record_count = 0
    print(f"Number of valid records: {valid_record_count}")
    # Total number of contests for the year
    print(f"Total number of contests: {len(df)}")

# Total execution time, again from a single clock sample
total_minutes, total_seconds = divmod(int(time.time() - total_start_time), 60)
print(f"Total time taken: {total_minutes} minutes and {total_seconds} seconds")


Processing Year: 2022: 100%|██████████| 1027/1027 [08:18<00:00,  2.06school/s]


Year 2023 took 8 minutes and 18 seconds
Number of valid records: 22150
Total number of contests: 22150


Processing Year: 2021: 100%|██████████| 1027/1027 [07:31<00:00,  2.27school/s]


Year 2022 took 7 minutes and 32 seconds
Number of valid records: 21119
Total number of contests: 21119


Processing Year: 2020: 100%|██████████| 1027/1027 [07:28<00:00,  2.29school/s]


Year 2021 took 7 minutes and 28 seconds
Number of valid records: 18384
Total number of contests: 18384


Processing Year: 2019: 100%|██████████| 1027/1027 [06:13<00:00,  2.75school/s]


Year 2020 took 6 minutes and 14 seconds
Number of valid records: 12686
Total number of contests: 12686


Processing Year: 2018: 100%|██████████| 1027/1027 [07:13<00:00,  2.37school/s]


Year 2019 took 7 minutes and 14 seconds
Number of valid records: 19195
Total number of contests: 19195


Processing Year: 2017: 100%|██████████| 1027/1027 [07:22<00:00,  2.32school/s]


Year 2018 took 7 minutes and 22 seconds
Number of valid records: 18198
Total number of contests: 18198


Processing Year: 2016: 100%|██████████| 1027/1027 [07:18<00:00,  2.34school/s]


Year 2017 took 7 minutes and 18 seconds
Number of valid records: 18828
Total number of contests: 18828


Processing Year: 2015: 100%|██████████| 1027/1027 [07:03<00:00,  2.43school/s]


Year 2016 took 7 minutes and 3 seconds
Number of valid records: 16945
Total number of contests: 16945


Processing Year: 2014: 100%|██████████| 1027/1027 [07:26<00:00,  2.30school/s]


Year 2015 took 7 minutes and 27 seconds
Number of valid records: 17967
Total number of contests: 17967


Processing Year: 2013: 100%|██████████| 1027/1027 [06:55<00:00,  2.47school/s]


Year 2014 took 6 minutes and 55 seconds
Number of valid records: 17203
Total number of contests: 17203


Processing Year: 2012: 100%|██████████| 1027/1027 [06:34<00:00,  2.60school/s]


Year 2013 took 6 minutes and 35 seconds
Number of valid records: 14334
Total number of contests: 14334


Processing Year: 2011: 100%|██████████| 1027/1027 [06:24<00:00,  2.67school/s]


Year 2012 took 6 minutes and 24 seconds
Number of valid records: 11902
Total number of contests: 11902


Processing Year: 2010: 100%|██████████| 1027/1027 [06:07<00:00,  2.80school/s]


Year 2011 took 6 minutes and 7 seconds
Number of valid records: 7410
Total number of contests: 7410


Processing Year: 2009: 100%|██████████| 1027/1027 [05:42<00:00,  2.99school/s]

Year 2010 took 5 minutes and 43 seconds
Number of valid records: 4830
Total number of contests: 4830
Total time taken: 97 minutes and 45 seconds



