In [1]:
## Notebook to parse MHSAA Interactive brackets (Basketball)

# Goal: return the basketball team names, IDs, and division assignments of teams
# active in the current year

# Dependencies
import pandas as pd
import os
import requests
import bs4 as BeautifulSoup
from collections import OrderedDict



In [2]:
## Build the list of bracket page URLs

# Example of a complete URL:
# div_1_example = 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/9/SportSeasonId/421658/Classification/1'

# Each URL is assembled as: url_1 + <bracket group> + url_2 + <classification>
url_1 = 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/'
url_2 = '/SportSeasonId/421658/Classification/'

# Values 1-8 fill the BracketGroup slot (one per district)
dist_num = range(1, 9)

# Values 1-4 fill the Classification slot (one per division)
div_num = range(1, 5)

# Every bracket group / division combination, bracket group varying slowest
url_list = [
    url_1 + str(dist) + url_2 + str(div)
    for dist in dist_num
    for div in div_num
]

print(url_list)

len(url_list)

['https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/1/SportSeasonId/421658/Classification/1', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/1/SportSeasonId/421658/Classification/2', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/1/SportSeasonId/421658/Classification/3', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/1/SportSeasonId/421658/Classification/4', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/2/SportSeasonId/421658/Classification/1', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/2/SportSeasonId/421658/Classification/2', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/2/SportSeasonId/421658/Classification/3', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/2/SportSeasonId/421658/Classification/4', 'https://my.mhsaa.com/Sports/MHSAA-Tournament-Brackets/BracketGroup/3/SportSeasonId/421658/Classification/1', 'https://

32

In [3]:
## Create a function to go through each url and return the team names and division

def get_team_names(url):
    """Fetch one bracket page and return its unique school names with their division.

    Parameters
    ----------
    url : str
        Address of a bracket page. The final path segment is used as the
        division label (e.g. ``.../Classification/3`` -> ``'3'``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct school name, in first-seen order, with columns
        ``'School Name'`` and ``'Division'``. ``'Division'`` is stored as a
        string.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # A timeout keeps one stalled page from hanging the whole scrape, and
    # raise_for_status stops an error page from silently yielding zero schools.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup.BeautifulSoup(response.text, 'html.parser')
    school_names_elements = soup.find_all('span', class_='tournschoolhov')

    # Keep only the leading text node of each span. Spans that are empty or
    # whose first child is a nested tag are skipped instead of raising.
    school_names = []
    for element in school_names_elements:
        first_child = element.contents[0] if element.contents else None
        if not isinstance(first_child, str):
            continue
        cleaned = first_child.strip()
        if cleaned:
            school_names.append(cleaned)

    # remove duplicates while preserving the order in which schools first appear
    unique_school_names = list(OrderedDict.fromkeys(school_names))

    df = pd.DataFrame(unique_school_names, columns=['School Name'])

    # Use the whole last path segment rather than only the last character, so a
    # multi-digit classification or a trailing slash is still read correctly.
    df['Division'] = url.rstrip('/').rsplit('/', 1)[-1]

    return df

## Scrape every bracket page: one dataframe per URL
df_list = list(map(get_team_names, url_list))

## Stack the per-page frames and renumber the rows from 0
df = pd.concat(df_list).reset_index(drop=True)



In [4]:
## Inspect the combined dataframe
# df.info() prints the column dtypes and non-null counts
df.info()

# as the last expression in the cell, this renders the first five rows
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   School Name  710 non-null    object
 1   Division     710 non-null    object
dtypes: object(2)
memory usage: 11.2+ KB


Unnamed: 0,School Name,Division
0,South Lyon East,1
1,South Lyon,1
2,Ann Arbor Skyline,1
3,Dexter,1
4,Brighton,1


In [5]:
### Number of schools in each division
# 'Division' is the label get_team_names derives from each page's URL
df['Division'].value_counts()

1    180
3    180
2    175
4    175
Name: Division, dtype: int64

In [6]:
### Save the dataframe to a csv for future use
# index=False leaves the row index out of the file; the path is relative to the working directory
df.to_csv('mhsaa_basketball_school_list_2023.csv', index=False)

In [7]:
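## Scratch (exploration only): get a list of all span elements.
## NOTE: `soup` is local to get_team_names, so rebuild it from a page before uncommenting.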
# span_list = soup.find_all('span')

## View the span list
# span_list




In [8]:
## Looking for these elements
# <span class="tournschoolhov">

# school_names = soup.find_all('span', class_='tournschoolhov')

# for i in range(len(school_names)):
#     print(school_names[i].contents)


## View the tournschoolhov list
# school_names

## TODO: clean up the list
# e.g. drop any special characters from the school names


