# Notebook to scrape Michigan (MI) high school conference and school info from Wikipedia, to integrate with the baseball field measurements

In [1]:
## Dependencies: requests, bs4

import time
import urllib.request
from io import StringIO
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
## Web Addresses

top_page_url = 'https://en.wikipedia.org/wiki/List_of_Michigan_High_School_Association_member_conferences'

# Number of leading rows (after filtering) that are kept as conference entries.
# This is a hand-picked cutoff for the page as it was when the notebook was
# written; re-check it if the Wikipedia page changes.
N_CONFERENCE_ROWS = 45

# BeautifulSoup reads the whole response while parsing, so the connection can
# be closed as soon as the soup has been built.
with urlopen(top_page_url) as top_html:
    soup = BeautifulSoup(top_html, 'html.parser')

h2 = soup.find_all('h2')      # not used by the visible cells; kept for any cell that references it
line = soup.find_all('li')


### Get table of all conferences with names and links to conference pages

conferences = []              # not used by the visible cells; kept for any cell that references it
links = []
names = []

for each in line:
    # Only the first <a> inside each list item is considered.
    anchor = each.find('a')
    if anchor is not None:
        names.append(anchor.get_text())
        links.append(anchor.get('href'))   # None when the anchor has no href

# Convert to dataframe
df = pd.DataFrame({'name': names, 'link': links})

### Clean up dataframe

## If link doesn't start with /wiki/ then it's not a conference page, remove these rows.
## startswith (rather than contains) also drops absolute URLs that merely contain
## "/wiki/", which could not be appended to https://en.wikipedia.org later on.
## na=False treats anchors without an href as non-matching instead of producing
## NaN entries in the mask.

df = df[df['link'].str.startswith('/wiki/', na=False)]

## Reindex dataframe

df = df.reset_index(drop=True)


### Drop all rows that don't have a conference name
## i.e. everything after the first N_CONFERENCE_ROWS rows

df = df.drop(df.index[N_CONFERENCE_ROWS:])


In [3]:
### Use the list of conferences and relative links to get a table of all the teams in the conference

## Link to conference pages will be built like this

## https://en.wikipedia.org + link

## Example: https://en.wikipedia.org/wiki/Big_Sky_Conference

## NOTE: the last statement of this cell rebinds `df` from the conference index
## (name/link) to the combined table, so the cell that builds the conference
## index has to be re-run before this cell can be run a second time.

dfs = []

for conference_name, link in zip(df['name'], df['link']):
    try:
        conference_url = 'https://en.wikipedia.org' + link

        # Close each HTTP response once its page has been parsed.
        with urlopen(conference_url) as conference_html:
            soup = BeautifulSoup(conference_html, 'html.parser')

        # Only the first table carrying the "wikitable" class is used.
        table = soup.find('table', {'class': 'wikitable'})

        if table is None:
            print(f'No wikitable found for {conference_name} ({link})')
            continue

        ## create a dataframe from the table with the name of the conference added as a column
        ## (the HTML is wrapped in StringIO because passing literal HTML strings to
        ## read_html is deprecated)

        conference_df = pd.read_html(StringIO(str(table)))[0]

        conference_df['conference'] = conference_name

        ## append the dataframe to a list

        dfs.append(conference_df)
    ### print out the conference name if there is an error
    except Exception as exc:
        # Report the failure and move on to the next conference. `Exception`
        # (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f'Failed to scrape {conference_name} ({link}): {exc!r}')
        continue


### Concatenate all the dataframes in the list into one dataframe

df = pd.concat(dfs)

df.info()
    




<class 'pandas.core.frame.DataFrame'>
Int64Index: 696 entries, 0 to 20
Data columns (total 90 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   School                                        308 non-null    object 
 1   Mascot                                        205 non-null    object 
 2   Colors                                        73 non-null     object 
 3   Class (enrollment)                            6 non-null      object 
 4   City                                          6 non-null      object 
 5   County                                        186 non-null    object 
 6   Year joined                                   6 non-null      float64
 7   Previous league                               16 non-null     object 
 8   conference                                    480 non-null    object 
 9   (Team, Blue Division)                         42 non-null     obje

## Wednesday 12-15 work below


In [None]:
### Output csv to review the data

# Make sure the output folder exists first; writing into a missing directory fails.
Path('TEMP').mkdir(parents=True, exist_ok=True)

df.to_csv('TEMP/mhsaa_conferences.csv', index=False)

In [4]:
### open the manually cleaned conference list and clean up school names to make them easier to match

# This CSV is produced by hand, so it must already exist before this cell runs.
cleaned_conferences_csv = 'TEMP/temp_cleaned_conferences.csv'
test_df = pd.read_csv(cleaned_conferences_csv)

# Column / dtype / non-null summary of the loaded file.
test_df.info()

FileNotFoundError: [Errno 2] No such file or directory: 'TEMP/temp_cleaned_conferences.csv'

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### test scrape of the CAAC conference

### download the page

test_url = 'https://en.wikipedia.org/wiki/Capital_Area_Activities_Conference'

# The page is parsed from `response.text` in the parsing cell that follows.
# Parsing `test_url` itself would only parse the URL string, not the page, so
# no soup is built here.

# Not referenced by the visible cells, which look tables up by the plain
# 'wikitable' class.
# NOTE(review): 'jquery-tablesorter' looks like a class added client-side by
# JavaScript, so it may be absent from the raw HTML; confirm before matching on it.
table_class='wikitable sortable jquery-tablesorter'

# requests does not time out by default; cap the wait so the cell cannot hang.
response = requests.get(test_url, timeout=30)
print(response.status_code)


In [None]:
#### Parse the data from html

# Build the parse tree from the body of the downloaded response.
page_html = response.text
soup = BeautifulSoup(page_html, 'html.parser')

# First <table> on the page whose class attribute includes "wikitable".
table_1 = soup.find('table', attrs={'class': 'wikitable'})

# Bare expression so the notebook displays the matched table.
table_1

In [None]:
### read the html into a dataframe

# read_html returns a list of DataFrames. It gets its own name so that `df`
# (used by the CSV export cell) is not rebound to a list. The HTML is wrapped
# in StringIO because passing literal HTML strings to read_html is deprecated.
blue_tables = pd.read_html(StringIO(str(table_1)))
blue_df = blue_tables[0]
blue_df.info()

## Output the data to a csv file
# Make sure the output folder exists first; writing into a missing directory fails.
Path('TEMP').mkdir(parents=True, exist_ok=True)
blue_df.to_csv('TEMP/blue.csv')

In [None]:
### Test scrape the Catholic High School League

test_url = 'https://en.wikipedia.org/wiki/Catholic_High_School_League'

# requests does not time out by default; cap the wait so the cell cannot hang.
response = requests.get(test_url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')


# First <table> on the page whose class attribute includes "wikitable".
table = soup.find('table',{'class':'wikitable'})

# read_html returns a list of DataFrames. It gets its own name so that `df`
# (used by the CSV export cell) is not rebound to a list. The HTML is wrapped
# in StringIO because passing literal HTML strings to read_html is deprecated.
catholic_tables = pd.read_html(StringIO(str(table)))
catholic_df = catholic_tables[0]
catholic_df.info()

## Output to check data
# Make sure the output folder exists first; writing into a missing directory fails.
Path('TEMP').mkdir(parents=True, exist_ok=True)
catholic_df.to_csv('TEMP/catholic.csv')