In [1]:
## Dependencies

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
from area import area

# ETL Notebook
## New notebook to continue to streamline ETL of baseball field data

### incorporate the working parts of the kml parse book
* leverage the better organization and tagging of the folders to populate the level field in the resulting dataframe
    * should be able to break fields down into following categories based on folder name along
        * high_school (HS or high school in name)
        * youth (youth)
        * college (college)
        * pro (pro)
        * muni (muni) - for municipally owned or public park fields that don't have an active pro team
        * MLB (mlb) - just 3 parrks for now. comerica, coors, and fenway
        

In [2]:
#### Load data from kml file exported by Google Earth

file_path = ('data/kml/ballparks.kml')

with open(file_path) as file:

    xml_data = file.read()



# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')

folders = soup.Document.Folder
list = soup.Document.Folder.find_all('Folder')
# layers = soup.Document.Folder.Folder
# polygons = soup.Document.Folder.Placemark.Polygon

In [3]:
## Create a dataframe to hold the data parsed from xml
df = pd.DataFrame(columns=['field', 'foul', 'fop'])


## Loop through the folders and extract the data
i = 0   

for i in range(len(list)):

    folders = list[i]
    field_name = folders.find('name').text
    foul = folders.find_all('coordinates')[0].text
    fop = folders.find_all('coordinates')[1].text

    row = {
        'field': field_name,
        'foul': foul,
        'fop': fop
    }

    i+=1

    df = df.append(row, ignore_index=True)

  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append

In [4]:

## Cleaning
# remove new line and and space characters from coordinates
df = df.replace(r'\n','', regex=True) 
df = df.replace(r'\t','', regex=True) 

## Drop any duplicate rows
df = df.drop_duplicates(subset=['field'], keep='first')

## Drop and rows with empty fields
df = df[(df != 0).all(1)]

# ## remove any numberic characters and . from field names
# # There shouldn't be any anymore because of the validation I did before exporting kml
# df['field'] = df['field'].str.replace(r'\d+', '')

In [5]:
## Parse field names to get level column using regex
import re
re_mlb = re.compile(r'mlb', re.IGNORECASE)
re_pro = re.compile(r'pro', re.IGNORECASE)
re_college = re.compile(r'college', re.IGNORECASE)
re_youth = re.compile(r'youth', re.IGNORECASE)
re_muni = re.compile(r'muni', re.IGNORECASE)

df['level'] = df['field'].apply(lambda x: 'mlb' 
        if re_mlb.search(x) else 'pro' 
        if re_pro.search(x) else 'college' 
        if re_college.search(x) else 'youth' 
        if re_youth.search(x) else 'muni' 
        if re_muni.search(x) else 'high_school')

# clean up the field names
# remove the level from the field name
df['field'] = df['field'].str.replace(r'MLB', '')
df['field'] = df['field'].str.replace(r'pro', '')
df['field'] = df['field'].str.replace(r'college', '')
# remove - from end of field name
df['field'] = df['field'].str.replace(r'- $', '')

## Output test csv
# df.to_csv('TEMP/level2_tost.csv', index=False)

  df['field'] = df['field'].str.replace(r'- $', '')


In [6]:
## Transform DATA
# taken from clean_notebook_parse

## Create a (lat, long) pair for home plate 
df['home_plate'] = df['foul'].str.split(' ').str[0]

## Apply lambda function to format polygon coordinates and pass to the area function
# #OPutput in square meters
df['foul_area'] = df['foul'].apply(lambda x: area({'type': 'Polygon', 'coordinates': [[tuple(map(float, coord.split(','))) for coord in x.split()]]}))
df['fop_area'] = df['fop'].apply(lambda x: area({'type': 'Polygon', 'coordinates': [[tuple(map(float, coord.split(','))) for coord in x.split()]]}))

# Convert the area to square feet
df['foul_ft'] = df['foul_area'].apply(lambda x: x*10.7639)
df['fop_ft'] = df['fop_area'].apply(lambda x: x*10.7639)
# round the area to an integer
df['foul_ft'] = df['foul_ft'].round(0)
df['fop_ft'] = df['fop_ft'].round(0)

# drop the square meter area columns
df = df.drop(['foul_area', 'fop_area'], axis=1)

# drop the polygon coordinates
df = df.drop(['foul', 'fop'], axis=1)

# calculate the ratio of foul ground to total area and convert to percentage
df['foul_pct'] = (df['foul_ft']/(df['fop_ft']+df['foul_ft']))*100
# round the percentage to 2 decimal places
df['foul_pct'] = df['foul_pct'].round(2)

### Split the home_plate coordinates into lat and long columns
df['h_lon'] = df['home_plate'].str.split(',').str[0]
df['h_lat'] = df['home_plate'].str.split(',').str[1]

## Drop the home_plate column
df = df.drop(['home_plate'], axis=1)

# # Outputs a clean CSV with area clean names, correct levels and calculations
# df.to_csv('TEMP/tost_with_area.csv', index=False)





## Working Above - can consol

### Adding fuzzy matching blocks to connect to enrollment data, ect

In [7]:

### Read the enrollment table from MHSAA website - 2022 enrollment
mhsaa_df = pd.read_excel('data\collected.xlsx', sheet_name='2022_enrollment')

# rename the columns
mhsaa_df.rename(columns={'enrollment_total':'students',
       'enrollment_classification':'enrollment'}, inplace=True)


# select just the high school fields
hs_df = df[df['level'] == 'high_school']
other_df = df[df['level'] != 'high_school']


# ## export the mhsaa_df to csv
# mhsaa_df.to_csv('TEMP/mhsaa_df.csv', index=False)




In [8]:
mhsaa_df.head()

Unnamed: 0,school_id,school_name,students,enrollment,division,level
0,9448,Macomb - Dakota HS,2876,2876,A,high_school
1,2237,Grand Blanc HS,2812,2812,A,high_school
2,5792,East Kentwood HS,2678,2678,A,high_school
3,6980,Dearborn - Fordson HS,2613,2613,A,high_school
4,3020,Shelby Township - Utica Eisenhower HS,2555,2555,A,high_school


In [8]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

hs_names = hs_df['field'].tolist()
enroll_names = mhsaa_df['school_name'].tolist()
id_list = mhsaa_df['school_id'].tolist()

# set the treshold for the fuzzy match
treshold = 90
mat1 = []


# output a list of the school_id from the mhsaa_df based on the fuzzy match
for i in hs_names:
    mat1.append(process.extract(i, enroll_names, limit=1, scorer=fuzz.token_set_ratio))

hs_df['match'] = mat1
# mhsaa_df['match'] = mat1

# Insert the ammended high school data frame (the one that I manuall confirmed the matches)
hs_df = pd.read_csv('data\school_info\match_df_manual_check.csv')


## Get the school name from the match column
hs_df['match'] = hs_df['match'].str.split(',').str[0]
# remove the single quotes from the match column
hs_df['match'] = hs_df['match'].str.replace("'", "")
# remove [ and ( from the match column
hs_df['match'] = hs_df['match'].str.replace("[", "")
hs_df['match'] = hs_df['match'].str.replace("(", "")

print(hs_df['match'])

# create new dataframe by merging the two dataframes
new_df = pd.merge(hs_df, mhsaa_df, left_on='match', right_on='school_name', how='left')

# new_df.head()

0           Carleton - Airport HS
1          Livonia - Stevenson HS
2                      Algonac HS
3                   Allen Park HS
4                             NaN
                  ...            
150    Ottawa Lake - Whiteford HS
151                  Whitehall HS
152              Whitmore Lake HS
153        Whittemore-Prescott HS
154                Williamston HS
Name: match, Length: 155, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hs_df['match'] = mat1
  hs_df['match'] = hs_df['match'].str.replace("[", "")
  hs_df['match'] = hs_df['match'].str.replace("(", "")


In [9]:
### Add the non high schools back to the dataframe

df = new_df.append(other_df, ignore_index=True)

  df = new_df.append(other_df, ignore_index=True)


In [10]:
### Output new_df to csv

df.to_csv('data/viz/viz_book_area.csv', index=False)


### A few small problems - Midland Bullock Creek and Grosse Point South didn't import
### the grand Ledge Youth Fields and Moose lodge got included in the high school fields

# The columns are messed up, multiple columns for the level data, ect
## want to leave field name as it is but use the match column as 'team_name'"geometry workbook.ipynb"
# ## populate 'team name' coulmn with values for non high schools
