In [1]:
## Dependencies

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
from area import area

# ETL Notebook
## New notebook to continue to streamline ETL of baseball field data

### incorporate the working parts of the kml parse book
* leverage the better organization and tagging of the folders to populate the level field in the resulting dataframe
    * should be able to break fields down into following categories based on folder name along
        * high_school (HS or high school in name)
        * youth (youth)
        * college (college)
        * pro (pro)
        * muni (muni) - for municipally owned or public park fields that don't have an active pro team
        * MLB (mlb) - just 3 parrks for now. comerica, coors, and fenway
        

In [2]:
#### Load data from kml file exported by Google Earth

file_path = ('data\kml\My Places_dec_9_22.kml')

with open(file_path) as file:

    xml_data = file.read()



# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')

folders = soup.Document.Folder
list = soup.Document.Folder.find_all('Folder')
layers = soup.Document.Folder.Folder
polygons = soup.Document.Folder.Placemark.Polygon

In [3]:
## Create a dataframe to hold the data parsed from xml
df = pd.DataFrame(columns=['field', 'foul', 'fop'])


## Loop through the folders and extract the data
i = 0   

for i in range(len(list)):

    folders = list[i]
    field_name = folders.find('name').text
    foul = folders.find_all('coordinates')[0].text
    fop = folders.find_all('coordinates')[1].text

    row = {
        'field': field_name,
        'foul': foul,
        'fop': fop
    }

    i+=1

    df = df.append(row, ignore_index=True)

  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append

In [4]:

## Cleaning
# remove new line and and space characters from coordinates
df = df.replace(r'\n','', regex=True) 
df = df.replace(r'\t','', regex=True) 

## Drop any duplicate rows
df = df.drop_duplicates(subset=['field'], keep='first')

## Drop and rows with empty fields
df = df[(df != 0).all(1)]

# ## remove any numberic characters and . from field names
# # There shouldn't be any anymore because of the validation I did before exporting kml
# df['field'] = df['field'].str.replace(r'\d+', '')

In [5]:
## Parse field names to get level column using regex
import re
re_mlb = re.compile(r'mlb', re.IGNORECASE)
re_pro = re.compile(r'pro', re.IGNORECASE)
re_college = re.compile(r'college', re.IGNORECASE)
re_youth = re.compile(r'youth', re.IGNORECASE)
re_muni = re.compile(r'muni', re.IGNORECASE)

df['level'] = df['field'].apply(lambda x: 'mlb' 
        if re_mlb.search(x) else 'pro' 
        if re_pro.search(x) else 'college' 
        if re_college.search(x) else 'youth' 
        if re_youth.search(x) else 'muni' 
        if re_muni.search(x) else 'high_school')

# clean up the field names
# remove the level from the field name
df['field'] = df['field'].str.replace(r'MLB', '')
df['field'] = df['field'].str.replace(r'pro', '')
df['field'] = df['field'].str.replace(r'college', '')
# remove - from end of field name
df['field'] = df['field'].str.replace(r'- $', '')

# ## Output test csv
# df.to_csv('TEMP/test_2.csv', index=False)

  df['field'] = df['field'].str.replace(r'- $', '')


In [6]:
## Transform DATA
# taken from clean_notebook_parse

## Create a (lat, long) pair for home plate 
df['home_plate'] = df['foul'].str.split(' ').str[0]

## Apply lambda function to format polygon coordinates and pass to the area function
# #OPutput in square meters
df['foul_area'] = df['foul'].apply(lambda x: area({'type': 'Polygon', 'coordinates': [[tuple(map(float, coord.split(','))) for coord in x.split()]]}))
df['fop_area'] = df['fop'].apply(lambda x: area({'type': 'Polygon', 'coordinates': [[tuple(map(float, coord.split(','))) for coord in x.split()]]}))

# Convert the area to square feet
df['foul_ft'] = df['foul_area'].apply(lambda x: x*10.7639)
df['fop_ft'] = df['fop_area'].apply(lambda x: x*10.7639)
# round the area to an integer
df['foul_ft'] = df['foul_ft'].round(0)
df['fop_ft'] = df['fop_ft'].round(0)

# drop the square meter area columns
df = df.drop(['foul_area', 'fop_area'], axis=1)

# drop the polygon coordinates
df = df.drop(['foul', 'fop'], axis=1)

# calculate the ratio of foul ground to total area and convert to percentage
df['foul_pct'] = (df['foul_ft']/(df['fop_ft']+df['foul_ft']))*100
# round the percentage to 2 decimal places
df['foul_pct'] = df['foul_pct'].round(2)

### Split the home_plate coordinates into lat and long columns
df['h_lat'] = df['home_plate'].str.split(',').str[0]
df['h_long'] = df['home_plate'].str.split(',').str[1]

## Drop the home_plate column
df = df.drop(['home_plate'], axis=1)

# # Outputs a clean CSV with area clean names, correct levels and calculations
# df.to_csv('TEMP/tost.csv', index=False)





## Working Above - can consol

### next thing to add is fuzzy matching link to the enrollment data

In [7]:
### Output dataframe to use in tableau
df.to_csv('data/tab_book.csv', index=False)

df.head()

Unnamed: 0,field,level,foul_ft,fop_ft,foul_pct,h_lat,h_long
0,Adrian College,college,14280.0,106000.0,11.87,-84.0697145,41.901861
1,Airport High School - Carleton,high_school,21262.0,88074.0,19.45,-83.3765218,42.0373919
2,Aldai Stevenson High School,high_school,20786.0,88749.0,18.98,-83.0144563,42.5871203
4,Algonac High School,high_school,30427.0,83362.0,26.74,-82.5823976,42.6286202
5,Allen Park High School,high_school,21933.0,94655.0,18.81,-83.2273711,42.2455509
