In [1]:
### Dependencies

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re
import time

In [2]:
## Start timer
start_time = time.time()



### This notebook is for extracting the data from the original kml file and outputing a JSON that will be easy to use with Google Maps API



In [3]:
## LOAD BLOCK###
#### Load data from kml file exported from Google Earth

file_path = ('data/kml/ballparks.kml') # file path to kml file

with open(file_path) as file:

    xml_data = file.read()



# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')
folders = soup.Document.Folder
list = soup.Document.Folder.find_all('Folder')

## Create a dataframe to hold the data parsed from xml
df = pd.DataFrame(columns=['field', 'foul', 'fop'])

failed = []


In [4]:
#### EXTRACTION BLOCK ####
#### Extract data from kml file

# Create an empty list to store the rows to append to the DataFrame
rows = []

# Loop through the folders and extract the data
for folder in list:
    try:
        field_name = folder.find('name').text
        foul = folder.find_all('coordinates')[0].text
        fop = folder.find_all('coordinates')[1].text

        row = {
            'field': field_name,
            'foul': foul,
            'fop': fop
        }

        rows.append(row)

    except Exception as e:
        # Add name of folder to a list of failed folders
        failed.append(folder.find('name').text)
        print(f"Error processing folder: {folder.find('name').text}. Error message: {str(e)}")

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows)

# Print a list of failed folders
print(f"Failed to process {len(failed)} folders: {', '.join(failed)}")


Error processing folder: Progressive Field - Cleveland - MLB. Error message: list index out of range
Failed to process 1 folders: Progressive Field - Cleveland - MLB


In [5]:
# Define a dictionary that maps level indicators to levels and size factors
level_dict = {
    'International': 'international',
    'Major Leagues': 'mlb', 
    'Professional': 'pro', 
    'College': 'college', 
    'High School': 'high_school',
    'Youth': 'youth',
    
}

In [6]:
# Create a copy of the original DataFrame
df_cleaned = df.copy()

# Remove new line and space characters from coordinates
df_cleaned = df_cleaned.replace(r'\n','', regex=True) 
df_cleaned = df_cleaned.replace(r'\t','', regex=True) 

# Drop any duplicate rows
df_cleaned = df_cleaned.drop_duplicates(subset=['field'], keep='first')

# Drop any rows with empty fields
df_cleaned = df_cleaned[(df_cleaned != 0).all(1)]

# Define the regex patterns for each level
re_mlb = re.compile(r'mlb', re.IGNORECASE)
re_pro = re.compile(r'pro|semi[-\s]*pro', re.IGNORECASE)
re_college = re.compile(r'college', re.IGNORECASE)
re_high_school = re.compile(r'high school|hs', re.IGNORECASE)  # Include the abbreviation 'hs'
re_youth = re.compile(r'youth', re.IGNORECASE)
re_muni = re.compile(r'muni', re.IGNORECASE)
re_international = re.compile(r'international', re.IGNORECASE)

# Define a function to classify the fields based on the regex patterns
def classify_field(field_name):
    if re_mlb.search(field_name):
        return 'Major League'
    elif re_pro.search(field_name):
        return 'Professional'
    elif re_college.search(field_name):
        return 'College'
    elif re_high_school.search(field_name):
        return 'High School'
    elif re_youth.search(field_name):
        return 'Youth'
    elif re_muni.search(field_name):
        return 'State / County / Municipal'
    elif re_international.search(field_name):
        return 'International'
    else:
        return 'Unknown'

# Apply the classify_field function to the 'field' column
df_cleaned['level'] = df_cleaned['field'].apply(classify_field)

# Clean up the 'field' column by removing the level indicator and any trailing '-' characters
level_regex = r'\s*(%s)\s*' % '|'.join(re.escape(level) for level in level_dict.values())
df_cleaned['field'] = df_cleaned['field'].str.replace(level_regex, '', regex=True, flags=re.IGNORECASE)
df_cleaned['field'] = df_cleaned['field'].str.replace(r'-\s*$', '', regex=True)

# Rename field column to park_name to avoid confusion down the line
df_cleaned = df_cleaned.rename(columns={'field': 'park_name'})



In [7]:
df_cleaned.head()

## print a list of all the values in the level column
# print(df_cleaned['level'].unique())

## Print the two the headers and two rows of data to a txt file for reference


# with open('data/rows.txt', 'w') as f:
#     f.write(df_cleaned.iloc[0].to_string())
#     f.write(df_cleaned.iloc[1].to_string())
#     f.write(df_cleaned.iloc[14].to_string())
    

# print(df_cleaned.iloc[15].to_string())


Unnamed: 0,park_name,foul,fop,level
0,Ann Arbor Skyline HS,"-83.77546719999999,42.304163,0 -83.7755027,42....","-83.77546719999999,42.304163,0 -83.7743031,42....",High School
1,Bellaire High School,"-85.1912578,44.9766363,0 -85.19023989999999,44...","-85.1912578,44.9766363,0 -85.1906288,44.975944...",High School
2,Benton Harbor HS - high_scool,"-86.4624276,42.1020796,0 -86.4622586,42.102980...","-86.4624276,42.1020796,0 -86.4612434,42.101929...",High School
3,Berrien Springs HS,"-86.34676399999999,41.9452802,0 -86.3466903000...","-86.34676399999999,41.9452802,0 -86.3479629999...",High School
4,Brooklyn Columbia Central HS,"-84.28050330000001,42.0874482,0 -84.2811256,42...","-84.28050330000001,42.0874482,0 -84.2795833,42...",High School


In [8]:
##### Clean up polygon data and create a new home_plate column

def parse_coordinates(coord_string):
    coords = coord_string.split()
    parsed_coords = [tuple(map(float, coord.split(',')[:2])) for coord in coords]
    return parsed_coords

# Create a new column for the home_plate location using the first set of coordinates in the 'fop' column
df_cleaned['home_plate'] = df_cleaned['fop'].apply(lambda x: parse_coordinates(x)[0])

# Apply the parse_coordinates function to the 'foul' and 'fop' columns
df_cleaned['foul'] = df_cleaned['foul'].apply(parse_coordinates)
df_cleaned['fop'] = df_cleaned['fop'].apply(parse_coordinates)


In [9]:
## Doesn't seem to be returning useful data - will need to revisit

# def determine_direction(coordinates):
#     num_points = len(coordinates)
#     if num_points < 2:
#         return "Not enough points"

#     # Get the latitude (y-coordinate) values
#     latitudes = [point[1] for point in coordinates]

#     # Check the change in latitude values
#     increasing_latitudes = all(latitudes[i] <= latitudes[i+1] for i in range(num_points-1))
#     decreasing_latitudes = all(latitudes[i] >= latitudes[i+1] for i in range(num_points-1))

#     if increasing_latitudes:
#         return "Left Field"
#     elif decreasing_latitudes:
#         return "Right Field"
#     else:
#         return "Collinear"

# # Apply the determine_direction function to the 'fop' and 'foul' columns
# df_cleaned['fop_direction'] = df_cleaned['fop'].apply(determine_direction)
# df_cleaned['foul_direction'] = df_cleaned['foul'].apply(determine_direction)

# # Print the updated DataFrame
# print(df_cleaned)



In [10]:
df_cleaned.head()

## Value counts for the fop_direction column
df_cleaned['level'].value_counts()

## Value counts for the foul_direction column
# df_cleaned['foul_direction'].value_counts()

df_cleaned.head()




Unnamed: 0,park_name,foul,fop,level,home_plate
0,Ann Arbor Skyline HS,"[(-83.7754672, 42.304163), (-83.7755027, 42.30...","[(-83.7754672, 42.304163), (-83.7743031, 42.30...",High School,"(-83.7754672, 42.304163)"
1,Bellaire High School,"[(-85.1912578, 44.9766363), (-85.1902399, 44.9...","[(-85.1912578, 44.9766363), (-85.1906288, 44.9...",High School,"(-85.1912578, 44.9766363)"
2,Benton Harbor HS - high_scool,"[(-86.4624276, 42.1020796), (-86.4622586, 42.1...","[(-86.4624276, 42.1020796), (-86.4612434, 42.1...",High School,"(-86.4624276, 42.1020796)"
3,Berrien Springs HS,"[(-86.346764, 41.9452802), (-86.3466903, 41.94...","[(-86.346764, 41.9452802), (-86.347963, 41.945...",High School,"(-86.346764, 41.9452802)"
4,Brooklyn Columbia Central HS,"[(-84.2805033, 42.0874482), (-84.2811256, 42.0...","[(-84.2805033, 42.0874482), (-84.2795833, 42.0...",High School,"(-84.2805033, 42.0874482)"


In [11]:
import pyproj
from shapely.geometry import Polygon
from shapely.ops import transform


def calculate_area(coords):
    # Create a Polygon object from the coordinates
    polygon = Polygon(coords)

    # Calculate the centroid of the polygon
    centroid = polygon.centroid

    # Create a custom LAEA projection centered on the centroid
    custom_projection = f"+proj=laea +lat_0={centroid.y} +lon_0={centroid.x} +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"

    # Create a transformer for converting coordinates to the custom LAEA projection
    transformer = pyproj.Transformer.from_crs(
        pyproj.CRS("EPSG:4326"),  # WGS 84 (latitude and longitude)
        pyproj.CRS(custom_projection),  # Custom LAEA projection
        always_xy=True
    )

    # Define a function to transform coordinates using the transformer
    def transform_coordinates(x, y):
        return transformer.transform(x, y)

    # Convert the coordinates to the custom LAEA projection
    polygon_laea = transform(transform_coordinates, polygon)

    # Calculate the area in square meters
    area_sqm = polygon_laea.area

    # Convert the area to square feet (1 square meter = 10.764 square feet)
    area_sqft = area_sqm * 10.764

    return area_sqft



### Call Function and add to dataframe
df_cleaned['foul_area_sqft'] = df_cleaned['foul'].apply(calculate_area)
df_cleaned['fop_area_sqft'] = df_cleaned['fop'].apply(calculate_area)

## Calculate the total area of the field and the ratio of foul area to field area
df_cleaned['field_area_sqft'] = df_cleaned['foul_area_sqft'] + df_cleaned['fop_area_sqft']
## Percentage foul area
df_cleaned['foul_area_per'] = df_cleaned['foul_area_sqft'] / df_cleaned['field_area_sqft']
## Fair to Foul Ratio
df_cleaned['fair_to_foul'] = df_cleaned['fop_area_sqft'] / df_cleaned['foul_area_sqft']


In [12]:
from geopy.distance import great_circle
import numpy as np

def interpolate_points(start, end, length_ratio):
    start_np = np.array(start)
    end_np = np.array(end)
    return tuple(start_np + (end_np - start_np) * length_ratio)

def calculate_distances(home_plate, outfield_coords, num_points=30):
    def is_same_point(point1, point2, tolerance=1e-6):
        return abs(point1[0] - point2[0]) < tolerance and abs(point1[1] - point2[1]) < tolerance

    home_plate_lat_lon = (home_plate[1], home_plate[0])
    distances = []

    # Calculate total line length
    total_length = 0
    segments = []
    for i in range(len(outfield_coords) - 1):
        start = outfield_coords[i]
        end = outfield_coords[i + 1]
        if not is_same_point(home_plate, start) and not is_same_point(home_plate, end):
            segment_length = great_circle((start[1], start[0]), (end[1], end[0])).feet
            segments.append((start, end, segment_length))
            total_length += segment_length

    # Calculate the distance between equally spaced points
    spacing = total_length / (num_points - 1)

    # Interpolate points and calculate distances
    current_length = 0
    segment_index = 0
    for i in range(num_points):
        while segment_index < len(segments) - 1 and current_length > segments[segment_index][2]:
            current_length -= segments[segment_index][2]
            segment_index += 1

        start, end, segment_length = segments[segment_index]
        length_ratio = current_length / segment_length
        point = interpolate_points(start, end, length_ratio)
        distance = round(great_circle(home_plate_lat_lon, (point[1], point[0])).feet)
        distances.append(distance)

        current_length += spacing

    return distances

# Calculate distances for each row
df_cleaned['distances'] = df_cleaned.apply(lambda row: calculate_distances(row['home_plate'], row['fop']), axis=1)

# Calculate max, min, and average distances for each row
df_cleaned['max_distance'] = df_cleaned['distances'].apply(max)
df_cleaned['min_distance'] = df_cleaned['distances'].apply(min)
df_cleaned['avg_distance'] = df_cleaned['distances'].apply(lambda distances: sum(distances) / len(distances))
# get the median distance
df_cleaned['median_distance'] = df_cleaned['distances'].apply(lambda distances: np.median(distances))


In [13]:
######## CHECK BLOCK ########

## Check how long the distance list is for each row
df_cleaned['num_distances'] = df_cleaned['distances'].apply(len)

## Print the value counts for the 'num_distances' column
df_cleaned['num_distances'].value_counts()


30    716
Name: num_distances, dtype: int64

In [14]:
### Get Geolocation of each field based on home plate coordinates and return state and country
### This block takes a long time to run - will need to revisit
## up to ten minutes

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
from tqdm import tqdm

geolocator = Nominatim(user_agent="baseball_field_locator")

# Function to get location information
def get_location_info(lng, lat):
    try:
        location = geolocator.reverse((lat, lng), timeout=10)
        state = location.raw['address'].get('state', None)
        country = location.raw['address'].get('country', None)
        return state, country
    except GeocoderTimedOut:
        print(f"GeocoderTimedOut error for coordinates: ({lng}, {lat})")
        return None, None
    except GeocoderServiceError:
        print(f"GeocoderServiceError for coordinates: ({lng}, {lat})")
        return None, None

# Extract the first coordinate for each field
df_cleaned['lng'], df_cleaned['lat'] = zip(*df_cleaned['home_plate'].apply(lambda x: x))

# Wrap the DataFrame apply function with tqdm for progress indication
tqdm.pandas(desc="Processing coordinates")

# Get state and country information for each field
df_cleaned[['state', 'country']] = df_cleaned.progress_apply(lambda row: get_location_info(row['lng'], row['lat']), axis=1, result_type='expand')


Processing coordinates: 100%|██████████| 716/716 [05:58<00:00,  2.00it/s]


In [15]:
df = df_cleaned.copy()

In [16]:
# ### Create a dataframe of just high school fields in Michigan
# hs_df = df[df['level'] == 'High School']


### Create a dataframe of all other fields
other_df = df[df['level'] != 'High School']

## Create a dataframe that is just fields in Michigan
mi_df = df[df['state'] == 'Michigan']

## get rid of the fields youth fields
mi_df = mi_df[mi_df['level'] != 'Youth']





print(df.columns)

mi_df.info()

Index(['park_name', 'foul', 'fop', 'level', 'home_plate', 'foul_area_sqft',
       'fop_area_sqft', 'field_area_sqft', 'foul_area_per', 'fair_to_foul',
       'distances', 'max_distance', 'min_distance', 'avg_distance',
       'median_distance', 'num_distances', 'lng', 'lat', 'state', 'country'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 536 entries, 0 to 720
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   park_name        536 non-null    object 
 1   foul             536 non-null    object 
 2   fop              536 non-null    object 
 3   level            536 non-null    object 
 4   home_plate       536 non-null    object 
 5   foul_area_sqft   536 non-null    float64
 6   fop_area_sqft    536 non-null    float64
 7   field_area_sqft  536 non-null    float64
 8   foul_area_per    536 non-null    float64
 9   fair_to_foul     536 non-null    float64
 10  distances        5

In [17]:
df_cleaned = mi_df.copy()

In [18]:
#### Calculate Ranks for each field
### Grouped by level

### NEED TO BE AFTER BLOCK ASSIGNING GEO LOCATIONS AND FILTERING BY STATE TO CREATE MHSAA SPECIFIC DF

def rank_fields(df):
    # Calculate the rank for each category
    df['max_distance_rank'] = df['max_distance'].rank(ascending=False, method='min')
    df['min_distance_rank'] = df['min_distance'].rank(ascending=False, method='min')
    df['avg_distance_rank'] = df['avg_distance'].rank(ascending=False, method='min')
    df['median_distance_rank'] = df['median_distance'].rank(ascending=False, method='min')
    df['field_area_rank'] = df['field_area_sqft'].rank(ascending=False, method='min')
    df['foul_area_rank'] = df['foul_area_sqft'].rank(ascending=False, method='min')
    df['fop_area_per_rank'] = df['fop_area_sqft'].rank(ascending=False, method='min')
    df['ratio_rank'] = df['fair_to_foul'].rank(ascending=False, method='min')

    return df

# Group the DataFrame by level and apply the rank_fields function to each group
df_ranked = df_cleaned.apply(rank_fields)

# Reset the index to get the original DataFrame structure
df_ranked.reset_index(drop=True, inplace=True)


In [19]:
## Rename df bnack to df_cleaned
df_cleaned = df_ranked

## Show samples of the data from each level

df_ranked[df_ranked['level'] == 'High School'].head(10)

# df_ranked.head()

Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,state,country,max_distance_rank,min_distance_rank,avg_distance_rank,median_distance_rank,field_area_rank,foul_area_rank,fop_area_per_rank,ratio_rank
0,Ann Arbor Skyline HS,"[(-83.7754672, 42.304163), (-83.7755027, 42.30...","[(-83.7754672, 42.304163), (-83.7743031, 42.30...",High School,"(-83.7754672, 42.304163)",34964.691149,92699.203934,127663.895083,0.273881,2.651223,...,Michigan,United States,180.0,147.0,152.0,122.0,86.0,73.0,160.0,393.0
1,Bellaire High School,"[(-85.1912578, 44.9766363), (-85.1902399, 44.9...","[(-85.1912578, 44.9766363), (-85.1906288, 44.9...",High School,"(-85.1912578, 44.9766363)",22921.180838,85543.834852,108465.01569,0.211323,3.732087,...,Michigan,United States,451.0,328.0,378.0,366.0,391.0,365.0,344.0,139.0
2,Benton Harbor HS - high_scool,"[(-86.4624276, 42.1020796), (-86.4622586, 42.1...","[(-86.4624276, 42.1020796), (-86.4612434, 42.1...",High School,"(-86.4624276, 42.1020796)",32035.581645,103562.743401,135598.325046,0.236254,3.232741,...,Michigan,United States,26.0,29.0,24.0,34.0,26.0,135.0,11.0,243.0
3,Berrien Springs HS,"[(-86.346764, 41.9452802), (-86.3466903, 41.94...","[(-86.346764, 41.9452802), (-86.347963, 41.945...",High School,"(-86.346764, 41.9452802)",26097.189734,90254.560017,116351.749751,0.224296,3.458401,...,Michigan,United States,126.0,295.0,194.0,235.0,266.0,289.0,219.0,191.0
4,Brooklyn Columbia Central HS,"[(-84.2805033, 42.0874482), (-84.2811256, 42.0...","[(-84.2805033, 42.0874482), (-84.2795833, 42.0...",High School,"(-84.2805033, 42.0874482)",28305.998957,80109.952635,108415.951592,0.261087,2.83014,...,Michigan,United States,245.0,399.0,428.0,455.0,393.0,230.0,448.0,348.0
5,Central Lake HS,"[(-85.2690937, 45.0671277), (-85.2679149, 45.0...","[(-85.2690937, 45.0671277), (-85.269142, 45.06...",High School,"(-85.2690937, 45.0671277)",17031.960373,75950.501772,92982.462144,0.183174,4.459293,...,Michigan,United States,472.0,407.0,455.0,463.0,472.0,460.0,466.0,52.0
6,Croswell-Lexington HS,"[(-82.5997748, 43.2674602), (-82.5998486, 43.2...","[(-82.5997748, 43.2674602), (-82.5985947, 43.2...",High School,"(-82.5997748, 43.2674602)",20877.921771,86121.164373,106999.086144,0.195122,4.124987,...,Michigan,United States,417.0,147.0,328.0,322.0,409.0,405.0,333.0,82.0
7,Durand HS - high school,"[(-83.9828578, 42.9168369), (-83.9829396, 42.9...","[(-83.9828578, 42.9168369), (-83.9817031, 42.9...",High School,"(-83.9828578, 42.9168369)",22506.415172,88814.942727,111321.3579,0.202175,3.946206,...,Michigan,United States,161.0,225.0,302.0,293.0,351.0,374.0,266.0,103.0
8,Eaton Rapids High School,"[(-84.6351432, 42.5118215), (-84.6360042, 42.5...","[(-84.6351432, 42.5118215), (-84.6343814, 42.5...",High School,"(-84.6351432, 42.5118215)",38639.457736,101086.884585,139726.342321,0.276537,2.616157,...,Michigan,United States,28.0,246.0,27.0,22.0,12.0,35.0,25.0,398.0
9,Powers Catholic HS,"[(-83.7055549, 43.0025635), (-83.7061516, 43.0...","[(-83.7055549, 43.0025635), (-83.7045396, 43.0...",High School,"(-83.7055549, 43.0025635)",28473.249984,97620.131234,126093.381218,0.225811,3.428486,...,Michigan,United States,136.0,147.0,63.0,39.0,106.0,227.0,62.0,201.0


In [20]:
#### Orienting the map to the home plate location ####

### Find the center of the field
def calculate_centroid(coords):
    x_coords = [coord[0] for coord in coords]
    y_coords = [coord[1] for coord in coords]
    centroid_x = sum(x_coords) / len(coords)
    centroid_y = sum(y_coords) / len(coords)
    return (centroid_x, centroid_y)


## Find the bearing between the home plate and the center of the field
import math

def calculate_bearing(point1, point2):
    lat1, lon1 = math.radians(point1[1]), math.radians(point1[0])
    lat2, lon2 = math.radians(point2[1]), math.radians(point2[0])

    d_lon = lon2 - lon1

    x = math.cos(lat2) * math.sin(d_lon)
    y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(d_lon)

    bearing = math.degrees(math.atan2(x, y))
    bearing = (bearing + 360) % 360  # Normalize the bearing to the range [0, 360)

    return bearing

### Function to classify direction in laymans terms North, South, East, West, ect
def degrees_to_cardinal_direction(degrees):
    directions = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N']
    index = round(degrees / 45)
    return directions[index]



In [21]:
# Calculate the centroid of the outfield fence coordinates for each row
df_cleaned['fop_centroid'] = df_cleaned['fop'].apply(lambda coords: calculate_centroid(coords[1:]))

# Calculate the bearing between home plate and the centroid for each row
df_cleaned['field_orientation'] = df_cleaned.apply(lambda row: calculate_bearing(row['home_plate'], row['fop_centroid']), axis=1)

# Convert the bearing to a cardinal direction
df_cleaned['field_cardinal_direction'] = df_cleaned['field_orientation'].apply(degrees_to_cardinal_direction)



In [22]:
### Rename the Cleaned Dataframe back to the default name
# df = df_cleaned

In [23]:
# ### In the fop, foul and home_plate columns, the coordinates are in the format (longitude, latitude).
# ### This is the opposite of the format that is used in Google Maps, so we need to reverse the order of the coordinates.

# df['fop'] = df['fop'].apply(lambda coords: [(coord[1], coord[0]) for coord in coords])
# df['foul'] = df['foul'].apply(lambda coords: [(coord[1], coord[0]) for coord in coords])
# df['home_plate'] = df['home_plate'].apply(lambda coord: (coord[1], coord[0]))


In [24]:
df_cleaned.head(20)


Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,min_distance_rank,avg_distance_rank,median_distance_rank,field_area_rank,foul_area_rank,fop_area_per_rank,ratio_rank,fop_centroid,field_orientation,field_cardinal_direction
0,Ann Arbor Skyline HS,"[(-83.7754672, 42.304163), (-83.7755027, 42.30...","[(-83.7754672, 42.304163), (-83.7743031, 42.30...",High School,"(-83.7754672, 42.304163)",34964.691149,92699.203934,127663.895083,0.273881,2.651223,...,147.0,152.0,122.0,86.0,73.0,160.0,393.0,"(-83.77469396969695, 42.304841136363635)",40.140275,NE
1,Bellaire High School,"[(-85.1912578, 44.9766363), (-85.1902399, 44.9...","[(-85.1912578, 44.9766363), (-85.1906288, 44.9...",High School,"(-85.1912578, 44.9766363)",22921.180838,85543.834852,108465.01569,0.211323,3.732087,...,328.0,378.0,366.0,391.0,365.0,344.0,139.0,"(-85.19019890833334, 44.976457197222224)",103.446845,E
2,Benton Harbor HS - high_scool,"[(-86.4624276, 42.1020796), (-86.4622586, 42.1...","[(-86.4624276, 42.1020796), (-86.4612434, 42.1...",High School,"(-86.4624276, 42.1020796)",32035.581645,103562.743401,135598.325046,0.236254,3.232741,...,29.0,24.0,34.0,26.0,135.0,11.0,243.0,"(-86.46142913703704, 42.10250644074075)",60.049884,NE
3,Berrien Springs HS,"[(-86.346764, 41.9452802), (-86.3466903, 41.94...","[(-86.346764, 41.9452802), (-86.347963, 41.945...",High School,"(-86.346764, 41.9452802)",26097.189734,90254.560017,116351.749751,0.224296,3.458401,...,295.0,194.0,235.0,266.0,289.0,219.0,191.0,"(-86.34750639600001, 41.944627432)",220.228554,SW
4,Brooklyn Columbia Central HS,"[(-84.2805033, 42.0874482), (-84.2811256, 42.0...","[(-84.2805033, 42.0874482), (-84.2795833, 42.0...",High School,"(-84.2805033, 42.0874482)",28305.998957,80109.952635,108415.951592,0.261087,2.83014,...,399.0,428.0,455.0,393.0,230.0,448.0,348.0,"(-84.2803677, 42.088261404545456)",7.054243,N
5,Central Lake HS,"[(-85.2690937, 45.0671277), (-85.2679149, 45.0...","[(-85.2690937, 45.0671277), (-85.269142, 45.06...",High School,"(-85.2690937, 45.0671277)",17031.960373,75950.501772,92982.462144,0.183174,4.459293,...,407.0,455.0,463.0,472.0,460.0,466.0,52.0,"(-85.26837954594596, 45.06659005945946)",136.827198,SE
6,Croswell-Lexington HS,"[(-82.5997748, 43.2674602), (-82.5998486, 43.2...","[(-82.5997748, 43.2674602), (-82.5985947, 43.2...",High School,"(-82.5997748, 43.2674602)",20877.921771,86121.164373,106999.086144,0.195122,4.124987,...,147.0,328.0,322.0,409.0,405.0,333.0,82.0,"(-82.59905028846153, 43.268063453846146)",41.170188,NE
7,Durand HS - high school,"[(-83.9828578, 42.9168369), (-83.9829396, 42.9...","[(-83.9828578, 42.9168369), (-83.9817031, 42.9...",High School,"(-83.9828578, 42.9168369)",22506.415172,88814.942727,111321.3579,0.202175,3.946206,...,225.0,302.0,293.0,351.0,374.0,266.0,103.0,"(-83.98229320833333, 42.917406116666676)",35.994052,NE
8,Eaton Rapids High School,"[(-84.6351432, 42.5118215), (-84.6360042, 42.5...","[(-84.6351432, 42.5118215), (-84.6343814, 42.5...",High School,"(-84.6351432, 42.5118215)",38639.457736,101086.884585,139726.342321,0.276537,2.616157,...,246.0,27.0,22.0,12.0,35.0,25.0,398.0,"(-84.63518998974362, 42.51269676410258)",357.743412,N
9,Powers Catholic HS,"[(-83.7055549, 43.0025635), (-83.7061516, 43.0...","[(-83.7055549, 43.0025635), (-83.7045396, 43.0...",High School,"(-83.7055549, 43.0025635)",28473.249984,97620.131234,126093.381218,0.225811,3.428486,...,147.0,63.0,39.0,106.0,227.0,62.0,201.0,"(-83.70529651304346, 43.00339221521738)",12.844848,N


In [25]:
# Rename the Cleaned Dataframe back to the default name
df = df_cleaned

In [26]:
######### TOGGLE THIS TO CHANGE THE DATAFRAME USED FOR THE REST OF THE NOTEBOOK #########


### Rename the michigan dataframe to df so it can be used in the rest of the notebook
# hs_df = mi_df
hs_df = df_cleaned


In [27]:
import pandas as pd
from fuzzywuzzy import fuzz, process

def find_best_match(school_name, choices, score_cutoff=80):
    best_match = process.extractOne(school_name, choices, scorer=fuzz.token_sort_ratio, score_cutoff=score_cutoff)
    if best_match:
        return best_match[0]
    else:
        return None

# Read CSV files
mhsaa_df = pd.read_csv('data/school_info/mhsaa_enrolment_2022.csv')
name_color_df = pd.read_csv('data\school_info\mhsaa_school_nickname_color_2020.csv')

# Get the list of park names from hs_df
park_names = hs_df['park_name'].tolist()

# Apply find_best_match function to create a new column 'best_match' in mhsaa_df
mhsaa_df['best_match'] = mhsaa_df['school_name'].apply(find_best_match, choices=park_names, score_cutoff=90)

## Pull the school_id, school_name, students, and division columns from mhsaa_df and add to hs_df
columns_to_extract = ['school_id', 'school_name', 'students', 'division']
for col in columns_to_extract:
    hs_df[col] = hs_df['park_name'].apply(lambda x: mhsaa_df.loc[mhsaa_df['best_match'] == x, col].iloc[0] if not mhsaa_df.loc[mhsaa_df['best_match'] == x, col].empty else None)

# Apply find_best_match function to create a new column 'best_match' in name_color_df
name_color_df['best_match'] = name_color_df['School'].apply(find_best_match, choices=park_names, score_cutoff=80)

## Pull the data from the name_color_df and add to hs_df (Nickname,Color1,Color2,Color3,Color4)
columns_to_extract = ['Nickname', 'Color1', 'Color2', 'Color3', 'Color4']
for col in columns_to_extract:
    hs_df[col] = hs_df['park_name'].apply(lambda x: name_color_df.loc[name_color_df['best_match'] == x, col].iloc[0] if not name_color_df.loc[name_color_df['best_match'] == x, col].empty else None)

# Drop the 'best_match' columns
# hs_df.drop(columns=['best_match'], inplace=True)

## Take a look at the new hs_df
hs_df.head()



# # Lookup the mhsaa_df['best_match'] in hs_df and return the columns: 'school_id', 'school_name', 'students', 'division'
# columns_to_extract = ['school_id', 'school_name', 'students', 'division']
# for col in columns_to_extract:
#     mhsaa_df[col] = mhsaa_df['best_match'].apply(lambda x: hs_df.loc[hs_df['park_name'] == x, col].iloc[0] if not hs_df.loc[hs_df['park_name'] == x, col].empty else None)

# # Apply find_best_match function to create a new column 'best_match' in name_color_df
# name_color_df['best_match'] = name_color_df['School'].apply(find_best_match, choices=park_names, score_cutoff=80)

# # Merge hs_df with mhsaa_df and name_color_df on the 'park_name' and 'best_match' columns
# hs_df = hs_df.merge(mhsaa_df, left_on='park_name', right_on='best_match', how='left', suffixes=('', '_from_mhsaa'))
# hs_df = hs_df.merge(name_color_df, left_on='park_name', right_on='best_match', how='left', suffixes=('', '_from_name_color'))

# # Drop the 'best_match' columns
# hs_df.drop(columns=['best_match_x', 'best_match_y'], inplace=True)


Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,field_cardinal_direction,school_id,school_name,students,division,Nickname,Color1,Color2,Color3,Color4
0,Ann Arbor Skyline HS,"[(-83.7754672, 42.304163), (-83.7755027, 42.30...","[(-83.7754672, 42.304163), (-83.7743031, 42.30...",High School,"(-83.7754672, 42.304163)",34964.691149,92699.203934,127663.895083,0.273881,2.651223,...,NE,2032.0,Ann Arbor - Skyline HS,1588.0,A,Eagles,Columbia Blue,White,,
1,Bellaire High School,"[(-85.1912578, 44.9766363), (-85.1902399, 44.9...","[(-85.1912578, 44.9766363), (-85.1906288, 44.9...",High School,"(-85.1912578, 44.9766363)",22921.180838,85543.834852,108465.01569,0.211323,3.732087,...,E,,,,,,,,,
2,Benton Harbor HS - high_scool,"[(-86.4624276, 42.1020796), (-86.4622586, 42.1...","[(-86.4624276, 42.1020796), (-86.4612434, 42.1...",High School,"(-86.4624276, 42.1020796)",32035.581645,103562.743401,135598.325046,0.236254,3.232741,...,NE,,,,,,,,,
3,Berrien Springs HS,"[(-86.346764, 41.9452802), (-86.3466903, 41.94...","[(-86.346764, 41.9452802), (-86.347963, 41.945...",High School,"(-86.346764, 41.9452802)",26097.189734,90254.560017,116351.749751,0.224296,3.458401,...,SW,1346.0,Berrien Springs HS,491.0,B,Shamrocks,Green,White,,
4,Brooklyn Columbia Central HS,"[(-84.2805033, 42.0874482), (-84.2811256, 42.0...","[(-84.2805033, 42.0874482), (-84.2795833, 42.0...",High School,"(-84.2805033, 42.0874482)",28305.998957,80109.952635,108415.951592,0.261087,2.83014,...,N,1453.0,Brooklyn - Columbia Central HS,391.0,C,Golden Eagles,Blue,Gold,,


In [28]:
### examine the output

hs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   park_name                 536 non-null    object 
 1   foul                      536 non-null    object 
 2   fop                       536 non-null    object 
 3   level                     536 non-null    object 
 4   home_plate                536 non-null    object 
 5   foul_area_sqft            536 non-null    float64
 6   fop_area_sqft             536 non-null    float64
 7   field_area_sqft           536 non-null    float64
 8   foul_area_per             536 non-null    float64
 9   fair_to_foul              536 non-null    float64
 10  distances                 536 non-null    object 
 11  max_distance              536 non-null    int64  
 12  min_distance              536 non-null    int64  
 13  avg_distance              536 non-null    float64
 14  median_dis

In [29]:
######## MICHIGAN FIELDS ########
########### TOGGLE THIS ON AND OFF TO OUTPUT THE DATAFRAME TO A json FILE ###########

hs_df.to_json('data/michigan_fields.json', orient='records')

## END OF MICHIGAN OUTPUT 

In [30]:

# merge the two dataframes back to a single one (using append)

# Merge the hs_df and other_df back together
merged_df = pd.concat([hs_df, other_df], ignore_index=True)

In [31]:
# merged_df = df
merged_df.head(10)

# # Save the merged_df DataFrame as a JSON file
merged_df.to_json('data/default_updated_output.json', orient='records')


In [32]:
column_list = merged_df.columns
print(column_list)

# ## Print a list of ten high school field names to test mascot lookup
# print(merged_df[merged_df['level'] == 'high_school']['park_name'].head(10).to_string())

Index(['park_name', 'foul', 'fop', 'level', 'home_plate', 'foul_area_sqft',
       'fop_area_sqft', 'field_area_sqft', 'foul_area_per', 'fair_to_foul',
       'distances', 'max_distance', 'min_distance', 'avg_distance',
       'median_distance', 'num_distances', 'lng', 'lat', 'state', 'country',
       'max_distance_rank', 'min_distance_rank', 'avg_distance_rank',
       'median_distance_rank', 'field_area_rank', 'foul_area_rank',
       'fop_area_per_rank', 'ratio_rank', 'fop_centroid', 'field_orientation',
       'field_cardinal_direction', 'school_id', 'school_name', 'students',
       'division', 'Nickname', 'Color1', 'Color2', 'Color3', 'Color4'],
      dtype='object')


In [33]:
# ### Code to create report on the json file structure to be used as a reference later
# import json
# import pandas as pd
# from collections import defaultdict
# from builtins import list, dict

# # Load the JSON data from the file
# with open('data/updated_output_data.json', 'r') as file:
#     data = json.load(file)

# # Analyze the JSON data
# def analyze_structure(data, prefix=''):
#     structure = defaultdict(set)
    
#     if isinstance(data, dict):
#         for key, value in data.items():
#             new_prefix = f'{prefix}.{key}' if prefix else key
#             structure[new_prefix].add(type(value))
#             structure.update(analyze_structure(value, new_prefix))
#     elif isinstance(data, list):
#         for item in data:
#             structure.update(analyze_structure(item, prefix))
    
#     return structure

# # Generate the report
# structure = analyze_structure(data)
# descriptions = {
#     'field_name': 'The name of the baseball field.',
#     'foul': 'A list of coordinates representing the foul territory of the field. (lat, lon)',
#     'fop': 'A list of coordinates representing the fair territory of the field. (lat, lon)',
#     'level': 'The level of the field, e.g., high_school, college, etc.',
#     'home_plate': 'A list of coordinates representing the home plate location on the field. (lat, lon)',
#     'foul_area_sqft': 'The total area of the foul territory in square feet.',
#     'fop_area_sqft': 'The total area of the fair territory in square feet.',
#     'distances': 'A list of distances from home plate to the outfield fence at the vertices of the wall.',
#     'max_distance': 'The maximum distance from home plate to the outfield fence.',
#     'min_distance': 'The minimum distance from home plate to the outfield fence.',
#     'avg_distance': 'The average distance from home plate to the outfield fence.',
#     'fop_centroid': 'A list of coordinates representing the centroid of the fair territory.',
#     'field_orientation': "The angle (in degrees) of the field's orientation, with 0 degrees being North.",
#     'field_cardinal_direction': "The cardinal direction abbreviation (N, S, E, W, NE, NW, SE, SW) representing the field's orientation.",
#     'match': 'The matched school name found using fuzzy matching.',
#     'school_id': 'The unique identifier of the matched school.',
#     'school_name': 'The name of the matched school.',
#     'students': 'The number of students enrolled in the matched school.',
#     'division': 'The athletic division the matched school belongs to.'
# }

# # Replace <filename> with your desired filename without the extension
# filename = "output_data"

# def get_sample_value(data, key):
#     for item in data:
#         if key in item and item[key] is not None:
#             return item[key]
#     return None

# # Generate the report with sample values
# report = pd.DataFrame([(key, ', '.join([t.__name__ for t in types]), descriptions.get(key, ''), get_sample_value(data, key)) 
#                        for key, types in structure.items()],
#                       columns=['Key', 'Data Types', 'Description', 'Sample Value'])

# # Save the report as a CSV file
# report.to_csv(f"{filename}_report.csv", index=False)

# print(report)



In [34]:
# ### Code to create report on some interesting stats like total fields, total area, ect

# ## Create a function to measure the total distance of the outside of each polygon
# from shapely.geometry import Polygon
# import pyproj

# def calculate_perimeter(coords):

#     perimeter = 0

#     for i in range(len(coords)):
#         x1, y1 = coords[i]
#         x2, y2 = coords[(i + 1) % len(coords)]
#         perimeter += math.sqrt((x2 - x1)**2 + (y2 - y1)**2)

#     return perimeter

# # Get a sum of all of the perimeters
# total_perimeter_fop = df['fop'].apply(calculate_perimeter).sum()
# total_perimeter_foul = df['foul'].apply(calculate_perimeter).sum()




In [35]:
import json

with open('data/default_updated_output.json', 'r') as file:
    data = json.load(file)



In [36]:
# create a function to process the data, counting the orientations and filtering by level.

from collections import defaultdict

def process_data(data, level_filter=None):
    count_by_orientation = defaultdict(int)
    
    for record in data:
        if level_filter is None or record['level'] == level_filter:
            orientation = round(record['field_orientation'])
            count_by_orientation[orientation] += 1

    return count_by_orientation


In [37]:
def create_polar_chart(data, num_bins=36, level_filter=None):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    bin_width = 2 * np.pi / num_bins

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_offset(np.pi / 2)

    ax.set_facecolor('#808080')
    ###
    # ax.set_yscale('log')

    
    # # Set dark background
    # ax.set_facecolor('#2b2b2b')
    plt.gca().set_rlabel_position(22.5)
    ax.set_ylim(-10, 100)  # Adjust based on max count

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=-20)
    


    # Use custom colors and opacity
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(plt.cm.viridis(r / max(bin_counts)))
        # bar.set_facecolor(plt.cm.plasma(r / max(bin_counts)))
        bar.set_alpha(0.8)

    plt.show()




In [38]:
### UPDATED GPT CODE - LATE NIGHT FRIDAY


import numpy as np
import matplotlib.pyplot as plt

def create_polar_chart(data, num_bins=36, level_filter=None):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    bin_width = 2 * np.pi / num_bins

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_direction(-1)
    ax.set_theta_offset(np.pi / 2)

    ax.set_facecolor('#808080')
    ###
    # ax.set_yscale('log')

    
    # # Set dark background
    ax.set_facecolor('#2b2b2b')
    plt.gca().set_rlabel_position(22.5)
    ax.set_ylim(-20, 130)  # Adjust based on max count

    # Add bars for negative values
    zero_height_bars = ax.bar(bin_edges[:-1], np.abs(ax.get_ylim()[0]), width=bin_width, bottom=0.0, color='k', alpha=0.3)

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=0)
    
    # Use custom colors and opacity
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(plt.cm.viridis(r / max(bin_counts)))
        # bar.set_facecolor(plt.cm.plasma(r / max(bin_counts)))
        bar.set_alpha(0.8)

    plt.show()


In [39]:
create_polar_chart(data, num_bins=50, level_filter=None)


#### GOAL
## fill the center portion of the plot to create a heat map of the field orientations



TypeError: type NoneType doesn't define __round__ method

In [40]:
create_polar_chart(data, num_bins=60, level_filter=None)


TypeError: type NoneType doesn't define __round__ method

In [None]:
# ### HEATMAP CODE

# import numpy as np
# import matplotlib.pyplot as plt

# def create_heatmap(data, num_bins=36, level_filter=None):
#     count_by_orientation = process_data(data, level_filter)

#     # Compute the histogram
#     bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
#     bin_counts = np.zeros(num_bins)

#     for orientation, count in count_by_orientation.items():
#         idx = int(orientation / (360 / num_bins))
#         if idx == num_bins:
#             idx = 0
#         bin_counts[idx] += count

#     # Reshape histogram data into a 2D array
#     heatmap_data = np.tile(bin_counts, (num_bins, 1))

#     # Set plot size
#     plt.figure(figsize=(10, 10))

#     # Create heatmap
#     plt.imshow(heatmap_data, cmap='viridis', aspect='auto', interpolation='nearest', origin='lower')
#     plt.colorbar(label='Counts')

#     # Set x-axis ticks and labels
#     plt.xticks(np.arange(0, num_bins, num_bins // 6), np.arange(0, 361, 60))
#     plt.xlabel('Orientation (degrees)')

#     # Set y-axis ticks and labels (assuming equal radial divisions)
#     max_radius_label = 'Max Radius'
#     plt.yticks(np.arange(0, num_bins, num_bins // 6), [0, 1, 2, 3, 4, max_radius_label])
#     plt.ylabel('Radial Division')

#     plt.show()

# # Example usage:
# # create_heatmap(data)


In [None]:
# # Example usage:
# create_heatmap(data)

In [41]:
### Plot a histogram of the field orientations for all levels

import numpy as np
import matplotlib.pyplot as plt

def create_histogram(data, num_bins=36, level_filter=None):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_edges = np.linspace(0.0, 360.0, num_bins + 1)
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    bin_width = 360 / num_bins

    # Plot the histogram
    fig, ax = plt.subplots()
    ax.bar(bin_edges[:-1], bin_counts, width=bin_width, edgecolor='black')
    ax.set_xlabel('Field Orientation (degrees)')
    ax.set_ylabel('Number of Fields')
    ax.set_title('Field Orientation Histogram')

    # Set the major tick locations
    major_tick_locations = [45, 135, 225, 315]
    plt.xticks(major_tick_locations, major_tick_locations)


    
    plt.show()


In [42]:
create_histogram(data, num_bins=36, level_filter=None)

TypeError: type NoneType doesn't define __round__ method

END WORK BLOCK

In [43]:
df['field_cardinal_direction'].value_counts()

NE    227
SW     74
NW     73
SE     65
N      55
S      29
E       9
W       4
Name: field_cardinal_direction, dtype: int64

In [None]:
import matplotlib.pyplot as plt
# Rename the dataframe back to the default name
df = merged_df

# Set the plot size
plt.figure(figsize=(10, 6))

# Create the scatter plot
plt.scatter(df_cleaned['min_distance'], df_cleaned['max_distance'], alpha=0.8)

# Customize the plot
plt.xlabel('Min Distance (feet)')
plt.ylabel('Max Distance (feet)')
plt.title('Scatter Plot of Min and Max Distances to Outfield Fence')

# Display the plot in the Jupyter Notebook
plt.show()


In [44]:
##### CODE TO MAKE A LIST OF OUTLIERS #####

# Filter the DataFrame for fields with min distances below 100 feet
outliers = df[df['min_distance'] < 100]

# Display the outlier fields in the Jupyter Notebook
print(outliers[['park_name', 'min_distance', 'max_distance']])

# Save the outlier fields to a CSV file
outliers.to_csv('outlier_fields.csv', index=False)



                                park_name  min_distance  max_distance
266               Ann Arbor Greenhills HS             8           422
267                  Ann Arbor Skyline HS             7           442
279                           Bellaire HS             4           443
280                      Benton Harbor HS            10           464
297                             Canton HS            20           449
378                              Homer HS            17           406
480                    Richland Gull Lake             5           434
508    Traverse City Central HS - field 1             5           419
510  Traverse City Central HS - Old Field             8           428
512      Traverse City West HS- old field             5           452


In [45]:
len(outliers)

10

In [None]:
df.head()

In [None]:
end_time = time.time()
total_time = end_time - start_time
print("Total time taken:", total_time, "seconds")
print("Total time taken:", total_time/60, "minutes")


