In [21]:
### BOOK TO WORK OUT THE CODE FOR BASEBALL FIELD COMPARISON

### TAKING THE BONES FROM ETL_FOR_JSON BUT IT NEVER WORKED IN THAT BOOK

## 1. IMPORT LIBRARIES
from bs4 import BeautifulSoup
import pandas as pd
import geopandas as gpd
import os
import json
import numpy as np

import re
import time

import pyproj
from shapely.geometry import Polygon
from shapely.geometry import Point
from shapely.ops import transform
from shapely.affinity import rotate


from geopy.distance import great_circle
import matplotlib.pyplot as plt


start_time = time.time()

## 2. SET UP PATHS
## KML path for the baseball fields
file_path = ('data/kml/MHSAA_2023.kml') # file path to kml file

## 2A. SET UP NESSISARY DICTIONARIES
# Define a dictionary that maps level indicators to levels and size factors
level_dict = {
    'International': 'international',
    'Major Leagues': 'mlb', 
    'Professional': 'pro', 
    'College': 'college', 
    'High School': 'high_school',
    'Youth': 'youth',
    
}




In [22]:
## Load the KML File to evaluate
with open(file_path) as file:

    xml_data = file.read()

# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')
folders = soup.Document.Folder
list = soup.Document.Folder.find_all('Folder')

## Create a dataframe to hold the data parsed from xml
df = pd.DataFrame(columns=['field', 'foul', 'fop'])

failed = []

## 3. PARSE THE KML FILE
# Create an empty list to store the rows to append to the DataFrame
rows = []

# Loop through the folders and extract the data
for folder in list:
    try:
        field_name = folder.find('name').text
        foul = folder.find_all('coordinates')[0].text
        fop = folder.find_all('coordinates')[1].text

        row = {
            'field': field_name,
            'foul': foul,
            'fop': fop
        }

        rows.append(row)

    except Exception as e:
        # Add name of folder to a list of failed folders
        failed.append(folder.find('name').text)
        print(f"Error processing folder: {folder.find('name').text}. Error message: {str(e)}")

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows)

# Print a list of failed folders
print(f"Failed to process {len(failed)} folders: {', '.join(failed)}")

# Create a copy of the original DataFrame
df_cleaned = df.copy()

# Remove new line and space characters from coordinates
df_cleaned = df_cleaned.replace(r'\n','', regex=True) 
df_cleaned = df_cleaned.replace(r'\t','', regex=True) 

# Drop any duplicate rows
df_cleaned = df_cleaned.drop_duplicates(subset=['field'], keep='first')

# Drop any rows with empty fields
df_cleaned = df_cleaned[(df_cleaned != 0).all(1)]

# Define the regex patterns for each level
re_mlb = re.compile(r'mlb', re.IGNORECASE)
re_pro = re.compile(r'pro|semi[-\s]*pro', re.IGNORECASE)
re_college = re.compile(r'college', re.IGNORECASE)
re_high_school = re.compile(r'high school|hs', re.IGNORECASE)  # Include the abbreviation 'hs'
re_youth = re.compile(r'youth', re.IGNORECASE)
re_muni = re.compile(r'muni', re.IGNORECASE)
re_international = re.compile(r'international', re.IGNORECASE)

# Define a function to classify the fields based on the regex patterns
def classify_field(field_name):
    if re_mlb.search(field_name):
        return 'Major League'
    elif re_pro.search(field_name):
        return 'Professional'
    elif re_college.search(field_name):
        return 'College'
    elif re_high_school.search(field_name):
        return 'High School'
    elif re_youth.search(field_name):
        return 'Youth'
    elif re_muni.search(field_name):
        return 'State / County / Municipal'
    elif re_international.search(field_name):
        return 'International'
    else:
        return 'Unknown'

# Apply the classify_field function to the 'field' column
df_cleaned['level'] = df_cleaned['field'].apply(classify_field)

# Clean up the 'field' column by removing the level indicator and any trailing '-' characters
level_regex = r'\s*(%s)\s*' % '|'.join(re.escape(level) for level in level_dict.values())
df_cleaned['field'] = df_cleaned['field'].str.replace(level_regex, '', regex=True, flags=re.IGNORECASE)
df_cleaned['field'] = df_cleaned['field'].str.replace(r'-\s*$', '', regex=True)

# Rename field column to park_name to avoid confusion down the line
df_cleaned = df_cleaned.rename(columns={'field': 'park_name'})

##### Clean up polygon data and create a new home_plate column
def parse_coordinates(coord_string):
    coords = coord_string.split()
    parsed_coords = [tuple(map(float, coord.split(',')[:2])) for coord in coords]
    return parsed_coords

# Create a new column for the home_plate location using the first set of coordinates in the 'fop' column
df_cleaned['home_plate'] = df_cleaned['fop'].apply(lambda x: parse_coordinates(x)[0])
# Apply the parse_coordinates function to the 'foul' and 'fop' columns
df_cleaned['foul'] = df_cleaned['foul'].apply(parse_coordinates)
df_cleaned['fop'] = df_cleaned['fop'].apply(parse_coordinates)

# 4. PROFORM GEOGRAPHIC CALCULATIONS - DISTANCE, AREA, ETC.
def calculate_area(coords):
    # Create a Polygon object from the coordinates
    polygon = Polygon(coords)

    # Calculate the centroid of the polygon
    centroid = polygon.centroid

    # Create a custom LAEA projection centered on the centroid
    custom_projection = f"+proj=laea +lat_0={centroid.y} +lon_0={centroid.x} +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"

    # Create a transformer for converting coordinates to the custom LAEA projection
    transformer = pyproj.Transformer.from_crs(
        pyproj.CRS("EPSG:4326"),  # WGS 84 (latitude and longitude)
        pyproj.CRS(custom_projection),  # Custom LAEA projection
        always_xy=True
    )

    # Define a function to transform coordinates using the transformer
    def transform_coordinates(x, y):
        return transformer.transform(x, y)

    # Convert the coordinates to the custom LAEA projection
    polygon_laea = transform(transform_coordinates, polygon)

    # Calculate the area in square meters
    area_sqm = polygon_laea.area

    # Convert the area to square feet (1 square meter = 10.764 square feet)
    area_sqft = area_sqm * 10.764

    return area_sqft



### Call Function and add to dataframe
df_cleaned['foul_area_sqft'] = df_cleaned['foul'].apply(calculate_area)
df_cleaned['fop_area_sqft'] = df_cleaned['fop'].apply(calculate_area)

## Calculate the total area of the field and the ratio of foul area to field area
df_cleaned['field_area_sqft'] = df_cleaned['foul_area_sqft'] + df_cleaned['fop_area_sqft']
## Percentage foul area
df_cleaned['foul_area_per'] = df_cleaned['foul_area_sqft'] / df_cleaned['field_area_sqft']
## Fair to Foul Ratio
df_cleaned['fair_to_foul'] = df_cleaned['fop_area_sqft'] / df_cleaned['foul_area_sqft']

# 4B. Calculate the distance from home plate to the outfield fences
def interpolate_points(start, end, length_ratio):
    start_np = np.array(start)
    end_np = np.array(end)
    return tuple(start_np + (end_np - start_np) * length_ratio)

def calculate_distances(home_plate, outfield_coords, num_points=540):
    def is_same_point(point1, point2, tolerance=1e-6):
        return abs(point1[0] - point2[0]) < tolerance and abs(point1[1] - point2[1]) < tolerance

    home_plate_lat_lon = (home_plate[1], home_plate[0])
    distances = []

    # Calculate total line length
    total_length = 0
    segments = []
    for i in range(len(outfield_coords) - 1):
        start = outfield_coords[i]
        end = outfield_coords[i + 1]
        if not is_same_point(home_plate, start) and not is_same_point(home_plate, end):
            segment_length = great_circle((start[1], start[0]), (end[1], end[0])).feet
            segments.append((start, end, segment_length))
            total_length += segment_length

    # Calculate the distance between equally spaced points
    spacing = total_length / (num_points - 1)

    # Interpolate points and calculate distances
    current_length = 0
    segment_index = 0
    for i in range(num_points):
        while segment_index < len(segments) - 1 and current_length > segments[segment_index][2]:
            current_length -= segments[segment_index][2]
            segment_index += 1

        start, end, segment_length = segments[segment_index]
        length_ratio = current_length / segment_length
        point = interpolate_points(start, end, length_ratio)
        distance = great_circle(home_plate_lat_lon, (point[1], point[0])).feet
        distances.append(distance)

        current_length += spacing

    return distances

# Calculate distances for each row
df_cleaned['distances'] = df_cleaned.apply(lambda row: calculate_distances(row['home_plate'], row['fop']), axis=1)

# Calculate max, min, and average distances for each row
df_cleaned['max_distance'] = df_cleaned['distances'].apply(max)
df_cleaned['min_distance'] = df_cleaned['distances'].apply(min)
df_cleaned['avg_distance'] = df_cleaned['distances'].apply(lambda distances: sum(distances) / len(distances))
# get the median distance
df_cleaned['median_distance'] = df_cleaned['distances'].apply(lambda distances: np.median(distances))

## Return the dataframe as df
df = df_cleaned




Failed to process 0 folders: 


In [23]:
## Display the dataframe so I can start working on the new functions
print(df.info())

print(df.columns)

df.sample()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 143
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   park_name        140 non-null    object 
 1   foul             140 non-null    object 
 2   fop              140 non-null    object 
 3   level            140 non-null    object 
 4   home_plate       140 non-null    object 
 5   foul_area_sqft   140 non-null    float64
 6   fop_area_sqft    140 non-null    float64
 7   field_area_sqft  140 non-null    float64
 8   foul_area_per    140 non-null    float64
 9   fair_to_foul     140 non-null    float64
 10  distances        140 non-null    object 
 11  max_distance     140 non-null    float64
 12  min_distance     140 non-null    float64
 13  avg_distance     140 non-null    float64
 14  median_distance  140 non-null    float64
dtypes: float64(9), object(6)
memory usage: 17.5+ KB
None
Index(['park_name', 'foul', 'fop', 'level', 'home_plate

Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,distances,max_distance,min_distance,avg_distance,median_distance
50,Grandville Calvin Christian HS,"[(-85.7397914, 42.8969115), (-85.7398189, 42.8...","[(-85.7397914, 42.8969115), (-85.7385944, 42.8...",High School,"(-85.7397914, 42.8969115)",26719.782978,91339.323263,118059.106241,0.226325,3.418416,"[320.0554765288528, 320.07221491837936, 320.09...",372.445736,304.197134,342.133257,342.693775


In [24]:
## Reverse the order of the tuples within the coordinate columns (foul, fop, home_plate)
def reverse_tuples(coords):
    return [(coord[1], coord[0]) for coord in coords]

df['foul'] = df['foul'].apply(reverse_tuples)
df['fop'] = df['fop'].apply(reverse_tuples)

# Reverse the home plate coordinates single tuple
df['home_plate'] = df['home_plate'].apply(lambda coord: (coord[1], coord[0]))



gdf = df.copy()

In [25]:
gdf.sample()

Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,distances,max_distance,min_distance,avg_distance,median_distance
63,Hillman HS,"[(45.0441796, -83.9033908), (45.0450608, -83.9...","[(45.0441796, -83.9033908), (45.0450608, -83.9...",High School,"(45.0441796, -83.9033908)",28247.163518,89762.303543,118009.467061,0.239364,3.177746,"[321.474149278946, 321.48313051617424, 321.495...",350.844496,319.05898,337.679217,340.62645


In [26]:
## stop timer and display time
end_time = time.time()

print(f"Total time: {end_time - start_time} seconds")

Total time: 5.13925576210022 seconds


In [27]:
# Create a geometry column from the 'fop' column so It can be used as a geodataframe
gdf['geometry'] = gdf['fop'].apply(lambda coords: Polygon(coords))

In [28]:
gdf.columns

## print two values from the geometry column
print(gdf['geometry'][0])
print(gdf['geometry'][1])

POLYGON ((42.3966942 -83.1678186, 42.3967249 -83.1665385, 42.3967827 -83.1665432, 42.3968426 -83.1665552, 42.3969035 -83.1665733, 42.3969565 -83.1665901, 42.3970026 -83.1666075, 42.3970531 -83.1666344, 42.3971041 -83.1666632, 42.3971477 -83.1666967, 42.3972046 -83.166739, 42.3972541 -83.1667859, 42.3973027 -83.1668382, 42.3973527 -83.1668986, 42.3973913 -83.1669489, 42.3974304 -83.1670126, 42.3974606 -83.1670635, 42.3974903 -83.1671212, 42.3975186 -83.1671775, 42.3975384 -83.1672265, 42.3975582 -83.1672774, 42.397575 -83.1673297, 42.3975869 -83.167374, 42.3976017 -83.1674303, 42.3976131 -83.167484, 42.3976206 -83.1675249, 42.39763 -83.1675725, 42.3976354 -83.1676147, 42.3976409 -83.1676576, 42.3976438 -83.1677005, 42.3976453 -83.1677622, 42.3976478 -83.1678152, 42.3976484 -83.1678776, 42.3966942 -83.1678186))
POLYGON ((41.9091676 -84.0416584, 41.9091846 -84.0405493, 41.9092385 -84.0405466, 41.9093163 -84.0405533, 41.9093951 -84.0405574, 41.909486 -84.0405614, 41.9095858 -84.0405654, 41

In [29]:
### FUNCTIONS TO COMPARE THE FIELD DIMENSIONS TO EAACH OTHER
from ast import literal_eval
from shapely import affinity
from math import atan2


# Function to calculate shape complexity
def calculate_shape_complexity(polygon):
    hull = polygon.convex_hull
    return 1 - polygon.area / hull.area

# Function to calculate elongation
def calculate_elongation(polygon):
    return 4 * np.pi * polygon.area / (polygon.length ** 2)

# Function to calculate orientation
def calculate_orientation(polygon):
    # Create a bounding rectangle and rotate it to align with the x-axis
    rect = polygon.minimum_rotated_rectangle
    x, y = rect.exterior.coords.xy

    # find the orientation of the rectangle
    edges = np.array([np.diff(x), np.diff(y)]).T
    edge_lengths = np.sqrt(np.sum(edges**2, axis=1))
    longest_edge_index = np.argmax(edge_lengths)
    longest_edge = edges[longest_edge_index]
    
    # angle of the longest edge with respect to the x-axis
    angle = atan2(longest_edge[1], longest_edge[0])

    # rotate the polygon by the angle in degrees
    rotated_rect = affinity.rotate(rect, -angle, origin='centroid', use_radians=True)

    # Find the difference in x coordinates
    x_diff = rotated_rect.bounds[2] - rotated_rect.bounds[0]

    # Find the difference in y coordinates
    y_diff = rotated_rect.bounds[3] - rotated_rect.bounds[1]

    # Calculate orientation angle in degrees
    return np.degrees(np.arctan2(y_diff, x_diff))



In [30]:
## ORIGINAL FUNCTION
# def calculate_shape_complexity(fence_polygon):
#     # Calculate the area of the fence
#     fence_area = fence_polygon.area

#     # Calculate the circumference of a circle with the same area
#     circle_circumference = 2 * pi * (fence_area / pi) ** 0.5

#     # Calculate the complexity as the ratio of the actual fence length to the circle circumference
#     complexity = fence_polygon.length / circle_circumference

#     return complexity

In [31]:

# Calculate shape complexity, elongation, and orientation
# Apply the functions to the GeoDataFrame
gdf['shape_complexity'] = gdf['geometry'].apply(calculate_shape_complexity)
gdf['elongation'] = gdf['geometry'].apply(calculate_elongation)
gdf['orientation'] = gdf['geometry'].apply(calculate_orientation)


In [32]:
# Describe the shape complexity, elongation, and orientation and show the results
print(gdf['shape_complexity'].describe())
print(gdf['elongation'].describe())
print(gdf['orientation'].describe())

# Display the GeoDataFrame
gdf.sample()

count    1.400000e+02
mean     1.102749e-03
std      2.311264e-03
min     -2.220446e-16
25%      1.916914e-04
50%      5.069609e-04
75%      1.066119e-03
max      2.213391e-02
Name: shape_complexity, dtype: float64
count    140.000000
mean       0.787524
std        0.028877
min        0.687954
25%        0.773190
50%        0.795074
75%        0.805381
max        0.855822
Name: elongation, dtype: float64
count    140.000000
mean      35.767464
std        1.353100
min       32.458717
25%       34.985359
50%       35.803862
75%       36.416309
max       41.608902
Name: orientation, dtype: float64


Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,distances,max_distance,min_distance,avg_distance,median_distance,geometry,shape_complexity,elongation,orientation
138,Wayne State Univ,"[(42.3564168, -83.0772235), (42.3571998, -83.0...","[(42.3564168, -83.0772235), (42.3567637, -83.0...",College,"(42.3564168, -83.0772235)",25306.645536,98298.905696,123605.551231,0.204737,3.884312,"[319.841633538307, 320.0453117324169, 320.2530...",420.712768,310.470166,357.813808,361.912975,"POLYGON ((42.3564168 -83.0772235, 42.3567637 -...",0.022134,0.742654,33.462492


# Description of new calculations

shape_complexity: This metric represents how complex the shape of the baseball field is compared to its convex hull. A convex hull is the smallest convex polygon that encloses all the points of the shape. The measure is calculated as 1 minus the ratio of the area of the polygon to the area of its convex hull. If the value is close to 0, it means that the shape is similar to its convex hull, i.e., it's fairly regular or "simple". If the value is larger, it means the shape is more complex or "irregular", compared to a convex polygon. The values can range between 0 and 1. In your dataset, the maximum value is ~0.74, which means there are some fairly complex shapes in the set.

elongation: This measure is based on how elongated the shape of the baseball field is. It is calculated as the ratio of the area of the polygon to the square of its perimeter, times 4π. If the shape is perfectly circular, the elongation will be 1. If the shape is more elongated or irregular, the value will be less than 1. In your dataset, the values range between ~0.12 and ~0.87, which means there are a variety of shapes, from somewhat circular to very elongated.

orientation: This metric represents the orientation angle of the shape's minimum rotated rectangle. The minimum rotated rectangle is the smallest rectangle that can enclose the shape, and can be oriented in any direction. The angle is calculated with respect to the x-axis. The values range between -90 to 90 degrees, where 0 means aligned with the x-axis. In your dataset, the values range between ~29.25 and ~44.82 degrees, which means that the minimum bounding rectangles of the baseball fields have a variety of different orientations, all within this range.

Please note that these are geometric measures and the actual interpretation can depend on the context. For instance, in the case of baseball fields, the elongation might give you an idea of whether the field is more circular or elongated (which could impact how the game is played), the shape complexity might hint at how "standard" or "irregular" the field is, and the orientation might tell you how the field is oriented geographically (which could impact factors like sun exposure).

In [33]:
#### Get a subset of fields and create the plots of fence distance witht he name of the field and the new calculated values

## will pull the graphing code from NCAA Book

In [34]:
### Get sample of 50 fields

df = gdf.sample(50)


## create the min max and mean fence distance rows
# Transpose the dataframe to get the 
transposed_df = pd.DataFrame(df['distances'].to_list()).transpose()

# Calculate min, max, mean, median, Q1 and Q3 for each row
min_fence_distances = transposed_df.min(axis=1)
max_fence_distances = transposed_df.max(axis=1)
mean_fence_distances = transposed_df.mean(axis=1)
median_fence_distances = transposed_df.median(axis=1)
## create profiles for standard deviation
std_fence_distances = transposed_df.std(axis=1)
first_fence_distances = mean_fence_distances + std_fence_distances
third_fence_distances = mean_fence_distances - std_fence_distances

# Create a new DataFrame to store these values
new_df = pd.DataFrame({
    'park_name': ['Min', 'Max', 'Mean', 'Median', 'Q1', 'Q3'],
    'distances': [
        min_fence_distances.tolist(), 
        max_fence_distances.tolist(),
        mean_fence_distances.tolist(),
        median_fence_distances.tolist(), # Add a comma here
        first_fence_distances.tolist(),
        third_fence_distances.tolist()
    ]
})

# For all other columns in the original DataFrame, add a column of NaN values in the new DataFrame
for column in df.columns:
    if column not in new_df.columns:
        new_df[column] = np.nan

# Concatenate the new DataFrame with the original one
df = pd.concat([df, new_df], ignore_index=True)

In [35]:
df.describe()

# df.tail()
# df.head()

Unnamed: 0,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,max_distance,min_distance,avg_distance,median_distance,shape_complexity,elongation,orientation
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,27853.892402,92025.640348,119879.532751,0.229929,3.51293,368.234662,313.566372,342.222192,342.645124,0.001008,0.790635,35.61279
std,7141.492619,8757.74738,12948.489469,0.04355,0.902636,20.147239,17.533478,15.446349,17.08019,0.00164,0.028244,1.402056
min,14202.987447,79128.149236,95787.436308,0.145504,2.200188,322.222126,281.412762,315.996117,314.138759,0.0,0.729279,32.458717
25%,23470.564813,87062.767129,112617.276113,0.202334,2.755464,351.687709,304.628672,333.403147,331.088955,0.000187,0.772966,34.916248
50%,26869.733785,90333.011095,118545.354083,0.224964,3.445203,368.405382,310.325799,339.995103,340.245567,0.000505,0.798091,35.696102
75%,32796.920145,95197.765152,125898.606055,0.26628,3.942331,384.624114,319.663565,348.318541,351.16805,0.000998,0.809394,36.354908
max,43132.874486,130343.678954,161845.668057,0.312482,5.872645,408.323717,401.367841,402.776088,402.912146,0.008232,0.839432,40.329047


In [36]:
# VERSION 1.0 - CREATE AND COMPARE TO AN IDEA FIELD (315 down the line 380 to center in an eaven arc)

## New block to generate an Ideal Field (for high school defined as 315 down the line 380 to center in an eaven arc)

import numpy as np

def generate_ideal_field(foul_pole_distance, center_field_distance, num_points=540):
    # Generate an array of angles from -45 to 45 degrees (representing the arc from one foul pole to the other)
    angles = np.linspace(-45, 45, num_points)

    # For each angle, calculate the ideal distance using the formulas provided
    ideal_distances = [foul_pole_distance if abs(angle) == 45 else center_field_distance / np.cos(np.radians(angle)) for angle in angles]

    return ideal_distances

ideal_field = generate_ideal_field(315, 380)

def compute_field_score(actual_distances, ideal_distances):
    # Calculate the absolute difference between the actual and ideal distances at each point
    differences = np.abs(np.array(actual_distances) - np.array(ideal_distances))

    # Sum up these differences to get the total deviation
    total_deviation = np.sum(differences)

    # Normalize this score to a 0-10 scale. We will use a scale factor for this normalization,
    # which you might need to adjust based on what you consider a "maximum" deviation.
    # For example, if you consider a field with an average deviation of 100 feet at each point to be the "worst" field (score 0),
    # then the scale factor would be 100 * number_of_points.
    # You will need to adjust this based on your domain knowledge and the actual data.
    scale_factor = 100 * len(actual_distances)
    score = 10 * (1 - total_deviation / scale_factor)

    # Clamp the score between 0 and 10 just in case
    score = max(0, min(10, score))

    return score




In [37]:
df['score'] = df['distances'].apply(lambda distances: compute_field_score(distances, ideal_field))


In [38]:
#


In [39]:
df.sort_values(by='score', ascending=False, inplace=True)

df.head(15)

Unnamed: 0,park_name,foul,fop,level,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,distances,max_distance,min_distance,avg_distance,median_distance,geometry,shape_complexity,elongation,orientation,score
22,Detroit Osborn HS,"[(42.4331463, -83.0017481), (42.434248, -83.00...","[(42.4331463, -83.0017481), (42.4331608, -83.0...",High School,"(42.4331463, -83.0017481)",31501.989103,130343.678954,161845.668057,0.194642,4.137633,"[402.00696458138873, 401.9709800430426, 401.93...",404.024027,401.367841,402.776088,402.912146,"POLYGON ((42.4331463 -83.0017481, 42.4331608 -...",0.000192,0.751745,35.432066,6.274523
51,Max,,,,,,,,,,"[402.00696458138873, 401.9709800430426, 401.93...",,,,,,,,,6.244538
36,Kalamazoo,"[(42.2877622, -85.6092979), (42.288667, -85.60...","[(42.2877622, -85.6092979), (42.2877533, -85.6...",College,"(42.2877622, -85.6092979)",30713.962108,109701.812755,140415.774863,0.218736,3.571725,"[326.48960618887725, 326.952818143186, 327.419...",399.40226,326.489606,374.018916,379.521151,"POLYGON ((42.2877622 -85.6092979, 42.2877533 -...",0.000468,0.839432,36.150525,4.023378
47,Newaygo High School - high school,"[(43.4134089, -85.7783332), (43.4142847, -85.7...","[(43.4134089, -85.7783332), (43.4132336, -85.7...",High School,"(43.4134089, -85.7783332)",33769.976994,102012.423178,135782.400172,0.248707,3.020802,"[320.26978270274617, 320.53475685407255, 320.8...",389.281906,320.269783,361.625533,366.413595,"POLYGON ((43.4134089 -85.7783332, 43.4132336 -...",0.0,0.826569,34.697536,3.525508
54,Q1,,,,,,,,,,"[333.07442804670285, 333.127671283223, 333.185...",,,,,,,,,3.354373
49,Cornerstone Baseball Field,"[(42.9811225, -85.5975511), (42.9819309, -85.5...","[(42.9811225, -85.5975511), (42.9814919, -85.5...",College,"(42.9811225, -85.5975511)",21202.261213,102966.625318,124168.88653,0.170753,4.856398,"[324.7534180021622, 324.8554588890298, 324.961...",400.067905,323.943203,362.630633,369.000332,"POLYGON ((42.9811225 -85.5975511, 42.9814919 -...",0.0,0.786498,34.988947,3.277242
31,Michigan State - OldField,"[(42.7318544, -84.4885965), (42.732673, -84.48...","[(42.7318544, -84.4885965), (42.7322603, -84.4...",College,"(42.7318544, -84.4885965)",32425.22145,105560.742195,137985.963645,0.234989,3.255513,"[297.82281375003566, 298.3655499704606, 298.91...",408.323717,297.822814,368.112429,372.005767,"POLYGON ((42.7318544 -84.4885965, 42.7322603 -...",0.000393,0.80514,35.916305,3.25646
35,White Lake Lakeland HS,"[(42.6216853, -83.5172179), (42.6225942, -83.5...","[(42.6216853, -83.5172179), (42.6217504, -83.5...",High School,"(42.6216853, -83.5172179)",34882.054211,101974.755893,136856.810105,0.25488,2.923416,"[330.6111733270755, 330.6494491567888, 330.691...",390.045702,329.425056,360.059291,360.116629,"POLYGON ((42.6216853 -83.5172179, 42.6217504 -...",0.002503,0.795357,35.426171,3.156855
24,Schoolcraft HS,"[(42.1252956, -85.6318087), (42.1261978, -85.6...","[(42.1252956, -85.6318087), (42.1253135, -85.6...",High School,"(42.1252956, -85.6318087)",34431.515668,100642.031824,135073.547493,0.254909,2.922963,"[328.46129418953126, 328.4645397646765, 328.47...",389.69828,328.461294,359.460408,359.784607,"POLYGON ((42.1252956 -85.6318087, 42.1253135 -...",0.00135,0.808003,36.745089,3.105296
44,U of M,"[(42.2678568, -83.7420742), (42.2670088, -83.7...","[(42.2678568, -83.7420742), (42.2678355, -83.7...",College,"(42.2678568, -83.7420742)",19408.615773,100786.703042,120195.318816,0.161476,5.192885,"[319.05826076083855, 319.2128497580065, 319.37...",394.974989,309.464144,358.410441,366.688602,"POLYGON ((42.2678568 -83.7420742, 42.2678355 -...",0.008232,0.820295,35.972984,2.877466


In [40]:
# Set the output directory for the Outfield Fence Plots

output_dir = 'TEMP/graph/2/'

def plot_distances(df, row_index):
    # Get rows with 'Min', 'Max', 'Mean', 'Q1', 'Q3' in 'park_name'
    rows_to_plot = df[df['park_name'].isin(['Min', 'Max', 'Mean', 'Q1', 'Q3'])]
    
    # Get the row to be highlighted
    highlighted_row = df.loc[row_index]
    
    # Create a new figure
    fig = plt.figure(figsize=(10,8))
    fig.patch.set_facecolor('black')
    

## NOT NEEDED YET - NO COLOR COLUMN
    # # Apply the validate_color function to the color2 column
    # df['color2'] = df['color2'].apply(validate_color)
    
    
    # Loop over these rows and plot a line graph for each
    for index, row in rows_to_plot.iterrows():
        if row['park_name'] in ['Q1', 'Q3']: # If Q1 or Q3, plot thinner, dotted line
            plt.plot(row['distances'], linestyle='dotted', alpha=0.3, color='grey', label=row['park_name'])
        else:
            plt.plot(row['distances'], linestyle='dashed', alpha=0.5, label=row['park_name'])

        # Add text labels for Min, Max and Mean lines
        if row['park_name'] in ['Min', 'Max', 'Mean']:
            plt.text(len(row['distances'])-1, row['distances'][-1], row['park_name'], color='blue', va='center')

        ## Shade the standard green below the Min Line
        # Check if the current row is 'Min', if so, add shading
        if row['park_name'] == 'Min':
            plt.fill_between(range(len(row['distances'])), plt.ylim()[0], row['distances'], color='green', alpha=0.4)

        # # SHADES IN TEAM COLOR BELOW MIN LINE
        # if row['park_name'] == 'Min' and pd.notnull(highlighted_row['color1']):
        #     plt.fill_between(range(len(row['distances'])), row['distances'], color=highlighted_row['color1'], alpha=0.4)

        # Check if the current row is 'Max', if so, add shading
        if row['park_name'] == 'Max':
            plt.fill_between(range(len(row['distances'])), row['distances'], color='yellow', alpha=0.2)

        ## SHADE IN USING TEAM COLORS ABOVE MAX LINE
        if row['park_name'] == 'Max':
            plt.fill_between(range(len(row['distances'])), plt.ylim()[1], row['distances'], color='red', alpha=1.0)

            
    # Plot the highlighted row with a thicker line
    plt.plot(highlighted_row['distances'], linewidth=2, label=highlighted_row['park_name'])

    
    
    # Set the minimum and maximum values of y-axis
    plt.ylim([300, 420])

    # set the font for all the labels
    plt.rcParams['font.family'] = 'sans-serif'

    # Change y-axis labels and tick marks to be white
    plt.ylabel('Distance (feet)', color='white', size=14)
    plt.tick_params(axis='y', colors='white', size=12)



    # Hide x axis ticks
    plt.xticks([])



    # Add a main title and subtitle with the field name and host school
    plt.suptitle(f"{highlighted_row['park_name']}", color='black', fontsize=20, y=.25)
    # plt.title(f"({highlighted_row['host_school']})", color='black', fontsize=16, y=.05)

    # Add text with statistics for each field
    shape_complexity = highlighted_row['shape_complexity']
    if not np.isnan(shape_complexity):
        plt.annotate(f"Complexity Rating: {(shape_complexity)}", xy=(0.05, 0.90), xycoords='axes fraction', color="black", fontsize=20, weight='bold', ha="left", va="bottom")

    elongation = highlighted_row['elongation']
    if not np.isnan(elongation):
        plt.annotate(f"Elongation: {elongation}", xy=(0.05, 0.75), xycoords='axes fraction', color="blue", fontsize=16,  ha="left", va="bottom")

    orientation = highlighted_row['orientation']
    if not np.isnan(orientation):
        plt.annotate(f"Orientation: {orientation}", xy=(0.05, 0.85), xycoords='axes fraction', color="blue", fontsize=12, ha="left", va="bottom")
    if not np.isnan(highlighted_row['orientation']):
        plt.annotate(f"Orientation: {orientation}", xy=(0.05, 0.70), xycoords='axes fraction', color="blue", fontsize=16, weight='bold', ha="left", va="bottom")


    
    #Games: {highlighted_row['games']} HR / G: {highlighted_row['home_runs_per_game']} ", xy=(0.05, 0.95), xycoords='axes fraction', color="white", fontsize=16, ha="left", va="bottom")


  

    # Reverse the x-axis
    plt.gca().invert_xaxis()

    # Generate the file path
    file_path = os.path.join(output_dir, f"HR SCORE {highlighted_row['score']} - {highlighted_row['park_name']}.png")
    
    # Save the figure
    plt.savefig(file_path, transparent=False)


    # Close the figure to free up memory
    plt.close()

    # Return the file path
    return file_path

# Add a new column 'file_path' to the DataFrame to store the file paths
df['file_path'] = [plot_distances(df, i) for i in df.index]





In [41]:
# df.tail(15)