In [23]:
##### ETL NOTEBOOK FOR 2023 MHSAA TOURNEY SPECIFIC MAP

#### Adapted from ETL for JSON

## Dependencies and Setup
### Dependencies

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re
import time

## Start timer
start_time = time.time()


In [24]:
## LOAD BLOCK###
#### Load data from kml file exported from Google Earth

file_path = ('data/kml/MHSAA_2023.kml') # file path to kml file


# Read the KML file
with open(file_path) as file:
    xml_data = file.read()

# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')
folders = soup.Document.Folder
list = soup.Document.Folder.find_all('Folder')

# Create a list to store rows to append to the DataFrame
rows = []

# Loop through the folders and extract the data
for folder in list:
    try:
        field_name = folder.find('name').text
        foul = folder.find_all('coordinates')[0].text
        fop = folder.find_all('coordinates')[1].text
        notes = None

        # Check if there is a description tag, if so, use it for notes
        if folder.find('description') is not None:
            notes = folder.find('description').text

        row = {
            'field': field_name,
            'foul': foul,
            'fop': fop,
            'notes': notes
        }

        rows.append(row)

    except Exception as e:
        # Add name of folder to a list of failed folders
        failed.append(folder.find('name').text)
        print(f"Error processing folder: {folder.find('name').text}. Error message: {str(e)}")

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows, columns=['field', 'foul', 'fop', 'notes'])

# print('Failed to parse:', failed)


In [25]:
df.head()

Unnamed: 0,field,foul,fop,notes
0,Adams Butzel Complex,"\n\t\t\t\t\t\t\t\t-83.1678186,42.3966942,0 -83...","\n\t\t\t\t\t\t\t\t-83.1678186,42.3966942,0 -83...",
1,Adrian College,"\n\t\t\t\t\t\t\t\t-84.0697145,41.901861,0 -84....","\n\t\t\t\t\t\t\t\t-84.0697145,41.901861,0 -84....",
2,Adrian HS,"\n\t\t\t\t\t\t\t\t-84.0416584,41.9091676,0 -84...","\n\t\t\t\t\t\t\t\t-84.0416584,41.9091676,0 -84...",
3,Alcona HS,"\n\t\t\t\t\t\t\t\t-83.4068606,44.6597432,0 -83...","\n\t\t\t\t\t\t\t\t-83.4068606,44.6597432,0 -83...",tough treeline in center and left
4,Algonac High School,"\n\t\t\t\t\t\t\t\t-82.58239759999999,42.628620...","\n\t\t\t\t\t\t\t\t-82.58239759999999,42.628620...",


In [26]:
# Clean the new dataframe


# Create a copy of the original DataFrame
df_cleaned = df.copy()

# Remove new line and space characters from coordinates
df_cleaned = df_cleaned.replace(r'\n','', regex=True) 
df_cleaned = df_cleaned.replace(r'\t','', regex=True) 

# Drop any duplicate rows
df_cleaned = df_cleaned.drop_duplicates(subset=['field'], keep='first')

# Drop any rows with empty fields
df_cleaned = df_cleaned[(df_cleaned != 0).all(1)]

In [27]:
df_cleaned.info()
df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 143
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   field   140 non-null    object
 1   foul    140 non-null    object
 2   fop     140 non-null    object
 3   notes   7 non-null      object
dtypes: object(4)
memory usage: 5.5+ KB


Unnamed: 0,field,foul,fop,notes
0,Adams Butzel Complex,"-83.1678186,42.3966942,0 -83.1678776,42.397648...","-83.1678186,42.3966942,0 -83.1665385,42.396724...",
1,Adrian College,"-84.0697145,41.901861,0 -84.0703958,41.9026485...","-84.0697145,41.901861,0 -84.0687248,41.9023461...",
2,Adrian HS,"-84.0416584,41.9091676,0 -84.04166909999999,41...","-84.0416584,41.9091676,0 -84.0405493,41.909184...",
3,Alcona HS,"-83.4068606,44.6597432,0 -83.40803409999999,44...","-83.4068606,44.6597432,0 -83.40680159999999,44...",tough treeline in center and left
4,Algonac High School,"-82.58239759999999,42.6286202,0 -82.5813153999...","-82.58239759999999,42.6286202,0 -82.5826256,42...",


In [28]:
##### Clean up polygon data and create a new home_plate column

def parse_coordinates(coord_string):
    coords = coord_string.split()
    parsed_coords = [tuple(map(float, coord.split(',')[:2])) for coord in coords]
    return parsed_coords

# Create a new column for the home_plate location using the first set of coordinates in the 'fop' column
df_cleaned['home_plate'] = df_cleaned['fop'].apply(lambda x: parse_coordinates(x)[0])

# Apply the parse_coordinates function to the 'foul' and 'fop' columns
df_cleaned['foul'] = df_cleaned['foul'].apply(parse_coordinates)
df_cleaned['fop'] = df_cleaned['fop'].apply(parse_coordinates)


In [29]:
############## AREA CALCULATION ##############


import pyproj
from shapely.geometry import Polygon
from shapely.ops import transform


def calculate_area(coords):
    # Create a Polygon object from the coordinates
    polygon = Polygon(coords)

    # Calculate the centroid of the polygon
    centroid = polygon.centroid

    # Create a custom LAEA projection centered on the centroid
    custom_projection = f"+proj=laea +lat_0={centroid.y} +lon_0={centroid.x} +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"

    # Create a transformer for converting coordinates to the custom LAEA projection
    transformer = pyproj.Transformer.from_crs(
        pyproj.CRS("EPSG:4326"),  # WGS 84 (latitude and longitude)
        pyproj.CRS(custom_projection),  # Custom LAEA projection
        always_xy=True
    )

    # Define a function to transform coordinates using the transformer
    def transform_coordinates(x, y):
        return transformer.transform(x, y)

    # Convert the coordinates to the custom LAEA projection
    polygon_laea = transform(transform_coordinates, polygon)

    # Calculate the area in square meters
    area_sqm = polygon_laea.area

    # Convert the area to square feet (1 square meter = 10.764 square feet)
    area_sqft = area_sqm * 10.764

    return area_sqft



### Call Function and add to dataframe
df_cleaned['foul_area_sqft'] = df_cleaned['foul'].apply(calculate_area)
df_cleaned['fop_area_sqft'] = df_cleaned['fop'].apply(calculate_area)

## Calculate the total area of the field and the ratio of foul area to field area
df_cleaned['field_area_sqft'] = df_cleaned['foul_area_sqft'] + df_cleaned['fop_area_sqft']
## Percentage foul area
df_cleaned['foul_area_per'] = df_cleaned['foul_area_sqft'] / df_cleaned['field_area_sqft']
## Fair to Foul Ratio
df_cleaned['fair_to_foul'] = df_cleaned['fop_area_sqft'] / df_cleaned['foul_area_sqft']


In [30]:
############# FENCE DISTANCE CALCULATION #############

from geopy.distance import great_circle
import numpy as np

def interpolate_points(start, end, length_ratio):
    start_np = np.array(start)
    end_np = np.array(end)
    return tuple(start_np + (end_np - start_np) * length_ratio)

def calculate_distances(home_plate, outfield_coords, num_points=540):
    def is_same_point(point1, point2, tolerance=1e-6):
        return abs(point1[0] - point2[0]) < tolerance and abs(point1[1] - point2[1]) < tolerance

    home_plate_lat_lon = (home_plate[1], home_plate[0])
    distances = []

    # Calculate total line length
    total_length = 0
    segments = []
    for i in range(len(outfield_coords) - 1):
        start = outfield_coords[i]
        end = outfield_coords[i + 1]
        if not is_same_point(home_plate, start) and not is_same_point(home_plate, end):
            segment_length = great_circle((start[1], start[0]), (end[1], end[0])).feet
            segments.append((start, end, segment_length))
            total_length += segment_length

    # Calculate the distance between equally spaced points
    spacing = total_length / (num_points - 1)

    # Interpolate points and calculate distances
    current_length = 0
    segment_index = 0
    for i in range(num_points):
        while segment_index < len(segments) - 1 and current_length > segments[segment_index][2]:
            current_length -= segments[segment_index][2]
            segment_index += 1

        start, end, segment_length = segments[segment_index]
        length_ratio = current_length / segment_length
        point = interpolate_points(start, end, length_ratio)
        distance = round(great_circle(home_plate_lat_lon, (point[1], point[0])).feet)
        distances.append(distance)

        current_length += spacing

    return distances

# Calculate distances for each row
df_cleaned['distances'] = df_cleaned.apply(lambda row: calculate_distances(row['home_plate'], row['fop']), axis=1)

# Calculate max, min, and average distances for each row
df_cleaned['max_distance'] = df_cleaned['distances'].apply(max)
df_cleaned['min_distance'] = df_cleaned['distances'].apply(min)
df_cleaned['avg_distance'] = df_cleaned['distances'].apply(lambda distances: sum(distances) / len(distances))
# get the median distance
df_cleaned['median_distance'] = df_cleaned['distances'].apply(lambda distances: np.median(distances))


In [31]:
######## CHECK BLOCK ########

## Check how long the distance list is for each row
df_cleaned['num_distances'] = df_cleaned['distances'].apply(len)

## Print the value counts for the 'num_distances' column
df_cleaned['num_distances'].value_counts()

540    140
Name: num_distances, dtype: int64

In [32]:
######### NOT NECESSARY FOR THIS PROJECT ##########

# ### Get Geolocation of each field based on home plate coordinates and return state and country
# ### This block takes a long time to run - will need to revisit
# ## up to ten minutes

# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderTimedOut, GeocoderServiceError
# from tqdm import tqdm

# geolocator = Nominatim(user_agent="baseball_field_locator")

# import time

# def get_location_info(lng, lat):
#     try:
#         time.sleep(1)  # Delay for 1 second
#         location = geolocator.reverse((lat, lng), timeout=10)
#         city = location.raw['address'].get('city', None)
#         state = location.raw['address'].get('state', None)
#         return city, state
#     except GeocoderTimedOut:
#         print(f"GeocoderTimedOut error for coordinates: ({lng}, {lat})")
#         return None, None
#     except GeocoderServiceError:
#         print(f"GeocoderServiceError for coordinates: ({lng}, {lat})")
#         return None, None


# # Extract the first coordinate for each field
# df_cleaned['lng'], df_cleaned['lat'] = zip(*df_cleaned['home_plate'].apply(lambda x: x))

# # Wrap the DataFrame apply function with tqdm for progress indication
# tqdm.pandas(desc="Processing coordinates")

# # Get state and country information for each field
# df_cleaned[['city', 'state']] = df_cleaned.progress_apply(lambda row: get_location_info(row['lng'], row['lat']), axis=1, result_type='expand')


In [33]:
## Function to create ranks for each column

def rank_fields(df):
    # Calculate the rank for each category
    df['max_distance_rank'] = df['max_distance'].rank(ascending=False, method='min')
    df['min_distance_rank'] = df['min_distance'].rank(ascending=False, method='min')
    df['avg_distance_rank'] = df['avg_distance'].rank(ascending=False, method='min')
    df['median_distance_rank'] = df['median_distance'].rank(ascending=False, method='min')
    df['field_area_rank'] = df['field_area_sqft'].rank(ascending=False, method='min')
    df['foul_area_rank'] = df['foul_area_sqft'].rank(ascending=False, method='min')
    df['fop_area_per_rank'] = df['fop_area_sqft'].rank(ascending=False, method='min')
    df['ratio_rank'] = df['fair_to_foul'].rank(ascending=False, method='min')

    return df



In [34]:
## Run Function

df_cleaned = rank_fields(df_cleaned)

In [35]:
#### Orienting the map to the home plate location ####

### Find the center of the field
def calculate_centroid(coords):
    x_coords = [coord[0] for coord in coords]
    y_coords = [coord[1] for coord in coords]
    centroid_x = sum(x_coords) / len(coords)
    centroid_y = sum(y_coords) / len(coords)
    return (centroid_x, centroid_y)


## Find the bearing between the home plate and the center of the field
import math

def calculate_bearing(point1, point2):
    lat1, lon1 = math.radians(point1[1]), math.radians(point1[0])
    lat2, lon2 = math.radians(point2[1]), math.radians(point2[0])

    d_lon = lon2 - lon1

    x = math.cos(lat2) * math.sin(d_lon)
    y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(d_lon)

    bearing = math.degrees(math.atan2(x, y))
    bearing = (bearing + 360) % 360  # Normalize the bearing to the range [0, 360)

    return bearing

### Function to classify direction in laymans terms North, South, East, West, ect
def degrees_to_cardinal_direction(degrees):
    directions = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N']
    index = round(degrees / 45)
    return directions[index]


In [36]:
# Calculate the centroid of the outfield fence coordinates for each row
df_cleaned['fop_centroid'] = df_cleaned['fop'].apply(lambda coords: calculate_centroid(coords[1:]))

# Calculate the bearing between home plate and the centroid for each row
df_cleaned['field_orientation'] = df_cleaned.apply(lambda row: calculate_bearing(row['home_plate'], row['fop_centroid']), axis=1)

# Convert the bearing to a cardinal direction
df_cleaned['field_cardinal_direction'] = df_cleaned['field_orientation'].apply(degrees_to_cardinal_direction)

# rename 'field' to 'park_name'
df_cleaned.rename(columns={'field': 'park_name'}, inplace=True)





### All the geo transformation should take place above this

## starting the process of matching in data from other sources

In [37]:
# ## output to csv 
# df_cleaned.to_csv('data/fields_cleaned.csv', index=False)

In [38]:
# df_cleaned.info()

# # Load the host team info with nickname and team colors
path = 'data/MHSAA/2023_MHSAA_sites.csv'
df_hosts = pd.read_csv(path)

df_parks = df_cleaned

df_hosts.head()

park_df = df_parks
host_df = df_hosts
# df_hosts.info()

# # Merge the host team info with the field info
# df_cleaned = df_cleaned.merge(df_hosts, on='host_team', how='left')

In [39]:
# host_df.info()

## find the Lowell High School field
host_df[host_df['park_name'] == 'Lowell High School - high school']



# park_df.info()

## find the Lowell High School field
park_df[park_df['park_name'] == 'Lowell High School - high school']

Unnamed: 0,park_name,foul,fop,notes,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,min_distance_rank,avg_distance_rank,median_distance_rank,field_area_rank,foul_area_rank,fop_area_per_rank,ratio_rank,fop_centroid,field_orientation,field_cardinal_direction
81,Lowell High School - high school,"[(-85.3763871, 42.9571353), (-85.3768716, 42.9...","[(-85.3763871, 42.9571353), (-85.3752914, 42.9...",guestimate on the fenceline based on the mow p...,"(-85.3763871, 42.9571353)",29097.855748,99166.362021,128264.217769,0.226859,3.40803,...,29.0,20.0,19.0,24.0,56.0,19.0,72.0,"(-85.3761548818182, 42.95801321363636)",10.956002,N


In [40]:
### Simple Merge, should work because the park_name columns should match exactly
## Do not detroy any data
df_merged = park_df.merge(host_df, on='park_name', how='left')

In [41]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   park_name                 140 non-null    object 
 1   foul                      140 non-null    object 
 2   fop                       140 non-null    object 
 3   notes                     7 non-null      object 
 4   home_plate                140 non-null    object 
 5   foul_area_sqft            140 non-null    float64
 6   fop_area_sqft             140 non-null    float64
 7   field_area_sqft           140 non-null    float64
 8   foul_area_per             140 non-null    float64
 9   fair_to_foul              140 non-null    float64
 10  distances                 140 non-null    object 
 11  max_distance              140 non-null    int64  
 12  min_distance              140 non-null    int64  
 13  avg_distance              140 non-null    float64
 14  median_dis

In [42]:
## Drop the rows with null values in the 'fop', 'foul' or 'home_plate' columns
df_merged.dropna(subset=['fop', 'foul', 'home_plate'], inplace=True)

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   park_name                 140 non-null    object 
 1   foul                      140 non-null    object 
 2   fop                       140 non-null    object 
 3   notes                     7 non-null      object 
 4   home_plate                140 non-null    object 
 5   foul_area_sqft            140 non-null    float64
 6   fop_area_sqft             140 non-null    float64
 7   field_area_sqft           140 non-null    float64
 8   foul_area_per             140 non-null    float64
 9   fair_to_foul              140 non-null    float64
 10  distances                 140 non-null    object 
 11  max_distance              140 non-null    int64  
 12  min_distance              140 non-null    int64  
 13  avg_distance              140 non-null    float64
 14  median_dis

In [43]:
## Merge the display names into the dataframe

## Load the display names from csv
path = 'data/MHSAA/2023_district_teams.csv'
places_df = pd.read_csv(path)

places_df.head()

Unnamed: 0,Division,District,Host,Location,Teams
0,1,1,Marquette,North Marquette Fields,"['Alpena', 'Mount Pleasant', 'Traverse City Ce..."
1,1,2,Midland Dow,H H Dow High School - Baseball - Midland,"['Bay City Central', 'Bay City Western', 'Midl..."
2,1,3,Muskegon Mona Shores,Mona Shores Baseball Field (Baseball Field) - ...,"['Grand Haven', 'Grand Rapids Kenowa Hills', '..."
3,1,4,Grand Rapids Forest Hills Northern,FHN Stadium - Baseball - Grand Rapids,"['Cedar Springs', 'Grand Rapids Northview', 'G..."
4,1,5,Grandville,Grandville High School - Baseball,"['Byron Center', 'Grand Rapids Union', 'Jeniso..."


In [44]:
# ## Merge the display names into the dataframe
# ## merged_df column District, places_df column district
# df_merged = df_merged.merge(places_df, on='district', how='left')







In [45]:
df_merged.info()
df = df_merged

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   park_name                 140 non-null    object 
 1   foul                      140 non-null    object 
 2   fop                       140 non-null    object 
 3   notes                     7 non-null      object 
 4   home_plate                140 non-null    object 
 5   foul_area_sqft            140 non-null    float64
 6   fop_area_sqft             140 non-null    float64
 7   field_area_sqft           140 non-null    float64
 8   foul_area_per             140 non-null    float64
 9   fair_to_foul              140 non-null    float64
 10  distances                 140 non-null    object 
 11  max_distance              140 non-null    int64  
 12  min_distance              140 non-null    int64  
 13  avg_distance              140 non-null    float64
 14  median_dis

In [46]:
### THIS BLOCK CREATES THE RANKING OF PITCHER VS HITTER FRIENDLY FIELDS
def rank_fields(data):
    # Define weights for each parameter
    weights = {
        'max_distance': -1, # negative weight since longer fences favor pitchers
        'min_distance': 1,  # positive weight since shorter fences favor hitters
        'avg_distance': -1, # negative weight since longer fences favor pitchers
        'median_distance': -1, # negative weight since longer fences favor pitchers
        'field_area_sqft': -1,  # negative weight since larger fields favor pitchers
        'fair_to_foul': -1,  # negative weight since larger ratio (more foul territory) favors pitchers
        'foul_area_sqft': -1, # negative weight since larger foul area favors pitchers
        'fop_area_sqft': -1, # negative weight since larger out of play area favors pitchers
    }

    # Standardize features (subtract mean and divide by standard deviation)
    standardized_data = data.copy()
    for column in weights.keys():
        standardized_data[column] = (standardized_data[column] - standardized_data[column].mean()) / standardized_data[column].std()

    # Calculate score for each field
    standardized_data['score'] = standardized_data.apply(lambda row: sum(row[param] * weight for param, weight in weights.items()), axis=1)

    # Save scores to original dataframe
    data['score'] = standardized_data['score']

    # Rank fields based on score (higher scores are more hitter-friendly)
    ranked_fields = data.sort_values('score', ascending=False)

    return ranked_fields

# Suppose 'df' is your DataFrame containing the field data
ranked_fields = rank_fields(df)
print(ranked_fields[['park_name', 'score']])


                              park_name      score
61                              Hart HS   9.912756
20        Central Lake HS - high_school   9.753206
74                          Kingston HS   9.386839
85                            Martin HS   8.412362
35             East Jackson High School   7.839091
..                                  ...        ...
118      Sagniaw Valley State - college  -9.514021
1                        Adrian College  -9.516080
88   Michigan State - Old College Field -10.451569
72          Kalamazoo College - college -10.731596
30                    Detroit Osborn HS -16.961198

[140 rows x 2 columns]


In [47]:
ranked_fields.info()
merged_df = ranked_fields

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 61 to 30
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   park_name                 140 non-null    object 
 1   foul                      140 non-null    object 
 2   fop                       140 non-null    object 
 3   notes                     7 non-null      object 
 4   home_plate                140 non-null    object 
 5   foul_area_sqft            140 non-null    float64
 6   fop_area_sqft             140 non-null    float64
 7   field_area_sqft           140 non-null    float64
 8   foul_area_per             140 non-null    float64
 9   fair_to_foul              140 non-null    float64
 10  distances                 140 non-null    object 
 11  max_distance              140 non-null    int64  
 12  min_distance              140 non-null    int64  
 13  avg_distance              140 non-null    float64
 14  median_dis

In [48]:
df = merged_df

In [49]:

import webcolors

# Assuming df is your DataFrame and it has columns 'color1' and 'color2'

custom_colors = {
    'Maize': '#F2C649',
    'Columbia Blue': '#C4D8E2',
    'Carolina Blue': '#56A0D3',
    'Cardinal': '#C41E3A',
    'Burgundy': '#800020',
    'Forrest Green': '#18453B',
    'Forest Green': '#18453B',
    'Columbia': '#C4D8E2',
    'Royal': '#4169e1',
    'Royal Blue': '#4169e1',
    'Vegas Gold': '#C5B358',
    'Navy Blue': '#000080'
}

def convert_to_hex(color_name):
    if isinstance(color_name, str):  # Check if color_name is a string
        try:
            return webcolors.name_to_hex(color_name)
        except ValueError:
            return custom_colors.get(color_name, '#000000')  # default to black if color name not recognized
    else:
        return '#000000'  # default to black if color_name is not a string

# Convert the color columns to string and strip any trailing spaces
df['color1'] = df['color1'].astype(str).str.strip()
df['color2'] = df['color2'].astype(str).str.strip()

# Convert color names to hex values
df['color1'] = df['color1'].apply(convert_to_hex)
df['color2'] = df['color2'].apply(convert_to_hex)



In [50]:
## Recreate the division_final and level columns

## If division column is not null use that value as division_final. if it is null use the value in the regional_division column
df['division_final'] = df['division'].fillna(df['regional_div'])

## Create a level column based if the field hosts a district the value should be 1
## if region_semi_number is present assign level 2 and if region_final_number is present assign level 3
## if finals is present assign level 4
df['level'] = np.where(df['district'].notnull(), 1, 0)
df['level'] = np.where(df['region_semi_number'].notnull(), 2, df['level'])
df['level'] = np.where(df['region_final_quarter'].notnull(), 3, df['level'])
df['level'] = np.where(df['finals'].notnull(), 4, df['level'])

    



In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 61 to 30
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   park_name                 140 non-null    object 
 1   foul                      140 non-null    object 
 2   fop                       140 non-null    object 
 3   notes                     7 non-null      object 
 4   home_plate                140 non-null    object 
 5   foul_area_sqft            140 non-null    float64
 6   fop_area_sqft             140 non-null    float64
 7   field_area_sqft           140 non-null    float64
 8   foul_area_per             140 non-null    float64
 9   fair_to_foul              140 non-null    float64
 10  distances                 140 non-null    object 
 11  max_distance              140 non-null    int64  
 12  min_distance              140 non-null    int64  
 13  avg_distance              140 non-null    float64
 14  median_dis

In [52]:


df_merged = df


In [53]:
# ### Get me a list of every color name used in the color1 and color2 columns
# color_list = merged_df['color1'].append(merged_df['color2']).unique()
# color_list

In [54]:
### outpus csv to check
df_merged.to_csv('data/MHSAA_FINAL_TEST.csv', index=False)

### OUTPUT JSON TO USE IN MAP
df_merged.to_json('data/html/mhsaa/data/map.json', orient='records')


In [55]:
df_merged.head()

Unnamed: 0,park_name,foul,fop,notes,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,regional_div,region_final_quarter,finals,nickname,color1,color2,color3,score,division_final,level
61,Hart HS,"[(-86.3725369, 43.694431), (-86.3725369, 43.69...","[(-86.3725369, 43.694431), (-86.371409, 43.694...",,"(-86.3725369, 43.694431)",20362.716493,74084.002514,94446.719007,0.2156,3.638218,...,,,,Pirates,#ff0000,#ffffff,,9.912756,3.0,1
20,Central Lake HS - high_school,"[(-85.2690937, 45.0671277), (-85.2679149, 45.0...","[(-85.2690937, 45.0671277), (-85.269142, 45.06...",,"(-85.2690937, 45.0671277)",17031.960373,75950.501772,92982.462144,0.183174,4.459293,...,,,,Trojans,#4169e1,#ffa500,,9.753206,4.0,1
74,Kingston HS,"[(-83.1919884, 43.4105303), (-83.1919496, 43.4...","[(-83.1919884, 43.4105303), (-83.1930426, 43.4...",,"(-83.1919884, 43.4105303)",20220.684021,74843.976378,95064.660399,0.212705,3.701357,...,4.0,,,Cardinals,#ff0000,#000000,,9.386839,4.0,2
85,Martin HS,"[(-85.6333741, 42.5353963), (-85.6334251, 42.5...","[(-85.6333741, 42.5353963), (-85.6322717, 42.5...",,"(-85.6333741, 42.5353963)",25261.683727,78145.743178,103407.426905,0.244293,3.09345,...,,,,Clippers,#800000,#ffffff,,8.412362,4.0,1
35,East Jackson High School,"[(-84.343207, 42.2573129), (-84.3432365, 42.25...","[(-84.343207, 42.2573129), (-84.3420966, 42.25...",,"(-84.343207, 42.2573129)",33522.202877,77886.875612,111409.078489,0.300893,2.323441,...,,,,Trojans,#000080,#ffffff,,7.839091,4.0,1


In [56]:
df_merged.describe()

## Min and max values of score column
print(df_merged['score'].min())
print(df_merged['score'].max())

# lowest 10 scores with field names
df_merged.nsmallest(10, 'score')[['park_name', 'score']]

-16.96119754485339
9.91275550455552


Unnamed: 0,park_name,score
30,Detroit Osborn HS,-16.961198
72,Kalamazoo College - college,-10.731596
88,Michigan State - Old College Field,-10.451569
1,Adrian College,-9.51608
118,Sagniaw Valley State - college,-9.514021
25,CMU - college,-9.363523
38,Flushing HS,-7.540115
44,Gaylord High School - high school,-7.518583
99,Newaygo High School - high school,-7.256577
29,Cornerstone Baseball Field - college,-7.2564


In [57]:
df_merged[df_merged['park_name'] == 'Lowell High School - high school']

Unnamed: 0,park_name,foul,fop,notes,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,regional_div,region_final_quarter,finals,nickname,color1,color2,color3,score,division_final,level
81,Lowell High School - high school,"[(-85.3763871, 42.9571353), (-85.3768716, 42.9...","[(-85.3763871, 42.9571353), (-85.3752914, 42.9...",guestimate on the fenceline based on the mow p...,"(-85.3763871, 42.9571353)",29097.855748,99166.362021,128264.217769,0.226859,3.40803,...,,,,Red Arrows,#ff0000,#ffffff,,-5.20302,1.0,1


In [58]:
## Find Lowell High School in final Dataframe

df_parks[df_parks['park_name'] == 'Lowell High School - high school']


Unnamed: 0,park_name,foul,fop,notes,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,min_distance_rank,avg_distance_rank,median_distance_rank,field_area_rank,foul_area_rank,fop_area_per_rank,ratio_rank,fop_centroid,field_orientation,field_cardinal_direction
81,Lowell High School - high school,"[(-85.3763871, 42.9571353), (-85.3768716, 42.9...","[(-85.3763871, 42.9571353), (-85.3752914, 42.9...",guestimate on the fenceline based on the mow p...,"(-85.3763871, 42.9571353)",29097.855748,99166.362021,128264.217769,0.226859,3.40803,...,29.0,20.0,19.0,24.0,56.0,19.0,72.0,"(-85.3761548818182, 42.95801321363636)",10.956002,N


## END BLOCK


In [None]:
# ## Do a FUZZY MATCH OF DF_HOSTS AND DF_PARKS
# # Debugging step to check for non-string values in host_teams
# for team in host_teams:
#     if not isinstance(team, str):
#         print(f"Non-string value found in host_teams: {team}")

# # Debugging step to check for non-string values in park_names
# for park in park_names:
#     if not isinstance(park, str):
#         print(f"Non-string value found in park_names: {park}")

# # Continue with fuzzy matching if no non-string values are found
# matches = [(team, process.extractOne(team, park_names)) for team in host_teams]


In [None]:
# ## List the values from the team column

# print(len(df_hosts['team'].unique()))
# df_hosts['team'].unique()

In [None]:
df_parks = df_cleaned

In [None]:
parks_df = df_cleaned.copy()

parks_df.info()
host_df.info()

In [None]:
def merge_dataframes(parks_df, host_df, min_score=90):
    dict_list = []
    unmatched_rows = []
    # Drop NaN values in 'team' and 'park_name' before matching
    host_names = host_df.team.dropna().unique()
    for name in parks_df.park_name.dropna():  # ignore NaN values
        match = match_team(name, host_names, min_score)
        
        # If no match found, add to unmatched_rows and continue to next iteration
        if match[0] == "":
            unmatched_rows.append(name)
            continue

        dict_ = {}
        dict_.update({"park_name_parks" : name})
        dict_.update({"match_name_host" : match[0]})
        dict_.update({"score" : match[1]})
        dict_list.append(dict_)

    merge_table = pd.DataFrame(dict_list)
    
    # Remove duplicates in merge_table, keeping only the row with the highest score
    merge_table = merge_table.sort_values('score', ascending=False).drop_duplicates(['park_name_parks'], keep='first')

    if 'match_name_host' in merge_table.columns:
        merged_df = pd.merge(parks_df, merge_table, left_on='park_name', right_on='park_name_parks', how='left')
        merged_df = pd.merge(merged_df, host_df, left_on='match_name_host', right_on='team', how='left')
    else:
        print("No matches found.")
        merged_df = None

    return merged_df, unmatched_rows


In [None]:
# merged_df = merged_df.sort_values('score', ascending=False).drop_duplicates(subset=['park_name', 'team'])


In [None]:
merged_df, unmatched_rows = merge_dataframes(parks_df, host_df, min_score=90)

merged_df.info()

In this function, we first match each team name in host_df with the park names in parks_df. If a match with a similarity score greater than the threshold (85 in your case) is found, we record the match in merge_table. If no match is found, we record the team name in unmatched_rows. After going through all team names, we merge host_df and parks_df based on the matches in merge_table.

The function merge_dataframes returns two objects. The first object, merged_df, is a DataFrame that contains the merged data. The second object, unmatched_rows, is a list of team names in host_df for which no match in parks_df could be found. You can inspect unmatched_rows to see which rows couldn't be matched.

In [None]:
def merge_dataframes(parks_df, host_df, min_score=90):
    dict_list = []
    unmatched_rows = []
    # Drop NaN values in 'team' and 'park_name' before matching
    host_names = host_df.team.dropna().unique()
    for name in parks_df.park_name.dropna():  # ignore NaN values
        match = match_team(name, host_names, min_score)
        
        # If no match found, add to unmatched_rows and continue to next iteration
        if match[0] == "":
            unmatched_rows.append(name)
            continue

        dict_ = {}
        dict_.update({"park_name_parks" : name})
        dict_.update({"match_name_host" : match[0]})
        dict_.update({"score" : match[1]})
        dict_list.append(dict_)

    merge_table = pd.DataFrame(dict_list)
    
    # Remove duplicates in merge_table, keeping only the row with the highest score
    merge_table = merge_table.sort_values('score', ascending=False).drop_duplicates(['park_name_parks'], keep='first')

    if 'match_name_host' in merge_table.columns:
        merged_df = pd.merge(parks_df, merge_table, left_on='park_name', right_on='park_name_parks', how='left')
        merged_df = pd.merge(merged_df, host_df, left_on='match_name_host', right_on='team', how='left')
    else:
        print("No matches found.")
        merged_df = None

    return merged_df, unmatched_rows



In [None]:
merged_df.info()

In [None]:
merged_df.head()




In [None]:
merged_df.head()

In [None]:
merged_df.info()

## WORKING HERE DOWN

In [None]:
## Matching Function to compair host names to park names 

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def find_host_matches(host_df, parks_df):
    # Initialize empty lists to store the matches and unmatched park names
    matches = []
    unmatched_park_names = []

    # Iterate over each host and district in the host_df
    for host, district in zip(host_df['team'], host_df['district']):
        # Use fuzzy matching to find potential matches in park names
        potential_matches = process.extractBests(host, parks_df['park_name'], scorer=fuzz.token_set_ratio, score_cutoff=80)

        # Store the host, district, and potential matches with their scores
        matches.append({'host': host, 'district': district, 'potential_matches': potential_matches})

        # Check if any strong matches were found
        if len(potential_matches) > 0:
            max_score = max(potential_matches, key=lambda x: x[1])[1]
            if max_score >= 80:
                continue

        # If no strong matches were found, add the park name to unmatched list
        unmatched_park_names.append((host, district))

    # Create a dataframe from the matches list
    matches_df = pd.DataFrame(matches)

    # Count the number of strong matches and unmatched park names
    strong_matches_count = matches_df['potential_matches'].apply(lambda x: sum(match[1] >= 80 for match in x)).sum()
    unmatched_count = len(unmatched_park_names)

    print("Number of strong matches:", strong_matches_count)
    print("Number of unmatched park names:", unmatched_count)
    print("Unmatched park names:", unmatched_park_names)

    return matches_df




In [None]:
host_df.info()
parks_df.info()

In [None]:
# Call the function to find host matches
result_df = find_host_matches(host_df, parks_df)

# # Merge the result_df to messe_df on district number
# merged_df = result_df.merge(messe_df, left_on='district', right_on='district number')

# # Select the desired column'region_semi_numbers
# merged_df = merged_df[['host', 'division', 'district', 'region_semi_number', 'region_final_quarter', 'finals', 'potential_matches', 'Plot Note', 'Map Link MHSAA']]
# merged_df.head()
# # merged_df.info()

In [None]:
### Keep only the field name from the potential matches, ignore if the potential match is empty
merged_df['potential_matches'] = merged_df['potential_matches'].apply(lambda x: x[0][0] if len(x) > 0 else np.nan)


# merged_df['potential_matches'] = merged_df['potential_matches'].apply(lambda x: x[0][0])

# Rename the potential_matches column to park_name

merged_df.rename(columns={'potential_matches': 'park_name'}, inplace=True)


merged_df.head()
merged_df.info()
# # Merge the merged_df to the parks_df on park_name

# parks_df = parks_df.merge(merged_df, on='park_name', how='left')



In [None]:
## Merge the merged_df to the parks_df on park_name
parks_df = parks_df.merge(merged_df, on='park_name', how='left')

parks_df.head()


In [None]:
parks_df.info()
parks_df.head()

# host_df.info()
# host_df.head()

# From here down are simple plots to do spot check of data and hold example of polar chart

### FILL IN THE REST OF JSON WITH THE DATA FOR THE 2023 TOURNEY

In [None]:
### Pull the host team information into the parks_df
parks_df = parks_df.merge(host_df, on='host', how='left')

### Create columns for tournament levels
# District is done, Need regional semi, regional final, quarter final, final_four





parks_df.info()

In [None]:
import os

# Build the file path using os.path.join
file_path = os.path.join('data', 'html', 'mhsaa', 'data', 'tourney_2023.json')

# Save the dataframe to JSON using the constructed file path
parks_df.to_json(file_path, orient='records')


In [None]:
from matplotlib import pyplot as plt

In [None]:
# Histogram of the max distance, min distance, average distance, and median distance

fig, ax = plt.subplots(2, 2, figsize=(12, 8))

ax[0, 0].hist(df_cleaned['max_distance'], bins=20)

ax[0, 1].hist(df_cleaned['min_distance'], bins=20)

ax[1, 0].hist(df_cleaned['avg_distance'], bins=20)

ax[1, 1].hist(df_cleaned['median_distance'], bins=20)

ax[0, 0].set_title('Max Distance')
ax[0, 1].set_title('Min Distance')

ax[1, 0].set_title('Average Distance')
ax[1, 1].set_title('Median Distance')

plt.show()


In [None]:
## Compile a list of fields that are outliers

outlier_fields = df_cleaned[(df_cleaned['max_distance'] > 400) | (df_cleaned['min_distance'] < 200) | (df_cleaned['avg_distance'] > 400) | (df_cleaned['median_distance'] > 400)]

len(outlier_fields)

print(outlier_fields['park_name'].values)



In [None]:
# Create list of the top and bottom ten from each category

top_ten_max = df_cleaned.sort_values(by='max_distance', ascending=False).head(10)
top_ten_min = df_cleaned.sort_values(by='min_distance', ascending=True).head(10)

top_ten_avg = df_cleaned.sort_values(by='avg_distance', ascending=False).head(10)
top_ten_median = df_cleaned.sort_values(by='median_distance', ascending=False).head(10)

top_ten_field_area = df_cleaned.sort_values(by='field_area_sqft', ascending=False).head(10)

top_ten_foul_area = df_cleaned.sort_values(by='foul_area_sqft', ascending=False).head(10)

top_ten_fop_area = df_cleaned.sort_values(by='fop_area_sqft', ascending=False).head(10)

top_ten_ratio = df_cleaned.sort_values(by='fair_to_foul', ascending=False).head(10)

bottom_ten_ratio = df_cleaned.sort_values(by='fair_to_foul', ascending=True).head(10)

bottom_ten_max = df_cleaned.sort_values(by='max_distance', ascending=True).head(10)
bottom_ten_min = df_cleaned.sort_values(by='min_distance', ascending=False).head(10)
bottom_ten_avg = df_cleaned.sort_values(by='avg_distance', ascending=True).head(10)
bottom_ten_median = df_cleaned.sort_values(by='median_distance', ascending=True).head(10)


### Create and display a dataframe with columns for the top and bottom ten fields for each category

top_bottom_df = pd.DataFrame()

top_bottom_df['top_ten_max'] = top_ten_max['park_name'].values
top_bottom_df['top_ten_min'] = top_ten_min['park_name'].values
top_bottom_df['top_ten_avg'] = top_ten_avg['park_name'].values

top_bottom_df['top_ten_median'] = top_ten_median['park_name'].values
top_bottom_df['top_ten_field_area'] = top_ten_field_area['park_name'].values
top_bottom_df['top_ten_foul_area'] = top_ten_foul_area['park_name'].values
top_bottom_df['top_ten_fop_area'] = top_ten_fop_area['park_name'].values
top_bottom_df['top_ten_ratio'] = top_ten_ratio['park_name'].values

top_bottom_df['bottom_ten_ratio'] = bottom_ten_ratio['park_name'].values
top_bottom_df['bottom_ten_max'] = bottom_ten_max['park_name'].values
top_bottom_df['bottom_ten_min'] = bottom_ten_min['park_name'].values
top_bottom_df['bottom_ten_avg'] = bottom_ten_avg['park_name'].values
top_bottom_df['bottom_ten_median'] = bottom_ten_median['park_name'].values


top_bottom_df.head(10)




In [None]:
### NEW WITH AUTO SCALING

def calculate_max_y(data, num_bins=36, level_filter=None):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    return max(bin_counts)


def create_polar_chart(data, num_bins=36, level_filter=None, y_min=-20, background_color='#2b2b2b', color_map=plt.cm.viridis, bar_alpha=0.8):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    bin_width = 2 * np.pi / num_bins

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_direction(-1)
    ax.set_theta_offset(np.pi / 2)

    ax.set_facecolor('#808080')
    # ax.set_yscale('log')

    # Set dark background
    ax.set_facecolor(background_color)
    plt.gca().set_rlabel_position(22.5)
    y_max = calculate_max_y(data, num_bins=num_bins, level_filter=level_filter) + 5
    ax.set_ylim(y_min, y_max)  # Adjust based on max count

    # Add bars for negative values
    zero_height_bars = ax.bar(bin_edges[:-1], np.abs(ax.get_ylim()[0]), width=bin_width, bottom=0.0, color='k', alpha=0.3)

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=0)
    
    # Use custom colors and opacity
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(color_map(r / max(bin_counts)))
        bar.set_alpha(bar_alpha)

    plt.show()


In [None]:
##### CALL AUTO ADJUSTING CHART #####


## NEW PERAMS


# Call your function
create_polar_chart(
    data, 
    num_bins=30, 
    # level_filter="level1", 
    y_min=0, 
    background_color='#2b2b2b', 
    color_map=plt.cm.viridis, 
    bar_alpha=0.8
)


In [None]:
## NEW CHAT GPT CODE

def create_polar_chart(data, num_bins=36, level_filter=None, y_min=-20, y_max=130, background_color='#2b2b2b', color_map=plt.cm.viridis, bar_alpha=0.8):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    bin_width = 2 * np.pi / num_bins

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_direction(-1)
    ax.set_theta_offset(np.pi / 2)

    ax.set_facecolor('#808080')
    # ax.set_yscale('log')

    # Set dark background
    ax.set_facecolor(background_color)
    plt.gca().set_rlabel_position(22.5)
    ax.set_ylim(y_min, y_max)  # Adjust based on max count

    # Add bars for negative values
    zero_height_bars = ax.bar(bin_edges[:-1], np.abs(ax.get_ylim()[0]), width=bin_width, bottom=0.0, color='k', alpha=0.3)

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=0)
    
    # Use custom colors and opacity
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(color_map(r / max(bin_counts)))
        bar.set_alpha(bar_alpha)

    plt.show()


In [None]:
### Create a polar chart showing the direction of all the tournment fields


import numpy as np
import matplotlib.pyplot as plt

# create a function to process the data, counting the orientations and filtering by level.

from collections import defaultdict

def process_data(data, level_filter=None):
    count_by_orientation = defaultdict(int)
    
    for record in data:
        if level_filter is None or record['level'] == level_filter:
            orientation = round(record['field_orientation'])
            count_by_orientation[orientation] += 1

    return count_by_orientation

def create_polar_chart(data, num_bins=36, level_filter=None):
    count_by_orientation = process_data(data, level_filter)
    
    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)
    
    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:
            idx = 0
        bin_counts[idx] += count
    
    bin_width = 2 * np.pi / num_bins

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_direction(-1)
    ax.set_theta_offset(np.pi / 2)

    ax.set_facecolor('#808080')
    ###
    # ax.set_yscale('log')

    
    # # Set dark background
    ax.set_facecolor('#2b2b2b')
    plt.gca().set_rlabel_position(22.5)
    ax.set_ylim(-20, 130)  # Adjust based on max count

    # Add bars for negative values
    zero_height_bars = ax.bar(bin_edges[:-1], np.abs(ax.get_ylim()[0]), width=bin_width, bottom=0.0, color='k', alpha=0.3)

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=0)
    
    # Use custom colors and opacity
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(plt.cm.viridis(r / max(bin_counts)))
        # bar.set_facecolor(plt.cm.plasma(r / max(bin_counts)))
        bar.set_alpha(0.8)

    plt.show()

In [None]:
create_polar_chart(data, num_bins=50, level_filter=None)