In [1]:
##### ETL NOTEBOOK FOR 2023 MHSAA TOURNEY SPECIFIC MAP

#### Adapted from ETL for JSON

## Dependencies and Setup
### Dependencies

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re
import time
import matplotlib.pyplot as plt

## Start timer
start_time = time.time()


In [2]:
## LOAD BLOCK###
#### Load data from kml file exported from Google Earth

file_path = ('data/kml/MHSAA_2023.kml') # file path to kml file


# Read the KML file
with open(file_path) as file:
    xml_data = file.read()

# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')
folders = soup.Document.Folder
# One nested <Folder> per field. Named `field_folders` so the builtin `list`
# is not shadowed for the rest of the notebook.
field_folders = folders.find_all('Folder')

# Rows to turn into the DataFrame, and the names of folders that could not be parsed
rows = []
failed = []

# Loop through the folders and extract the data
for folder in field_folders:
    # Resolve the name outside the try block so the error handler can always report it
    name_tag = folder.find('name')
    field_name = name_tag.text if name_tag is not None else '<unnamed folder>'

    try:
        if name_tag is None:
            raise ValueError('folder has no <name> tag')

        # The first <coordinates> element is read as the foul polygon and the
        # second as the field-of-play polygon; a folder with fewer than two
        # raises IndexError and is recorded as failed.
        coordinates = folder.find_all('coordinates')
        foul = coordinates[0].text
        fop = coordinates[1].text

        # Use the description tag for notes when there is one
        description = folder.find('description')
        notes = description.text if description is not None else None

        rows.append({
            'field': field_name,
            'foul': foul,
            'fop': fop,
            'notes': notes
        })

    except Exception as e:
        # Record the folder and keep going so one bad folder does not abort the load
        failed.append(field_name)
        print(f"Error processing folder: {field_name}. Error message: {str(e)}")

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows, columns=['field', 'foul', 'fop', 'notes'])


In [3]:
df.head()

Unnamed: 0,field,foul,fop,notes
0,Adams Butzel Complex,"\n\t\t\t\t\t\t\t\t-83.1678186,42.3966942,0 -83...","\n\t\t\t\t\t\t\t\t-83.1678186,42.3966942,0 -83...",
1,Adrian HS,"\n\t\t\t\t\t\t\t\t-84.0416584,41.9091676,0 -84...","\n\t\t\t\t\t\t\t\t-84.0416584,41.9091676,0 -84...",
2,Alcona HS,"\n\t\t\t\t\t\t\t\t-83.4068606,44.6597432,0 -83...","\n\t\t\t\t\t\t\t\t-83.4068606,44.6597432,0 -83...",tough treeline in center and left
3,Algonac High School,"\n\t\t\t\t\t\t\t\t-82.58239759999999,42.628620...","\n\t\t\t\t\t\t\t\t-82.58239759999999,42.628620...",
4,Allen Park High School,"\n\t\t\t\t\t\t\t\t-83.2273711,42.2455509,0 -83...","\n\t\t\t\t\t\t\t\t-83.2273711,42.2455509,0 -83...",


In [4]:
# Clean the new dataframe


# Create a copy of the original DataFrame
df_cleaned = df.copy()

# Remove new line and tab characters from every string value (coordinates and notes)
df_cleaned = df_cleaned.replace(r'[\n\t]', '', regex=True)

# Drop any duplicate rows (same field name), keeping the first occurrence
df_cleaned = df_cleaned.drop_duplicates(subset=['field'], keep='first')

# Drop any rows with empty fields.
# The previous `(df_cleaned != 0).all(1)` filter compared string/None values to
# the integer 0, which is always unequal, so it never removed anything. A row is
# unusable downstream when its name or either polygon is missing or blank, so
# that is what is tested here. `notes` is optional and is not checked.
required_columns = ['field', 'foul', 'fop']
is_blank = df_cleaned[required_columns].apply(
    lambda column: column.isna() | column.astype(str).str.strip().eq('')
)
df_cleaned = df_cleaned[~is_blank.any(axis=1)]

In [5]:
df_cleaned.info()
df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 143
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   field   140 non-null    object
 1   foul    140 non-null    object
 2   fop     140 non-null    object
 3   notes   7 non-null      object
dtypes: object(4)
memory usage: 5.5+ KB


Unnamed: 0,field,foul,fop,notes
0,Adams Butzel Complex,"-83.1678186,42.3966942,0 -83.1678776,42.397648...","-83.1678186,42.3966942,0 -83.1665385,42.396724...",
1,Adrian HS,"-84.0416584,41.9091676,0 -84.04166909999999,41...","-84.0416584,41.9091676,0 -84.0405493,41.909184...",
2,Alcona HS,"-83.4068606,44.6597432,0 -83.40803409999999,44...","-83.4068606,44.6597432,0 -83.40680159999999,44...",tough treeline in center and left
3,Algonac High School,"-82.58239759999999,42.6286202,0 -82.5813153999...","-82.58239759999999,42.6286202,0 -82.5826256,42...",
4,Allen Park High School,"-83.2273711,42.2455509,0 -83.2285244,42.245525...","-83.2273711,42.2455509,0 -83.22739919999999,42...",


In [6]:
##### Clean up polygon data and create a new home_plate column

def parse_coordinates(coord_string):
    """Turn a KML coordinate string into a list of 2-tuples of floats.

    The string is split on whitespace; each token is split on commas and only
    its first two values are kept (any third value is dropped).
    """
    points = []
    for token in coord_string.split():
        first_two = token.split(',')[:2]
        points.append(tuple(float(value) for value in first_two))
    return points

# Parse the raw 'foul' and 'fop' coordinate strings into lists of coordinate tuples
for polygon_column in ('foul', 'fop'):
    df_cleaned[polygon_column] = df_cleaned[polygon_column].apply(parse_coordinates)

# The home_plate location is the first coordinate pair of the parsed 'fop' polygon
df_cleaned['home_plate'] = df_cleaned['fop'].apply(lambda points: points[0])


In [7]:
############## AREA CALCULATION ##############


import pyproj
from shapely.geometry import Polygon
from shapely.ops import transform


def calculate_area(coords):
    """Return the area of the polygon described by ``coords`` in square feet.

    ``coords`` is handed to ``shapely.geometry.Polygon`` and treated as
    EPSG:4326 coordinates in x/y order (``always_xy=True``). The polygon is
    re-projected into a Lambert azimuthal equal-area projection centred on its
    own centroid, measured in square metres, then multiplied by 10.764 to give
    square feet.
    """
    shape = Polygon(coords)
    center = shape.centroid

    # Equal-area projection centred on this polygon
    custom_projection = f"+proj=laea +lat_0={center.y} +lon_0={center.x} +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"
    to_laea = pyproj.Transformer.from_crs(
        pyproj.CRS("EPSG:4326"),
        pyproj.CRS(custom_projection),
        always_xy=True,
    )

    # Re-project every vertex, then measure the projected polygon
    projected = transform(lambda x, y: to_laea.transform(x, y), shape)

    # 1 square metre = 10.764 square feet
    return projected.area * 10.764



### Compute the area of each polygon and add it to the dataframe
for polygon_column, area_column in (('foul', 'foul_area_sqft'), ('fop', 'fop_area_sqft')):
    df_cleaned[area_column] = df_cleaned[polygon_column].apply(calculate_area)

foul_sqft = df_cleaned['foul_area_sqft']
fop_sqft = df_cleaned['fop_area_sqft']

## Total area of the field (foul polygon + fop polygon)
df_cleaned['field_area_sqft'] = foul_sqft + fop_sqft
## Foul area as a fraction of the total field area
df_cleaned['foul_area_per'] = foul_sqft / df_cleaned['field_area_sqft']
## Fair to Foul Ratio (fop area divided by foul area)
df_cleaned['fair_to_foul'] = fop_sqft / foul_sqft


In [8]:
############# FENCE DISTANCE CALCULATION #############

from geopy.distance import great_circle
import numpy as np



def interpolate_points(start, end, length_ratio):
    """Return the point ``length_ratio`` of the way from ``start`` to ``end``.

    Both points are converted to NumPy arrays and interpolated linearly; the
    result is returned as a tuple (0 gives ``start``, 1 gives ``end``).
    """
    origin = np.array(start)
    offset = np.array(end) - origin
    return tuple(origin + offset * length_ratio)

def calculate_distances(home_plate, outfield_coords, num_points=540):
    """Sample the fence line at evenly spaced points and measure each from home plate.

    Parameters
    ----------
    home_plate : sequence
        ``(lon, lat)`` of home plate; it is swapped to ``(lat, lon)`` before
        being passed to ``great_circle``.
    outfield_coords : sequence of sequences
        Ordered ``(lon, lat)`` vertices. Every segment with an endpoint within
        ``1e-6`` of ``home_plate`` on both axes is ignored; the remaining
        segments form the line that is sampled.
    num_points : int, default 540
        Number of samples spread evenly (by great-circle length) along the line.

    Returns
    -------
    list of float
        ``num_points`` great-circle distances in feet, in vertex order.

    Raises
    ------
    ValueError
        If no usable segment remains, e.g. every segment touches home plate.
    """
    def is_same_point(point1, point2, tolerance=1e-6):
        return abs(point1[0] - point2[0]) < tolerance and abs(point1[1] - point2[1]) < tolerance

    home_plate_lat_lon = (home_plate[1], home_plate[0])
    distances = []

    # Collect the usable segments and the total length of the line
    total_length = 0
    segments = []
    for start, end in zip(outfield_coords, outfield_coords[1:]):
        if is_same_point(home_plate, start) or is_same_point(home_plate, end):
            continue
        segment_length = great_circle((start[1], start[0]), (end[1], end[0])).feet
        if segment_length == 0:
            # Repeated vertex: it adds no length and would cause a division by
            # zero when interpolating along it, so it is skipped.
            continue
        segments.append((start, end, segment_length))
        total_length += segment_length

    if not segments:
        raise ValueError(
            "outfield_coords contains no usable segment that does not touch home plate"
        )

    # Distance between equally spaced points (a single point sits at the line start)
    spacing = total_length / (num_points - 1) if num_points > 1 else 0.0

    # Walk along the line, interpolating each sample inside its segment
    current_length = 0
    segment_index = 0
    for _ in range(num_points):
        # Advance to the segment containing the current offset; never step past
        # the last segment, so rounding error is absorbed there.
        while segment_index < len(segments) - 1 and current_length > segments[segment_index][2]:
            current_length -= segments[segment_index][2]
            segment_index += 1

        start, end, segment_length = segments[segment_index]
        length_ratio = current_length / segment_length
        point = interpolate_points(start, end, length_ratio)
        distance = great_circle(home_plate_lat_lon, (point[1], point[0])).feet
        distances.append(distance)

        current_length += spacing

    return distances

# Sample the fence distances for each field (one list of distances per row)
df_cleaned['distances'] = df_cleaned.apply(
    lambda field: calculate_distances(field['home_plate'], field['fop']),
    axis=1,
)

# Summary statistics of each field's distance list
distance_summaries = {
    'max_distance': max,
    'min_distance': min,
    'avg_distance': lambda values: sum(values) / len(values),
    'median_distance': lambda values: np.median(values),
}
for summary_column, summarize in distance_summaries.items():
    df_cleaned[summary_column] = df_cleaned['distances'].apply(summarize)


In [9]:
######## CHECK BLOCK ########

## Check how long the distance list is for each row
df_cleaned['num_distances'] = df_cleaned['distances'].apply(len)

## Print the value counts for the 'num_distances' column
df_cleaned['num_distances'].value_counts()

540    140
Name: num_distances, dtype: int64

In [10]:
## Function to create ranks for each column

def rank_fields(df):
    """Add descending rank columns for the distance and area metrics.

    Each rank uses ``method='min'`` with ``ascending=False``, so the largest
    value gets rank 1 and ties share the lowest rank. The columns are added to
    ``df`` in place and the same DataFrame is returned.

    Note: ``fop_area_per_rank`` is computed from ``fop_area_sqft``.
    """
    # rank column -> column it is computed from
    rank_sources = {
        'max_distance_rank': 'max_distance',
        'min_distance_rank': 'min_distance',
        'avg_distance_rank': 'avg_distance',
        'median_distance_rank': 'median_distance',
        'field_area_rank': 'field_area_sqft',
        'foul_area_rank': 'foul_area_sqft',
        'fop_area_per_rank': 'fop_area_sqft',
        'ratio_rank': 'fair_to_foul',
    }
    for rank_column, source_column in rank_sources.items():
        df[rank_column] = df[source_column].rank(method='min', ascending=False)

    return df



In [11]:
## Run Function

df_cleaned = rank_fields(df_cleaned)

In [12]:
#### Orienting the map to the home plate location ####

### Find the center of the field
def calculate_centroid(coords):
    """Return the arithmetic mean of the first two components of ``coords``.

    This is the average of the vertices, not an area-weighted centroid.
    An empty ``coords`` raises ZeroDivisionError.
    """
    count = len(coords)
    total_x = sum(point[0] for point in coords)
    total_y = sum(point[1] for point in coords)
    return (total_x / count, total_y / count)


## Find the bearing between the home plate and the center of the field
import math

def calculate_bearing(point1, point2):
    """Return the initial bearing from ``point1`` to ``point2`` in degrees.

    Index 0 of each point is read as longitude and index 1 as latitude (both
    in degrees). The result is normalized to the range [0, 360), with 0 being
    north and 90 being east.
    """
    lat_from = math.radians(point1[1])
    lon_from = math.radians(point1[0])
    lat_to = math.radians(point2[1])
    lon_to = math.radians(point2[0])

    lon_delta = lon_to - lon_from

    east = math.cos(lat_to) * math.sin(lon_delta)
    north = math.cos(lat_from) * math.sin(lat_to) - math.sin(lat_from) * math.cos(lat_to) * math.cos(lon_delta)

    # atan2 yields (-180, 180]; shift into [0, 360)
    return (math.degrees(math.atan2(east, north)) + 360) % 360

### Function to classify direction in layman's terms: North, South, East, West, etc.
def degrees_to_cardinal_direction(degrees):
    """Map a bearing in degrees to the nearest of the eight compass points.

    The bearing is first normalized modulo 360, so negative values and values
    above 360 give the right label instead of a wrong one or an IndexError.
    Bearings already in [0, 360] map exactly as before.
    """
    # 'N' appears at both ends so values that round up to 360 still resolve to north
    directions = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N']
    index = int(round((degrees % 360) / 45))
    return directions[index]


In [13]:
# Mean of the 'fop' vertices for each row, skipping the first vertex (used as home plate above)
df_cleaned['fop_centroid'] = df_cleaned['fop'].apply(
    lambda vertices: calculate_centroid(vertices[1:])
)

# Bearing from home plate to that centroid, one value per row
df_cleaned['field_orientation'] = [
    calculate_bearing(plate, centroid)
    for plate, centroid in zip(df_cleaned['home_plate'], df_cleaned['fop_centroid'])
]

# Compass label for each bearing
df_cleaned['field_cardinal_direction'] = df_cleaned['field_orientation'].apply(degrees_to_cardinal_direction)

# rename 'field' to 'park_name'
df_cleaned.rename(columns={'field': 'park_name'}, inplace=True)





In [14]:
## Need to rename dataframe to df for this block 

df = df_cleaned

### Get the Altitiude of each field as well as city and state
### This block will take a while to run, can process about 2 seconds per record

## Get Altitudes of the ballparks
import requests
import pandas as pd
import time
from tqdm import tqdm
from geopy.geocoders import Nominatim

# Read the Google Maps API key from the environment instead of hard-coding it.
# A key that has been saved in a notebook should be treated as leaked: revoke it
# in the Google Cloud console and issue a new one.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )

# Get the altitude of a location from its latitude and longitude
def get_altitude(lat, lon, timeout=10):
    """Return the ``elevation`` value of the first Google Elevation API result.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location.
    timeout : float, default 10
        Seconds to wait for the HTTP response, so a stalled connection cannot
        hang the whole lookup loop.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    RuntimeError
        If the response contains no results. The message carries the API
        ``status`` field but never the request URL, which holds the key.
    """
    query = f'https://maps.googleapis.com/maps/api/elevation/json?locations={lat},{lon}&key={api_key}'
    response = requests.get(query, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    results = payload.get('results')
    if not results:
        raise RuntimeError(
            f"Elevation lookup returned no results for ({lat}, {lon}); status={payload.get('status')!r}"
        )
    return results[0]['elevation']

# Get the city and state of a location from its latitude and longitude
def get_city_state(lat, lon, timeout=10):
    """Reverse-geocode a location with the Google Geocoding API.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location.
    timeout : float, default 10
        Seconds to wait for the HTTP response.

    Returns
    -------
    tuple of (str, str)
        ``long_name`` of the first address component typed ``locality`` and of
        the first typed ``administrative_area_level_1``, taken from the first
        result. Either is ``''`` when no such component exists.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    RuntimeError
        If the response contains no results. The message never includes the
        request URL, which holds the key.
    """
    query = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={api_key}'
    response = requests.get(query, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    results = payload.get('results')
    if not results:
        raise RuntimeError(
            f"Geocoding lookup returned no results for ({lat}, {lon}); status={payload.get('status')!r}"
        )

    components = results[0]['address_components']
    city = next((item['long_name'] for item in components if 'locality' in item['types']), '')
    state = next((item['long_name'] for item in components if 'administrative_area_level_1' in item['types']), '')
    return city, state

# Accumulators for the looked-up values, one entry per row of the dataframe
altitudes, cities, states = [], [], []

# Query both services for every home plate location
for coords in tqdm(df['home_plate']):
    # home_plate holds the pair in the reverse order of the (lat, lon) arguments
    lat, lon = coords[1], coords[0]

    altitude = get_altitude(lat, lon)
    altitudes.append(altitude)

    city, state = get_city_state(lat, lon)
    cities.append(city)
    states.append(state)

    # Pause between rows to avoid hitting rate limits; adjust as needed
    time.sleep(1)

# Attach the collected values as new columns
df['altitude'] = altitudes
df['city'] = cities
df['state'] = states


 14%|█▍        | 20/140 [00:39<04:14,  2.12s/it]

### All the geo transformation should take place above this

## starting the process of matching in data from other sources

In [None]:
### sAVE THE CITY STATE AND ALTITUDE TO A CSV SO i CAN REFERENCE IT AND SKIP THE STEP

df.to_csv('data/2023_mhsaa_POST_LOOKUP.csv', index=False)


In [None]:
## Rename back to df_cleaned to continue with the following blocks
df_cleaned = df

# ## output to csv 
# df_cleaned.to_csv('data/fields_cleaned.csv', index=False)

In [None]:
# df_cleaned.info()

# # Load the host team info with nickname and team colors
path = 'data/MHSAA/2023_MHSAA_sites.csv'
df_hosts = pd.read_csv(path)

df_parks = df_cleaned

df_hosts.head()

park_df = df_parks
host_df = df_hosts
# df_hosts.info()

# # Merge the host team info with the field info
# df_cleaned = df_cleaned.merge(df_hosts, on='host_team', how='left')

In [None]:
# host_df.info()

## find the Lowell High School field
host_df[host_df['park_name'] == 'Lowell High School - high school']



# park_df.info()

## find the Lowell High School field
park_df[park_df['park_name'] == 'Lowell High School - high school']

In [None]:
### Simple Merge, should work because the park_name columns should match exactly
## Do not detroy any data
df_merged = park_df.merge(host_df, on='park_name', how='left')

In [None]:
## Drop index 1 and 137 (Osborn and Concordia) because they screw up the graphs
# `drop` removes rows by index *label*, so this only removes the intended parks
# while the merged frame keeps its current row order.
# NOTE(review): confirm that labels 1 and 137 really are Osborn and Concordia in
# df_merged; dropping by `park_name` would survive changes to the input data.
df_merged = df_merged.drop([1,137])


In [None]:
## Drop Osbourn - MAX Outlier - plot is correct it is just a strange fenceless field
# df_merged.drop([28], inplace=True)

# Drop AA Greenhills because something is wrong witht the plot
# Division is hosted at Concordia University AA - I have that ploted but it is not apearing in the data
# df_merged.drop([139], inplace=True) 

## Reset index
df_merged.reset_index(drop=True, inplace=True)




In [None]:
df_merged.info()

In [None]:
# Rename to use next block

df = df_merged

### Updated to create Standard Deviation +/- Lines

## Create the summary fence-distance profile rows (min, max, mean, median, mean +/- std)
# Build one column per park and one row per sample position along the fence, so
# the axis=1 reductions below summarize all parks at each position.
transposed_df = pd.DataFrame(df['distances'].to_list()).transpose()

# Calculate min, max, mean and median across parks at each sample position
min_fence_distances = transposed_df.min(axis=1)
max_fence_distances = transposed_df.max(axis=1)
mean_fence_distances = transposed_df.mean(axis=1)
median_fence_distances = transposed_df.median(axis=1)
## Create profiles one standard deviation above and below the mean
std_fence_distances = transposed_df.std(axis=1)
first_fence_distances = mean_fence_distances + std_fence_distances
third_fence_distances = mean_fence_distances - std_fence_distances

# Create a new DataFrame to store these values.
# The rows labelled 'Q1' and 'Q3' hold mean + std and mean - std respectively;
# they are not quartiles.
new_df = pd.DataFrame({
    'park_name': ['Min', 'Max', 'Mean', 'Median', 'Q1', 'Q3'],
    'distances': [
        min_fence_distances.tolist(), 
        max_fence_distances.tolist(),
        mean_fence_distances.tolist(),
        median_fence_distances.tolist(), # 'Median' profile
        first_fence_distances.tolist(),
        third_fence_distances.tolist()
    ]
})

# For all other columns in the original DataFrame, add a column of NaN values in the new DataFrame
for column in df.columns:
    if column not in new_df.columns:
        new_df[column] = np.nan

# Concatenate the new DataFrame with the original one (summary rows go last, index is renumbered)
df = pd.concat([df, new_df], ignore_index=True)

In [None]:
# ## Add Osbourn back to the end of the dataframe
# df = df.append(park_df.iloc[1])
# df = df.append(park_df.iloc[137])

# # df.info()

In [None]:
# ## Merge the display names into the dataframe

# ## Load the display names from csv
# path = 'data/MHSAA/2023_district_teams.csv'
# places_df = pd.read_csv(path)

# places_df.head()

In [None]:
# ## Rename District to district
# places_df.rename(columns={'District': 'district'}, inplace=True)

# ## Merge the display names into the dataframe
# ## merged_df column District, places_df column district
# df_merged = df_merged.merge(places_df, on='district', how='left')







In [None]:
### THIS BLOCK CREATES THE RANKING OF PITCHER VS HITTER FRIENDLY FIELDS
def rank_fields(data):
    """Score every field from its standardized metrics and sort by that score.

    Each weighted column is z-scored (mean subtracted, divided by the standard
    deviation), multiplied by its weight, and the products are summed into
    ``score``. The score is written back onto ``data`` in place; the return
    value is a copy of ``data`` sorted by ``score`` in descending order, which
    the code treats as most hitter-friendly first.

    NOTE(review): this reuses the name of the earlier rank-column helper
    ``rank_fields(df)``, so once this cell runs that helper is no longer
    reachable.
    """
    # Sign of each metric's contribution to the score.
    # NOTE(review): 'min_distance' is the only positive weight, so a longer
    # shortest fence raises the score even though a higher score is meant to be
    # more hitter-friendly - confirm the intended sign.
    # NOTE(review): 'fair_to_foul' is computed elsewhere in this notebook as fop
    # area / foul area, so a larger ratio means relatively less foul territory -
    # confirm the intended sign.
    weights = {
        'max_distance': -1,
        'min_distance': 1,
        'avg_distance': -1,
        'median_distance': -1,
        'field_area_sqft': -1,
        'fair_to_foul': -1,
        'foul_area_sqft': -1,
        'fop_area_sqft': -1,
    }

    # Standardize each weighted column on a copy so the raw values survive
    standardized_data = data.copy()
    for column in weights:
        values = standardized_data[column]
        standardized_data[column] = (values - values.mean()) / values.std()

    # Weighted sum of the standardized metrics, accumulated in weight order
    total = 0
    for param, weight in weights.items():
        total = total + standardized_data[param] * weight
    standardized_data['score'] = total

    # Save scores to the caller's dataframe
    data['score'] = standardized_data['score']

    # Highest score first
    return data.sort_values('score', ascending=False)

# Suppose 'df' is your DataFrame containing the field data
ranked_fields = rank_fields(df)
print(ranked_fields[['park_name', 'score']])


In [None]:
ranked_fields.info()
merged_df = ranked_fields

In [None]:
df = merged_df

In [None]:

import webcolors

# Assuming df is your DataFrame and it has columns 'color1' and 'color2'

# Colour names used in the source data that webcolors does not recognise
custom_colors = {
    'Maize': '#F2C649',
    'Columbia Blue': '#C4D8E2',
    'Carolina Blue': '#56A0D3',
    'Cardinal': '#C41E3A',
    'Burgundy': '#800020',
    'Forrest Green': '#18453B',
    'Forest Green': '#18453B',
    'Columbia': '#C4D8E2',
    'Royal': '#4169e1',
    'Royal Blue': '#4169e1',
    'Vegas Gold': '#C5B358',
    'Navy Blue': '#000080'
}

# Matches a value that is already a 6-digit hex colour, e.g. '#C41E3A'
_HEX_COLOR_RE = re.compile(r'^#[0-9a-fA-F]{6}$')

def convert_to_hex(color_name):
    """Convert a colour name to a hex string.

    * Non-string input (e.g. NaN for a missing colour) -> ``'#000000'``.
    * A value that is already a 6-digit hex colour is returned unchanged, so
      re-running the cell does not turn converted colours green.
    * Otherwise surrounding whitespace is stripped and the name is looked up
      with ``webcolors.name_to_hex``, then in ``custom_colors``; an unknown
      name falls back to ``'#00FF00'``.
    """
    if not isinstance(color_name, str):
        return '#000000'  # default to black if color_name is not a string

    color_name = color_name.strip()
    if _HEX_COLOR_RE.match(color_name):
        return color_name

    try:
        return webcolors.name_to_hex(color_name)
    except ValueError:
        return custom_colors.get(color_name, '#00FF00')  # default to green if color name not recognized

# Convert color names to hex values.
# The columns are deliberately not cast with astype(str) first: that turned
# missing values into the string 'nan', which skipped the black default above
# and was rendered as "unrecognised" green instead.
for color_column in ('color1', 'color2'):
    df[color_column] = df[color_column].apply(convert_to_hex)



In [None]:
## Recreate the division_final and level columns

## division_final: the 'division' value, falling back to 'regional_div' where 'division' is null
df['division_final'] = df['division'].fillna(df['regional_div'])

## level: the highest stage for which the row has a value
##   4 -> 'finals' present
##   3 -> 'region_final_quarter' present
##   2 -> 'region_semi_number' present
##   1 -> 'district' present
##   0 -> none of the above
## np.select takes the first matching condition, so they are listed highest level first.
level_conditions = [
    df['finals'].notnull(),
    df['region_final_quarter'].notnull(),
    df['region_semi_number'].notnull(),
    df['district'].notnull(),
]
df['level'] = np.select(level_conditions, [4, 3, 2, 1], default=0)

    



In [None]:
# df.info()

## load the display_names csv
path = 'data/MHSAA/MHSAA_display_names.csv'

display_df = pd.read_csv(path)

display_df.head()

In [None]:
## match the park name column from the display_df to the park_name column in the df dataframe

df = df.merge(display_df, on='park_name', how='left')

In [None]:
df.describe()

In [None]:
## Set the output directory for the Outfield Fence Plots

output_dir = 'data/MHSAA/assets/plots/'

def plot_distances(df, row_index, y_min=270, y_max=420):
    """Plot one park's fence profile against the summary profiles and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'park_name', 'distances' and 'display_name', including the
        summary rows named 'Min', 'Max', 'Mean', 'Q1' and 'Q3'.
    row_index
        Index label of the row to highlight (looked up with ``df.loc``).
    y_min, y_max : float, default 270 and 420
        Limits of the y-axis; the park title is drawn at ``y_min``.

    Returns
    -------
    str
        Path of the saved PNG, ``<output_dir>/plot_<row_index>.png``.
    """
    # savefig does not create missing directories
    os.makedirs(output_dir, exist_ok=True)

    # Summary rows drawn on every plot, and the row to be highlighted
    rows_to_plot = df[df['park_name'].isin(['Min', 'Max', 'Mean', 'Q1', 'Q3'])]
    highlighted_row = df.loc[row_index]

    fig = plt.figure(figsize=(8,6))
    try:
        for _, row in rows_to_plot.iterrows():
            name = row['park_name']
            profile = row['distances']
            positions = range(len(profile))

            if name in ['Q1', 'Q3']:  # thinner, dotted grey line
                plt.plot(profile, linestyle='dotted', alpha=0.3, color='grey', label=name)
            else:
                plt.plot(profile, linestyle='dashed', alpha=0.5, label=name)

            # Text labels at the end of the Min, Max and Mean lines
            if name in ['Min', 'Max', 'Mean']:
                plt.text(len(profile)-1, profile[-1], name, color='blue', va='center')

            # Shade below the Min line
            if name == 'Min':
                plt.fill_between(positions, profile, color='green', alpha=0.2)

            # Shade below the Max line, and above it up to the top of the plot
            if name == 'Max':
                plt.fill_between(positions, profile, color='yellow', alpha=0.2)
                # Reach at least the final y-limit so the red band never stops
                # short of the top edge when autoscaling ended below y_max.
                band_top = max(plt.ylim()[1], y_max)
                plt.fill_between(positions, band_top, profile, color='red', alpha=0.2)

        # Plot the highlighted row with a thicker line
        plt.plot(highlighted_row['distances'], linewidth=2, label=highlighted_row['park_name'])

        # Set the minimum and maximum values of y-axis
        plt.ylim([y_min, y_max])

        # Change y-axis labels and tick marks to be white
        plt.ylabel('Distance (feet)', color='white')
        plt.tick_params(axis='y', colors='white')

        # Hide x axis ticks
        plt.xticks([])

        # Put the title inside the plot, centered, just above the x axis
        plt.text(len(highlighted_row['distances'])/2, y_min, highlighted_row['display_name'], ha='center', va='bottom', fontsize=16)

        # Reverse the x-axis
        plt.gca().invert_xaxis()

        # Generate the file path and save the figure
        file_path = os.path.join(output_dir, f"plot_{row_index}.png")
        plt.savefig(file_path)
    finally:
        # Always close the figure, even if plotting or saving fails
        plt.close(fig)

    return file_path

# Add a new column 'file_path' to the DataFrame to store the file paths
df['file_path'] = [plot_distances(df, i) for i in df.index]


In [None]:
df_merged = df

df_merged.info()

In [None]:
## Drop the rows with null values in the 'fop', 'foul' or 'home_plate' columns
# This removes the Min/Max/Mean/Median/Q1/Q3 summary rows, which were created
# with NaN in every column except 'park_name' and 'distances'.
# The drop is in place: any other name bound to this same DataFrame object
# (the previous cell assigns `df_merged = df`) sees the rows disappear too.
df.dropna(subset=['fop', 'foul', 'home_plate'], inplace=True)

# df.info()

In [None]:
## show me the fields without a display name

df_merged[df_merged['display_name'].isnull()]

In [None]:
### Output csv to check
df_merged.to_csv('data/MHSAA_FINAL_TEST.csv', index=False)

### OUTPUT JSON TO USE IN MAP
df_merged.to_json('data/html/mhsaa/data/map.json', orient='records')


# End Here
# The previous last line, `df_merged.iloc[]`, was a SyntaxError. A syntax error
# stops the whole cell from compiling, so the two exports above never ran.
# Show the first rows as a final sanity check instead.
df_merged.head()

In [None]:
## Find Lowell High School in final Dataframe

df_parks[df_parks['park_name'] == 'Lowell High School - high school']


## END BLOCK


In [None]:
# ## Do a FUZZY MATCH OF DF_HOSTS AND DF_PARKS
# # Debugging step to check for non-string values in host_teams
# for team in host_teams:
#     if not isinstance(team, str):
#         print(f"Non-string value found in host_teams: {team}")

# # Debugging step to check for non-string values in park_names
# for park in park_names:
#     if not isinstance(park, str):
#         print(f"Non-string value found in park_names: {park}")

# # Continue with fuzzy matching if no non-string values are found
# matches = [(team, process.extractOne(team, park_names)) for team in host_teams]


In [None]:
# ## List the values from the team column

# print(len(df_hosts['team'].unique()))
# df_hosts['team'].unique()

In [None]:
df_parks = df_cleaned

In [None]:
parks_df = df_cleaned.copy()

parks_df.info()
host_df.info()

In [None]:
def merge_dataframes(parks_df, host_df, min_score=90):
    """Fuzzy-match park names to host team names and merge the two frames.

    For every non-null ``parks_df.park_name``, ``match_team`` is asked for the
    best candidate among the unique non-null ``host_df.team`` values. From its
    use here, ``match_team(name, candidates, min_score)`` is expected to return
    a ``(matched_name, score)`` pair with ``matched_name == ""`` when nothing
    reaches ``min_score``.

    Returns
    -------
    (merged_df, unmatched_rows)
        ``merged_df`` is ``parks_df`` left-merged with the match table and then
        with ``host_df`` (parks without a match keep NaN host columns). It is
        ``None`` when no park matched at all. ``unmatched_rows`` lists the park
        names for which no match was found.
    """
    host_names = host_df.team.dropna().unique()

    matches = []
    unmatched_rows = []
    for name in parks_df.park_name.dropna():  # ignore NaN values
        match = match_team(name, host_names, min_score)

        # No match: remember the park name and move on
        if match[0] == "":
            unmatched_rows.append(name)
            continue

        matches.append({
            "park_name_parks": name,
            "match_name_host": match[0],
            "score": match[1],
        })

    # Check before building the match table: an empty table has no 'score'
    # column, so sorting it raised KeyError and this message was never reached.
    if not matches:
        print("No matches found.")
        return None, unmatched_rows

    # Keep only the highest-scoring match for each park name
    merge_table = (
        pd.DataFrame(matches)
        .sort_values('score', ascending=False)
        .drop_duplicates(['park_name_parks'], keep='first')
    )

    merged_df = pd.merge(parks_df, merge_table, left_on='park_name', right_on='park_name_parks', how='left')
    merged_df = pd.merge(merged_df, host_df, left_on='match_name_host', right_on='team', how='left')

    return merged_df, unmatched_rows


In [None]:
# merged_df = merged_df.sort_values('score', ascending=False).drop_duplicates(subset=['park_name', 'team'])


In [None]:
merged_df, unmatched_rows = merge_dataframes(parks_df, host_df, min_score=90)

merged_df.info()

In this function, we first match each park name in parks_df against the team names in host_df. If a match with a similarity score at or above the threshold (min_score, 90 in the call above) is found, we record the match in merge_table. If no match is found, we record the park name in unmatched_rows. After going through all park names, we merge parks_df and host_df based on the matches in merge_table.

The function merge_dataframes returns two objects. The first object, merged_df, is a DataFrame that contains the merged data. The second object, unmatched_rows, is a list of park names in parks_df for which no matching team in host_df could be found. You can inspect unmatched_rows to see which parks couldn't be matched.

In [None]:
def merge_dataframes(parks_df, host_df, min_score=90):
    """Fuzzy-match park names to host team names and merge the two frames.

    For every non-null ``parks_df.park_name``, ``match_team`` is asked for the
    best candidate among the unique non-null ``host_df.team`` values. From its
    use here, ``match_team(name, candidates, min_score)`` is expected to return
    a ``(matched_name, score)`` pair with ``matched_name == ""`` when nothing
    reaches ``min_score``.

    Returns
    -------
    (merged_df, unmatched_rows)
        ``merged_df`` is ``parks_df`` left-merged with the match table and then
        with ``host_df`` (parks without a match keep NaN host columns). It is
        ``None`` when no park matched at all. ``unmatched_rows`` lists the park
        names for which no match was found.
    """
    host_names = host_df.team.dropna().unique()

    matches = []
    unmatched_rows = []
    for name in parks_df.park_name.dropna():  # ignore NaN values
        match = match_team(name, host_names, min_score)

        # No match: remember the park name and move on
        if match[0] == "":
            unmatched_rows.append(name)
            continue

        matches.append({
            "park_name_parks": name,
            "match_name_host": match[0],
            "score": match[1],
        })

    # Check before building the match table: an empty table has no 'score'
    # column, so sorting it raised KeyError and this message was never reached.
    if not matches:
        print("No matches found.")
        return None, unmatched_rows

    # Keep only the highest-scoring match for each park name
    merge_table = (
        pd.DataFrame(matches)
        .sort_values('score', ascending=False)
        .drop_duplicates(['park_name_parks'], keep='first')
    )

    merged_df = pd.merge(parks_df, merge_table, left_on='park_name', right_on='park_name_parks', how='left')
    merged_df = pd.merge(merged_df, host_df, left_on='match_name_host', right_on='team', how='left')

    return merged_df, unmatched_rows



In [None]:
merged_df.info()

In [None]:
merged_df.head()




In [None]:
merged_df.head()

## WORKING HERE DOWN

# From here down are simple plots to do spot check of data and hold example of polar chart

### FILL IN THE REST OF JSON WITH THE DATA FOR THE 2023 TOURNEY

In [None]:
import os

# Build the file path using os.path.join
file_path = os.path.join('data', 'html', 'mhsaa', 'data', 'tourney_2023.json')

# Save the dataframe to JSON using the constructed file path
parks_df.to_json(file_path, orient='records')


In [None]:
from matplotlib import pyplot as plt

In [None]:
# Histograms of the max, min, average and median fence distance, one panel each

fig, ax = plt.subplots(2, 2, figsize=(12, 8))

# (grid position, column to plot, panel title)
panels = [
    ((0, 0), 'max_distance', 'Max Distance'),
    ((0, 1), 'min_distance', 'Min Distance'),
    ((1, 0), 'avg_distance', 'Average Distance'),
    ((1, 1), 'median_distance', 'Median Distance'),
]
for position, column, title in panels:
    ax[position].hist(df_cleaned[column], bins=20)
    ax[position].set_title(title)

plt.show()


In [None]:
## Compile a list of fields that are outliers

outlier_fields = df_cleaned[(df_cleaned['max_distance'] > 400) | (df_cleaned['min_distance'] < 200) | (df_cleaned['avg_distance'] > 400) | (df_cleaned['median_distance'] > 400)]

len(outlier_fields)

print(outlier_fields['park_name'].values)



In [None]:
### NEW WITH AUTO SCALING

def calculate_max_y(data, num_bins=36, level_filter=None):
    """Return the largest bin count of the orientation histogram.

    ``process_data(data, level_filter)`` supplies a mapping of orientation in
    degrees to count. Each orientation is assigned to one of ``num_bins``
    equal-width bins covering 360 degrees; an orientation that lands exactly on
    bin ``num_bins`` (i.e. 360) wraps around to bin 0.
    """
    counts = process_data(data, level_filter)

    # Accumulate the counts per bin
    histogram = np.zeros(num_bins)
    for orientation, count in counts.items():
        bin_index = int(orientation / (360 / num_bins))
        histogram[0 if bin_index == num_bins else bin_index] += count

    return max(histogram)


def create_polar_chart(data, num_bins=36, level_filter=None, y_min=-20, background_color='#2b2b2b', color_map=plt.cm.viridis, bar_alpha=0.8):
    """Draw a polar histogram of field orientations with an auto-scaled radius.

    Parameters
    ----------
    data
        Passed to ``process_data`` together with ``level_filter``; the result
        is used as a mapping of orientation (degrees) to count.
    num_bins : int
        Number of equal-width angular bins.
    level_filter
        Forwarded to ``process_data``.
    y_min : float
        Lower radial limit. A negative value leaves a hole in the middle of the
        chart, which is filled with a translucent black disc.
    background_color : str
        Face colour of the axes.
    color_map : callable
        Maps a value in [0, 1] to a colour; bars are coloured by their count
        relative to the tallest bar.
    bar_alpha : float
        Opacity of the bars.

    The upper radial limit is the tallest bin count plus 5. The chart is drawn
    clockwise with 0 degrees at the top.
    """
    count_by_orientation = process_data(data, level_filter)

    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)

    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:  # 360 degrees wraps around to the first bin
            idx = 0
        bin_counts[idx] += count

    bin_width = 2 * np.pi / num_bins
    # Tallest bar, taken from the histogram already computed above instead of
    # recomputing it through calculate_max_y
    peak = max(bin_counts)

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_direction(-1)
    ax.set_theta_offset(np.pi / 2)

    # Set dark background (the earlier '#808080' face colour was overwritten
    # immediately and has been removed)
    ax.set_facecolor(background_color)
    ax.set_rlabel_position(22.5)
    ax.set_ylim(y_min, peak + 5)  # Adjust based on max count

    # Fill the region below zero with a translucent black disc
    ax.bar(bin_edges[:-1], np.abs(ax.get_ylim()[0]), width=bin_width, bottom=0.0, color='k', alpha=0.3)

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=0)

    # Colour each bar by its height relative to the tallest bar. With no data
    # every count is 0; scale by 1 so the colour map is not fed 0/0 = NaN.
    scale = peak if peak > 0 else 1
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(color_map(r / scale))
        bar.set_alpha(bar_alpha)

    plt.show()


In [None]:
### Create a polar chart showing the direction of all the tournment fields


import numpy as np
import matplotlib.pyplot as plt

# create a function to process the data, counting the orientations and filtering by level.

from collections import defaultdict

def process_data(data, level_filter=None):
    """Count records per orientation rounded to whole degrees.

    ``data`` is iterated as records supporting ``record['level']`` and
    ``record['field_orientation']``. When ``level_filter`` is not None, only
    records whose 'level' equals it are counted. Returns a
    ``defaultdict(int)`` mapping rounded orientation to number of records.
    """
    tally = defaultdict(int)

    for record in data:
        if level_filter is not None and record['level'] != level_filter:
            continue
        tally[round(record['field_orientation'])] += 1

    return tally

def create_polar_chart(data, num_bins=36, level_filter=None, y_min=-20,
                       background_color='#2b2b2b', color_map=None, bar_alpha=0.8,
                       y_max=130):
    """Draw a polar histogram of field orientations.

    This definition replaces any ``create_polar_chart`` defined earlier in the
    notebook, so it accepts the same styling parameters in the same positions
    and adds ``y_max`` at the end. With the defaults it draws the same chart as
    the previous fixed-scale version (radial limits -20 to 130).

    Parameters
    ----------
    data
        Passed to ``process_data`` together with ``level_filter``; the result
        is used as a mapping of orientation (degrees) to count.
    num_bins : int
        Number of equal-width angular bins.
    level_filter
        Forwarded to ``process_data``.
    y_min : float
        Lower radial limit. A negative value leaves a hole in the middle of the
        chart, which is filled with a translucent black disc.
    background_color : str
        Face colour of the axes.
    color_map : callable or None
        Maps a value in [0, 1] to a colour; ``None`` selects ``plt.cm.viridis``.
    bar_alpha : float
        Opacity of the bars.
    y_max : float or None
        Upper radial limit. ``None`` auto-scales to the tallest bin plus 5.
    """
    if color_map is None:
        color_map = plt.cm.viridis

    count_by_orientation = process_data(data, level_filter)

    # Compute the histogram
    bin_edges = np.linspace(0.0, 2 * np.pi, num_bins + 1)
    bin_counts = np.zeros(num_bins)

    for orientation, count in count_by_orientation.items():
        idx = int(orientation / (360 / num_bins))
        if idx == num_bins:  # 360 degrees wraps around to the first bin
            idx = 0
        bin_counts[idx] += count

    bin_width = 2 * np.pi / num_bins
    peak = max(bin_counts)
    if y_max is None:
        y_max = peak + 5

    # Set plot size
    plt.figure(figsize=(10, 10))

    ax = plt.subplot(111, projection='polar')
    ax.set_theta_direction(-1)
    ax.set_theta_offset(np.pi / 2)

    # Set dark background (the earlier '#808080' face colour was overwritten
    # immediately and has been removed)
    ax.set_facecolor(background_color)
    ax.set_rlabel_position(22.5)
    ax.set_ylim(y_min, y_max)

    # Fill the region below zero with a translucent black disc
    ax.bar(bin_edges[:-1], np.abs(ax.get_ylim()[0]), width=bin_width, bottom=0.0, color='k', alpha=0.3)

    bars = ax.bar(bin_edges[:-1], bin_counts, width=bin_width, bottom=0)

    # Colour each bar by its height relative to the tallest bar. With no data
    # every count is 0; scale by 1 so the colour map is not fed 0/0 = NaN.
    scale = peak if peak > 0 else 1
    for r, bar in zip(bin_counts, bars):
        bar.set_facecolor(color_map(r / scale))
        bar.set_alpha(bar_alpha)

    plt.show()

In [None]:
create_polar_chart(data, num_bins=50, level_filter=None)