In [1]:

## Dependencies and Setup
### Dependencies

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re
import time
import matplotlib.pyplot as plt




In [2]:
## LOAD BLOCK###
#### Load data from kml file exported from Google Earth

file_path = ('TEMP/clean_tables/geo_data/all_high_schools.kml') # file path to kml file


# Read the KML file
with open(file_path) as file:
    xml_data = file.read()

# Initialize soup variables for parsing file
soup = BeautifulSoup(xml_data, 'xml')
folders = soup.Document.Folder
list = soup.Document.Folder.find_all('Folder')

# Create a list to store rows to append to the DataFrame
rows = []

# Loop through the folders and extract the data
for folder in list:
    try:
        field_name = folder.find('name').text
        foul = folder.find_all('coordinates')[0].text
        fop = folder.find_all('coordinates')[1].text
        notes = None

        # Check if there is a description tag, if so, use it for notes
        if folder.find('description') is not None:
            notes = folder.find('description').text

        row = {
            'field': field_name,
            'foul': foul,
            'fop': fop,
            'notes': notes
        }

        rows.append(row)

    except Exception as e:
        # Add name of folder to a list of failed folders
        failed.append(folder.find('name').text)
        print(f"Error processing folder: {folder.find('name').text}. Error message: {str(e)}")

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows, columns=['field', 'foul', 'fop', 'notes'])

In [3]:
# Clean the new dataframe


# Create a copy of the original DataFrame
df_cleaned = df.copy()

# Remove new line and space characters from coordinates
df_cleaned = df_cleaned.replace(r'\n','', regex=True) 
df_cleaned = df_cleaned.replace(r'\t','', regex=True) 

# Drop any duplicate rows
df_cleaned = df_cleaned.drop_duplicates(subset=['field'], keep='first')

# Drop any rows with empty fields
df_cleaned = df_cleaned[(df_cleaned != 0).all(1)]

##### Clean up polygon data and create a new home_plate column

def parse_coordinates(coord_string):
    coords = coord_string.split()
    parsed_coords = [tuple(map(float, coord.split(',')[:2])) for coord in coords]
    return parsed_coords

# Create a new column for the home_plate location using the first set of coordinates in the 'fop' column
df_cleaned['home_plate'] = df_cleaned['fop'].apply(lambda x: parse_coordinates(x)[0])

# Apply the parse_coordinates function to the 'foul' and 'fop' columns
df_cleaned['foul'] = df_cleaned['foul'].apply(parse_coordinates)
df_cleaned['fop'] = df_cleaned['fop'].apply(parse_coordinates)



In [4]:
############# AREA CALCULATION ##############


import pyproj
from shapely.geometry import Polygon
from shapely.ops import transform


def calculate_area(coords):
    # Create a Polygon object from the coordinates
    polygon = Polygon(coords)

    # Calculate the centroid of the polygon
    centroid = polygon.centroid

    # Create a custom LAEA projection centered on the centroid
    custom_projection = f"+proj=laea +lat_0={centroid.y} +lon_0={centroid.x} +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"

    # Create a transformer for converting coordinates to the custom LAEA projection
    transformer = pyproj.Transformer.from_crs(
        pyproj.CRS("EPSG:4326"),  # WGS 84 (latitude and longitude)
        pyproj.CRS(custom_projection),  # Custom LAEA projection
        always_xy=True
    )

    # Define a function to transform coordinates using the transformer
    def transform_coordinates(x, y):
        return transformer.transform(x, y)

    # Convert the coordinates to the custom LAEA projection
    polygon_laea = transform(transform_coordinates, polygon)

    # Calculate the area in square meters
    area_sqm = polygon_laea.area

    # Convert the area to square feet (1 square meter = 10.764 square feet)
    area_sqft = area_sqm * 10.764

    return area_sqft



### Call Function and add to dataframe
df_cleaned['foul_area_sqft'] = df_cleaned['foul'].apply(calculate_area)
df_cleaned['fop_area_sqft'] = df_cleaned['fop'].apply(calculate_area)

## Calculate the total area of the field and the ratio of foul area to field area
df_cleaned['field_area_sqft'] = df_cleaned['foul_area_sqft'] + df_cleaned['fop_area_sqft']
## Percentage foul area
df_cleaned['foul_area_per'] = df_cleaned['foul_area_sqft'] / df_cleaned['field_area_sqft']
## Fair to Foul Ratio
df_cleaned['fair_to_foul'] = df_cleaned['fop_area_sqft'] / df_cleaned['foul_area_sqft']

############# FENCE DISTANCE CALCULATION #############

from geopy.distance import great_circle
import numpy as np



def interpolate_points(start, end, length_ratio):
    start_np = np.array(start)
    end_np = np.array(end)
    return tuple(start_np + (end_np - start_np) * length_ratio)

def calculate_distances(home_plate, outfield_coords, num_points=540):
    def is_same_point(point1, point2, tolerance=1e-6):
        return abs(point1[0] - point2[0]) < tolerance and abs(point1[1] - point2[1]) < tolerance

    home_plate_lat_lon = (home_plate[1], home_plate[0])
    distances = []

    # Calculate total line length
    total_length = 0
    segments = []
    for i in range(len(outfield_coords) - 1):
        start = outfield_coords[i]
        end = outfield_coords[i + 1]
        if not is_same_point(home_plate, start) and not is_same_point(home_plate, end):
            segment_length = great_circle((start[1], start[0]), (end[1], end[0])).feet
            segments.append((start, end, segment_length))
            total_length += segment_length

    # Calculate the distance between equally spaced points
    spacing = total_length / (num_points - 1)

    # Interpolate points and calculate distances
    current_length = 0
    segment_index = 0
    for i in range(num_points):
        while segment_index < len(segments) - 1 and current_length > segments[segment_index][2]:
            current_length -= segments[segment_index][2]
            segment_index += 1

        start, end, segment_length = segments[segment_index]
        length_ratio = current_length / segment_length
        point = interpolate_points(start, end, length_ratio)
        distance = great_circle(home_plate_lat_lon, (point[1], point[0])).feet
        distances.append(distance)

        current_length += spacing

    return distances

# Calculate distances for each row
df_cleaned['distances'] = df_cleaned.apply(lambda row: calculate_distances(row['home_plate'], row['fop']), axis=1)

# Calculate max, min, and average distances for each row
df_cleaned['max_distance'] = df_cleaned['distances'].apply(max)
df_cleaned['min_distance'] = df_cleaned['distances'].apply(min)
df_cleaned['avg_distance'] = df_cleaned['distances'].apply(lambda distances: sum(distances) / len(distances))
# get the median distance
df_cleaned['median_distance'] = df_cleaned['distances'].apply(lambda distances: np.median(distances))

######## CHECK BLOCK ########

## Check how long the distance list is for each row
df_cleaned['num_distances'] = df_cleaned['distances'].apply(len)

## Print the value counts for the 'num_distances' column
df_cleaned['num_distances'].value_counts()

540    494
Name: num_distances, dtype: int64

In [5]:
#### Orienting the map to the home plate location ####

### Find the center of the field
def calculate_centroid(coords):
    x_coords = [coord[0] for coord in coords]
    y_coords = [coord[1] for coord in coords]
    centroid_x = sum(x_coords) / len(coords)
    centroid_y = sum(y_coords) / len(coords)
    return (centroid_x, centroid_y)


## Find the bearing between the home plate and the center of the field
import math

def calculate_bearing(point1, point2):
    lat1, lon1 = math.radians(point1[1]), math.radians(point1[0])
    lat2, lon2 = math.radians(point2[1]), math.radians(point2[0])

    d_lon = lon2 - lon1

    x = math.cos(lat2) * math.sin(d_lon)
    y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(d_lon)

    bearing = math.degrees(math.atan2(x, y))
    bearing = (bearing + 360) % 360  # Normalize the bearing to the range [0, 360)

    return bearing

### Function to classify direction in laymans terms North, South, East, West, ect
def degrees_to_cardinal_direction(degrees):
    directions = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N']
    index = round(degrees / 45)
    return directions[index]


# Calculate the centroid of the outfield fence coordinates for each row
df_cleaned['fop_centroid'] = df_cleaned['fop'].apply(lambda coords: calculate_centroid(coords[1:]))

# Calculate the bearing between home plate and the centroid for each row
df_cleaned['field_orientation'] = df_cleaned.apply(lambda row: calculate_bearing(row['home_plate'], row['fop_centroid']), axis=1)

# Convert the bearing to a cardinal direction
df_cleaned['field_cardinal_direction'] = df_cleaned['field_orientation'].apply(degrees_to_cardinal_direction)

# rename 'field' to 'park_name'
df_cleaned.rename(columns={'field': 'park_name'}, inplace=True)




In [6]:
## Need to rename dataframe to df for this block 

df = df_cleaned

### Get the Altitiude of each field as well as city and state
### This block will take a while to run, can process about 2 seconds per record

## Get Altitudes of the ballparks
## Get Altitudes of the ballparks
import requests
import pandas as pd
import time
from tqdm import tqdm

api_key = 'AIzaSyA_BhlTupRdBPBhRptQuR6pYorMVYQnRMA'

def get_altitude(lat, lon):
    query = f'https://maps.googleapis.com/maps/api/elevation/json?locations={lat},{lon}&key={api_key}'
    try:
        r = requests.get(query)
        r.raise_for_status()
        data = r.json()
        return data['results'][0]['elevation']
    except requests.exceptions.RequestException as err:
        print(f"Request error: {err}")
        return None
    except KeyError:
        print(f"No results returned for coordinates: {lat}, {lon}")
        return None

def get_city_state(lat, lon):
    query = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={api_key}'
    try:
        r = requests.get(query)
        r.raise_for_status()
        data = r.json()
        results = data['results'][0]['address_components']
        city = next((item['long_name'] for item in results if 'locality' in item['types']), '')
        state = next((item['long_name'] for item in results if 'administrative_area_level_1' in item['types']), '')
        return city, state
    except requests.exceptions.RequestException as err:
        print(f"Request error: {err}")
        return None, None
    except KeyError:
        print(f"No results returned for coordinates: {lat}, {lon}")
        return None, None

altitudes = []
cities = []
states = []
failed_rows = []

for i, coords in tqdm(enumerate(df['home_plate']), total=df['home_plate'].shape[0]):
    altitude = get_altitude(coords[1], coords[0])
    if altitude is None:
        failed_rows.append(i)
    altitudes.append(altitude)
    
    city, state = get_city_state(coords[1], coords[0])
    if city is None or state is None:
        failed_rows.append(i)
    cities.append(city)
    states.append(state)
    
    time.sleep(1)

df['altitude'] = altitudes
df['city'] = cities
df['state'] = states

print(f"Failed rows: {list(set(failed_rows))}")


100%|██████████| 494/494 [13:38<00:00,  1.66s/it]


TypeError: 'ResultSet' object is not callable

In [7]:

df.head()

Unnamed: 0,park_name,foul,fop,notes,home_plate,foul_area_sqft,fop_area_sqft,field_area_sqft,foul_area_per,fair_to_foul,...,min_distance,avg_distance,median_distance,num_distances,fop_centroid,field_orientation,field_cardinal_direction,altitude,city,state
0,Ann Arbor Greenhills HS? - Practice?,"[(-84.01481943219841, 41.86694040616026), (-84...","[(-84.01374704390223, 41.86720354412398), (-84...",,"(-84.01374704390223, 41.86720354412398)",38841.574963,81549.188789,120390.763752,0.322629,2.099534,...,8.246457,277.757618,312.444115,540,"(-84.01441536073496, 41.86766324525184)",312.727969,NW,235.825516,Adrian,Michigan
1,Adams Butzel Complex - Detroit Communication M...,"[(-83.1678186, 42.3966942), (-83.1678776, 42.3...","[(-83.1678186, 42.3966942), (-83.1665385, 42.3...",,"(-83.1678186, 42.3966942)",43132.874486,94900.435141,138033.309627,0.312482,2.200188,...,344.06779,345.883443,345.373813,540,"(-83.16713494545455, 42.39733947575757)",38.039869,NE,199.549713,Detroit,Michigan
2,Addison HS,"[(-84.3395166, 41.9898236), (-84.3406566, 41.9...","[(-84.3395166, 41.9898236), (-84.3394898, 41.9...",,"(-84.3395166, 41.9898236)",30054.010276,81120.874515,111174.884791,0.270331,2.69917,...,304.1969,322.130437,320.458186,540,"(-84.34025218333335, 41.990368462499994)",314.902223,NW,325.612823,Addison,Michigan
3,Adrian HS,"[(-84.0416584, 41.9091676), (-84.0416691, 41.9...","[(-84.0416584, 41.9091676), (-84.0405493, 41.9...",,"(-84.0416584, 41.9091676)",36619.533089,86478.745151,123098.27824,0.297482,2.361547,...,301.179746,334.10481,330.367042,540,"(-84.04087369166668, 41.909742770833326)",45.435207,NE,239.299103,Adrian,Michigan
4,Adrian Lenawee Christian HS,"[(-84.0837469, 41.9087669), (-84.0841117, 41.9...","[(-84.0837469, 41.9087669), (-84.0826392, 41.9...",,"(-84.0837469, 41.9087669)",23307.977485,89142.387688,112450.365172,0.207273,3.824544,...,314.250328,336.91403,341.931638,540,"(-84.08335302608694, 41.909511691304346)",21.482674,N,252.284775,Adrian,Michigan


In [None]:
UnicodeEncodeError



In [8]:
### sAVE THE CITY STATE AND ALTITUDE TO A CSV SO i CAN REFERENCE IT AND SKIP THE STEP

df.to_csv('../BB_CLEAN/data/geo_data/high_school_ballparks.csv', index=False)

## Load the CSV
# df = pd.read_csv('data/2023_mhsaa_POST_LOOKUP2.csv')

## Rename back to df_cleaned to continue with the following blocks
# df_cleaned = df

In [None]:
break

In [1]:
# new depenencies

import pandas as pd
import numpy as np


In [2]:
# rELOAD THE CSV
df_cleaned = pd.read_csv('TEMP/clean_tables/field_geo_prelim.csv')




In [3]:
## Load the mhsaa data and find a way to merge it with the df_cleaned data

# start with taking the street adde4t and city and converting it to the field df.

# take the schools lat and long and return the 3 closest fields with the distance measured in miles



In [4]:
## load mhsaa data

mhsaa = pd.read_csv('TEMP\clean_tables\MHSAA_School_Data.csv')

mhsaa.columns

Index(['SchoolId', 'SportId', 'PopularName', 'ProperName', 'NickName',
       'Colors', 'LeaugeName', 'LeaugeId', 'HeadCoach', 'PrimaryColorCode',
       'ImagePath', 'Classification', 'Record', 'Enrollment', 'City', 'State',
       'Address1', 'Address2', 'Zip', 'Phone', 'Fax', 'VanityUrl', 'MapURL',
       'WebSite', 'IsMiddleSchool', 'ClassificationCalculationSteps',
       'ClassificationCalculationStepsHtml', 'SportSeasonId', 'EnrollmentData',
       'TicketUrl', 'OldSchoolId', 'SchoolNameWithId'],
      dtype='object')

In [5]:
# send Address1 and zip to the df_lookup_table

# use that table to get a lat and long for each school using google

# use the lat and long to find the 3 closest fields

# return the 3 closest fields with the distance measured in miles

# add the 3 fields to a field table witht he name of the field and the distance

df = mhsaa

In [6]:
# import pandas as pd
# import requests
# from tqdm import tqdm

# # # Assuming your dataframe is named df and it has columns 'Address1', 'Address2', 'City', 'State', 'Zip'
# # df = pd.DataFrame({
# #     'Address1': ['1600 Amphitheatre Parkway'],
# #     'Address2': [''],
# #     'City': ['Mountain View'],
# #     'State': ['CA'],
# #     'Zip': ['94043']
# # })

# def get_lat_lng(apiKey, address):
#     """
#     Returns the latitude and longitude of a location using the Google Maps Geocoding API. 
#     API: https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}
#     """

#     url = ('https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'
#            .format(address.replace(' ','+'), apiKey))
#     try:
#         response = requests.get(url)
#         resp_json_payload = response.json()
#         lat = resp_json_payload['results'][0]['geometry']['location']['lat']
#         lng = resp_json_payload['results'][0]['geometry']['location']['lng']
#     except:
#         print('ERROR: {}'.format(address))
#         lat = 0
#         lng = 0
#     return lat, lng

# # tqdm.pandas() is a simple wrapper around the pandas apply function. 
# # It provides a progress bar for apply when it is called on a DataFrame.

# tqdm.pandas()

# # Replace YOUR_API_KEY with your actual Google Maps API Key
# df['Lat'], df['Lng'] = zip(*df.progress_apply(lambda row: get_lat_lng('AIzaSyA_BhlTupRdBPBhRptQuR6pYorMVYQnRMA', ', '.join([str(row['Address1']), str(row['Address2']), str(row['City']), str(row['State']), str(row['Zip'])])), axis=1))

# print(df)


  2%|▏         | 75/4049 [00:13<11:42,  5.66it/s]

ERROR: nan, nan, nan, nan, nan


 10%|█         | 421/4049 [01:23<14:53,  4.06it/s]

ERROR: nan, nan, nan, nan, nan


 13%|█▎        | 517/4049 [01:50<14:50,  3.96it/s]

ERROR: nan, nan, nan, nan, nan


 16%|█▋        | 663/4049 [02:28<13:16,  4.25it/s]

ERROR: nan, nan, nan, nan, nan


 36%|███▋      | 1475/4049 [05:52<09:51,  4.35it/s]

ERROR: Franklin & Delaware, nan, Flint, MI, 48506


 57%|█████▋    | 2296/4049 [09:08<06:09,  4.74it/s]

ERROR: nan, nan, Allison, CN, nan


 59%|█████▉    | 2392/4049 [09:29<05:42,  4.84it/s]

ERROR: nan, nan, Belleviner, CN, nan
ERROR: nan, nan, Shawnee, CN, nan


 61%|██████    | 2479/4049 [09:48<05:29,  4.77it/s]

ERROR: nan, nan, Leamington, CN, nan


 62%|██████▏   | 2529/4049 [09:59<05:39,  4.48it/s]

ERROR: nan, nan, Grove Port, OH, nan


 64%|██████▍   | 2589/4049 [10:11<05:21,  4.55it/s]

ERROR: nan, nan, Chatham, CN, nan
ERROR: nan, nan, Chatham, CN, nan


 64%|██████▍   | 2590/4049 [10:12<04:45,  5.12it/s]

ERROR: nan, nan, Chatham, CN, nan


 66%|██████▌   | 2669/4049 [10:29<05:17,  4.35it/s]

ERROR: nan, nan, Benoit, OH, nan


 72%|███████▏  | 2899/4049 [11:18<03:56,  4.87it/s]

ERROR: nan, nan, Arva, CN, nan


 74%|███████▎  | 2984/4049 [11:35<03:48,  4.65it/s]

ERROR: nan, nan, nan, nan, nan


 74%|███████▎  | 2985/4049 [11:35<03:50,  4.61it/s]

ERROR: nan, nan, nan, OH, nan


 74%|███████▍  | 2992/4049 [11:37<03:18,  5.33it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 74%|███████▍  | 3008/4049 [11:41<04:00,  4.33it/s]

ERROR: nan, nan, nan, nan, nan


 76%|███████▌  | 3083/4049 [11:59<03:17,  4.88it/s]

ERROR: nan, nan, nan, nan, nan


 76%|███████▋  | 3089/4049 [12:00<02:41,  5.93it/s]

ERROR: nan, nan, nan, nan, nan


 76%|███████▋  | 3092/4049 [12:01<02:23,  6.65it/s]

ERROR: nan, nan, nan, nan, nan


 77%|███████▋  | 3098/4049 [12:02<03:01,  5.25it/s]

ERROR: nan, nan, Collingwood, CN, nan


 77%|███████▋  | 3099/4049 [12:02<03:06,  5.08it/s]

ERROR: nan, nan, Maple, CN, nan


 77%|███████▋  | 3118/4049 [12:06<02:32,  6.12it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 77%|███████▋  | 3120/4049 [12:06<02:32,  6.08it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 77%|███████▋  | 3122/4049 [12:07<02:31,  6.12it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 77%|███████▋  | 3124/4049 [12:07<02:18,  6.70it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 77%|███████▋  | 3127/4049 [12:07<02:22,  6.48it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 77%|███████▋  | 3128/4049 [12:08<02:20,  6.58it/s]

ERROR: nan, nan, nan, nan, nan


 79%|███████▊  | 3186/4049 [12:19<02:36,  5.51it/s]

ERROR: nan, nan, York, CN, nan


 79%|███████▉  | 3203/4049 [12:23<03:07,  4.50it/s]

ERROR: nan, nan, nan, nan, nan


 79%|███████▉  | 3215/4049 [12:25<03:09,  4.40it/s]

ERROR: nan, nan, Hamilton, CN, nan


 80%|███████▉  | 3229/4049 [12:28<02:47,  4.90it/s]

ERROR: nan, nan, nan, nan, nan


 80%|████████  | 3254/4049 [12:34<02:42,  4.88it/s]

ERROR: nan, nan, nan, nan, nan


 80%|████████  | 3259/4049 [12:34<02:09,  6.10it/s]

ERROR: nan, nan, Hamilton, CN, nan
ERROR: nan, nan, Hamilton, CN, nan


 81%|████████  | 3261/4049 [12:35<02:09,  6.08it/s]

ERROR: nan, nan, Hamilton, CN, nan


 81%|████████  | 3274/4049 [12:38<02:16,  5.66it/s]

ERROR: nan, nan, nan, nan, nan


 81%|████████  | 3283/4049 [12:40<02:53,  4.41it/s]

ERROR: nan, nan, nan, nan, nan


 81%|████████  | 3287/4049 [12:40<02:25,  5.24it/s]

ERROR: nan, nan, nan, nan, nan


 81%|████████▏ | 3290/4049 [12:41<02:11,  5.75it/s]

ERROR: nan, nan, nan, nan, nan


 82%|████████▏ | 3308/4049 [12:45<02:20,  5.29it/s]

ERROR: nan, nan, nan, nan, nan


 82%|████████▏ | 3339/4049 [12:51<01:56,  6.08it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 83%|████████▎ | 3341/4049 [12:51<01:47,  6.58it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 83%|████████▎ | 3343/4049 [12:52<01:47,  6.58it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 83%|████████▎ | 3346/4049 [12:52<02:03,  5.70it/s]

ERROR: nan, nan, nan, nan, nan


 83%|████████▎ | 3362/4049 [12:56<02:18,  4.97it/s]

ERROR: nan, nan, nan, nan, nan


 83%|████████▎ | 3364/4049 [12:56<01:57,  5.85it/s]

ERROR: nan, nan, nan, nan, nan


 85%|████████▍ | 3424/4049 [13:11<02:46,  3.76it/s]

ERROR: nan, nan, Hamilton, CN, nan


 85%|████████▌ | 3443/4049 [13:16<02:22,  4.24it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 85%|████████▌ | 3460/4049 [13:21<02:27,  3.99it/s]

ERROR: nan, nan, nan, nan, nan


 87%|████████▋ | 3516/4049 [13:35<02:28,  3.59it/s]

ERROR: nan, nan, nan, nan, nan


 87%|████████▋ | 3534/4049 [13:39<01:58,  4.34it/s]

ERROR: nan, nan, nan, nan, nan


 87%|████████▋ | 3537/4049 [13:40<02:06,  4.04it/s]

ERROR: nan, nan, Simcoe, CN, nan


 88%|████████▊ | 3573/4049 [13:50<02:06,  3.77it/s]

ERROR: nan, nan, nan, nan, nan


 88%|████████▊ | 3576/4049 [13:51<02:00,  3.93it/s]

ERROR: nan, nan, nan, nan, nan


 88%|████████▊ | 3578/4049 [13:51<01:44,  4.51it/s]

ERROR: nan, nan, nan, nan, nan
ERROR: nan, nan, nan, nan, nan


 88%|████████▊ | 3579/4049 [13:51<01:45,  4.47it/s]

ERROR: nan, nan, nan, nan, nan


 89%|████████▊ | 3584/4049 [13:52<01:44,  4.45it/s]

ERROR: 1200 Oakwood Dr, R #3 Belle River, Ontario, CN, NOR 1AO


 89%|████████▉ | 3597/4049 [13:55<01:27,  5.17it/s]

ERROR: nan, nan, nan, nan, nan


 90%|████████▉ | 3626/4049 [14:02<01:34,  4.45it/s]

ERROR: nan, nan, Ajax, ON, nan


 90%|████████▉ | 3628/4049 [14:03<01:29,  4.71it/s]

ERROR: nan, nan, Ajax, ON, nan


 90%|████████▉ | 3630/4049 [14:03<01:31,  4.57it/s]

ERROR: nan, nan, Hamilton, ON, nan


 90%|████████▉ | 3631/4049 [14:03<01:31,  4.58it/s]

ERROR: nan, nan, Hamilton, ON, nan


 90%|████████▉ | 3633/4049 [14:04<01:23,  4.98it/s]

ERROR: nan, nan, Hamilton, ON, nan
ERROR: nan, nan, nan, nan, nan


 90%|█████████ | 3648/4049 [14:08<01:45,  3.81it/s]

ERROR: nan, nan, nan, nan, nan


 90%|█████████ | 3650/4049 [14:08<01:33,  4.26it/s]

ERROR: nan, nan, nan, nan, nan


 90%|█████████ | 3651/4049 [14:08<01:26,  4.58it/s]

ERROR: nan, nan, nan, nan, nan


 90%|█████████ | 3655/4049 [14:09<01:22,  4.76it/s]

ERROR: nan, nan, nan, nan, nan


 91%|█████████▏| 3695/4049 [14:18<01:20,  4.41it/s]

ERROR: nan, nan, Forest, ON, nan


 91%|█████████▏| 3702/4049 [14:20<01:29,  3.89it/s]

ERROR: nan, nan, Milton, ON, nan


 93%|█████████▎| 3762/4049 [14:34<01:01,  4.66it/s]

ERROR: nan, nan, Ajax, ON, nan


 93%|█████████▎| 3769/4049 [14:36<00:59,  4.68it/s]

ERROR: nan, nan, Mono, ON, nan


 95%|█████████▌| 3866/4049 [14:59<00:40,  4.50it/s]

ERROR: nan, nan, Ajax, ON, nan


 96%|█████████▌| 3868/4049 [14:59<00:38,  4.70it/s]

ERROR: nan, nan, Brooklin, ON, nan


100%|██████████| 4049/4049 [15:43<00:00,  4.29it/s]

      SchoolId  SportId                                   PopularName  \
0         3834        0                                      Brighton   
1         3835        0                              Ewen-Trout Creek   
2         3836        0  Inkster Cherryhill School of Performing Arts   
3         3837        0                                     Roseville   
4         3838        0                                      Shepherd   
...        ...      ...                                           ...   
4044      9995        0            Crossroads Charter Acad Elementary   
4045      9996        0                               Tau Beta School   
4046      9997        0                    Holbrook Elementary School   
4047      9998        0                      Whitmore Lake Elementary   
4048      9999        0           New Mexico School for the Deaf (NM)   

                                ProperName  NickName            Colors  \
0                              Brighton HS  Bulld




In [7]:
df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4049 entries, 0 to 4048
Data columns (total 34 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   SchoolId                            4049 non-null   int64  
 1   SportId                             4049 non-null   int64  
 2   PopularName                         4049 non-null   object 
 3   ProperName                          4049 non-null   object 
 4   NickName                            922 non-null    object 
 5   Colors                              2104 non-null   object 
 6   LeaugeName                          606 non-null    object 
 7   LeaugeId                            4049 non-null   int64  
 8   HeadCoach                           0 non-null      float64
 9   PrimaryColorCode                    3117 non-null   object 
 10  ImagePath                           768 non-null    object 
 11  Classification                      737 non

In [10]:
## Save Temp file to csv
# df.to_csv('TEMP/clean_tables/mhsaa_lat_longv2.csv', index=False)


In [36]:
## Load The mhsaa data with lat and long
mhsaa_df = pd.read_csv('TEMP/clean_tables/mhsaa_lat_longv2.csv')

## Load the field data
field_df = pd.read_csv('data/2023_mhsaa_POST_LOOKUP2.csv')

In [37]:
# create lat and lng columns in the field_df - from the home_plate tuple in 'home_plate'
# convert home_plate to aa tuple
field_df['home_plate'] = field_df['home_plate'].apply(lambda x: eval(x))


field_df['Lat'] = field_df['home_plate'].apply(lambda x: x[1])
field_df['Lng'] = field_df['home_plate'].apply(lambda x: x[0])


field_df.columns

field_df.head()

fields_df = field_df.copy()


In [38]:
mhsaa_df.columns

mhsaa_df.head()

Unnamed: 0,SchoolId,SportId,PopularName,ProperName,NickName,Colors,LeaugeName,LeaugeId,HeadCoach,PrimaryColorCode,...,IsMiddleSchool,ClassificationCalculationSteps,ClassificationCalculationStepsHtml,SportSeasonId,EnrollmentData,TicketUrl,OldSchoolId,SchoolNameWithId,Lat,Lng
0,3834,0,Brighton,Brighton HS,Bulldogs,Orange & Black,Kensington Lakes Activities Association,6931,,#003366,...,False,[],"<table class=""TabularData"" ><tbody><tr><th wid...",0,"[{'GradeDisplay': '9', 'GradeTypeCode': 9, 'Bo...",https://gofan.co/app/school/MI9899,1016.0,Brighton (1016),42.525739,-83.793592
1,3835,0,Ewen-Trout Creek,Ewen-Trout Creek HS,Panthers,Black/White/Gold,Copper Mountain Conference,9453,,#003366,...,False,[],"<table class=""TabularData"" ><tbody><tr><th wid...",0,"[{'GradeDisplay': '9', 'GradeTypeCode': 9, 'Bo...",https://gofan.co/app/school/MI19253,1024.0,Ewen-Trout Creek (1024),46.547144,-89.292515
2,3836,0,Inkster Cherryhill School of Performing Arts,Cherryhill School of Performing Arts,Tigers,Brown & Gold,,0,,#003366,...,False,,,0,,,1029.0,Inkster Cherryhill School of Performing Arts (...,42.306236,-83.321653
3,3837,0,Roseville,Roseville HS,Panthers,Red/White/Black,Macomb Area Conference,6940,,#003366,...,False,['Students who are 19 years of age prior to Se...,"<table class=""TabularData"" ><tbody><tr><th wid...",0,"[{'GradeDisplay': '9', 'GradeTypeCode': 9, 'Bo...",https://gofan.co/app/school/MI10075,1040.0,Roseville (1040),42.518727,-82.937551
4,3838,0,Shepherd,Shepherd HS,BlueJays,Blue & Gold,Jack Pine Conference,6929,,#003366,...,False,[],"<table class=""TabularData"" ><tbody><tr><th wid...",0,"[{'GradeDisplay': '9', 'GradeTypeCode': 9, 'Bo...",https://gofan.co/app/school/MI3606,1057.0,Shepherd (1057),43.521972,-84.691067


In [40]:


from geopy.distance import great_circle
import pandas as pd
from tqdm import tqdm

# Define a function to calculate the distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)
    return great_circle(coords_1, coords_2).miles

# Define a function to find the three closest fields
def find_closest_fields(row, fields_df, num_closest=3):
    distances = fields_df.apply(
        lambda row_field: calculate_distance(row['Lat'], row['Lng'], row_field['Lat'], row_field['Lng']),
        axis=1
    )
    closest_field_indices = distances.nsmallest(num_closest).index
    closest_fields = fields_df.loc[closest_field_indices, 'park_name']
    closest_distances = distances[closest_field_indices]
    return pd.Series(closest_fields.values.tolist() + closest_distances.values.tolist())

# Add columns to mhsaa_df with the three closest fields and their distances
tqdm.pandas()
closest_fields_distances_df = mhsaa_df.progress_apply(find_closest_fields, args=(fields_df, 3), axis=1)
closest_fields_distances_df.columns = ['Closest_Field_1', 'Closest_Field_2', 'Closest_Field_3', 'Distance_1', 'Distance_2', 'Distance_3']

mhsaa_df = pd.concat([mhsaa_df, closest_fields_distances_df], axis=1)

print(mhsaa_df)

100%|██████████| 4049/4049 [01:21<00:00, 49.63it/s]

      SchoolId  SportId                                   PopularName  \
0         3834        0                                      Brighton   
1         3835        0                              Ewen-Trout Creek   
2         3836        0  Inkster Cherryhill School of Performing Arts   
3         3837        0                                     Roseville   
4         3838        0                                      Shepherd   
...        ...      ...                                           ...   
4044      9995        0            Crossroads Charter Acad Elementary   
4045      9996        0                               Tau Beta School   
4046      9997        0                    Holbrook Elementary School   
4047      9998        0                      Whitmore Lake Elementary   
4048      9999        0           New Mexico School for the Deaf (NM)   

                                ProperName  NickName            Colors  \
0                              Brighton HS  Bulld




In [41]:
## look at the mhsaa_df after the function

mhsaa_df.head()

Unnamed: 0,SchoolId,SportId,PopularName,ProperName,NickName,Colors,LeaugeName,LeaugeId,HeadCoach,PrimaryColorCode,...,OldSchoolId,SchoolNameWithId,Lat,Lng,Closest_Field_1,Closest_Field_2,Closest_Field_3,Distance_1,Distance_2,Distance_3
0,3834,0,Brighton,Brighton HS,Bulldogs,Orange & Black,Kensington Lakes Activities Association,6931,,#003366,...,1016.0,Brighton (1016),42.525739,-83.793592,South Lyon HS,Pinckney HS - Field 2,Pinckney HS - Field 1,8.296163,8.667932,8.715206
1,3835,0,Ewen-Trout Creek,Ewen-Trout Creek HS,Panthers,Black/White/Gold,Copper Mountain Conference,9453,,#003366,...,1024.0,Ewen-Trout Creek (1024),46.547144,-89.292515,Ironwood HS,L'Anse HS,Houghton High School,41.320633,42.799937,51.884649
2,3836,0,Inkster Cherryhill School of Performing Arts,Cherryhill School of Performing Arts,Tigers,Brown & Gold,,0,,#003366,...,1029.0,Inkster Cherryhill School of Performing Arts (...,42.306236,-83.321653,Garden City HS,Dearborn HS,Lutheran Westland HS,1.900161,2.90416,3.728332
3,3837,0,Roseville,Roseville HS,Panthers,Red/White/Black,Macomb Area Conference,6940,,#003366,...,1040.0,Roseville (1040),42.518727,-82.937551,Warren Woods Tower HS,Fraser HS,St Clair Shores Lakeshore - Kyte Monroe Park,2.345049,2.346486,2.782978
4,3838,0,Shepherd,Shepherd HS,BlueJays,Blue & Gold,Jack Pine Conference,6929,,#003366,...,1057.0,Shepherd (1057),43.521972,-84.691067,Shepherd HS - high school,Mt Pleasant HS,Mount Pleasant HS,0.286093,5.878995,5.906397


In [44]:
mhsaa_df.sample(20)

Unnamed: 0,SchoolId,SportId,PopularName,ProperName,NickName,Colors,LeaugeName,LeaugeId,HeadCoach,PrimaryColorCode,...,OldSchoolId,SchoolNameWithId,Lat,Lng,Closest_Field_1,Closest_Field_2,Closest_Field_3,Distance_1,Distance_2,Distance_3
1590,5464,0,Dutton Christian Middle Sch,Dutton Christian Middle Sch,,,,0,,#003366,...,81055.0,Dutton Christian Middle Sch (81055),42.842427,-85.587104,East Kentwood HS,Grand Rapids South Christian HS - Field 2,Wyoming Kelloggsville HS,1.860379,2.737867,3.927915
2599,6499,0,Portage (IN),Portage HS,,,,0,,#003366,...,908559.0,Portage (IN) (908559),18.783876,100.778963,Hancock HS,Houghton High School,L'Anse HS,7844.892229,7847.072271,7871.943737
3001,7349,0,Detroit Cristo Rey,Detroit Cristo Rey HS,Wolves,Green & Vegas Gold,Catholic High School League,6901,,#003366,...,9960.0,Detroit Cristo Rey (9960),42.316873,-83.102772,Detroit Martin Luther King HS,Ecorse High School,Melvindale HS,4.927096,5.033318,5.09625
3930,9852,0,Central Elementary School,Central Elementary School,,Blue/White/Gold,,0,,,...,40031.0,Central Elementary School (40031),44.010844,-83.961279,Standish-Sterling HS,Pinconning HS,Au Gres-Sims HS,0.457575,11.05705,13.147769
1845,5737,0,Paul Robeson Malcolm X Academy,Paul Robeson Malcolm X Academy,,,,0,,#003366,...,83747.0,Paul Robeson Malcolm X Academy (83747),42.414524,-83.127557,Detroit Mumford HS,Adams Butzel Complex - Detroit Communication M...,Hazel Park HS,1.813011,2.395178,4.025103
1145,4989,0,Felch North Dickinson JHS,North Dickinson JHS,,Maroon & White,,0,,#003366,...,54583.0,Felch North Dickinson JHS (54583),46.011408,-87.96419,Kingsford HS,Norway High School,Gladstone High School,15.91107,16.658075,45.680907
3028,7376,0,Lincoln Middle School,Lincoln Middle School,,Blue & Gray,,0,,#003366,...,57842.0,Lincoln Middle School (57842),42.161177,-83.609579,Ypsilanti - Lincoln High School,Lincoln High School - Ypsi - Fields 2,Milan HS,0.259061,0.263514,7.344369
1902,5799,0,Oak Harbor (OH),Oak Harbor HS,,,,0,,#003366,...,901355.0,Oak Harbor (OH) (901355),41.506717,-83.146587,Monroe Jefferson HS - Field 2,Monroe Jefferson HS -Varsity,Ottawa Lake - Whiteford High School,32.424885,32.445896,33.798466
846,4682,0,The Dearborn Academy,The Dearborn Academy,,Columbia Blue & White,,0,,#003366,...,50927.0,The Dearborn Academy (50927),42.331062,-83.228266,Dearborn HS,Dearborn Heights Annapolis HS,Melvindale HS,2.53783,4.218544,4.251881
2043,5941,0,Shawano (WI),Shawano HS,,,,0,,#003366,...,902791.0,Shawano (WI) (902791),44.782206,-88.608992,Menominee HS,Merrill HS,Kingsford HS,54.06516,58.949344,74.611227


In [45]:
## Save the mhsaa_df to a csv

mhsaa_df.to_csv('TEMP/clean_tables/MHSAA_School_Data_with_Fields.csv', index=False)

In [None]:
import pandas as pd
import requests



def get_lat_lng(apiKey, address):
    """
    Returns the latitude and longitude of a location using the Google Maps Geocoding API. 
    API: https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}
    """

    url = ('https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'
           .format(address.replace(' ','+'), apiKey))
    try:
        response = requests.get(url)
        resp_json_payload = response.json()
        lat = resp_json_payload['results'][0]['geometry']['location']['lat']
        lng = resp_json_payload['results'][0]['geometry']['location']['lng']
    except:
        print('ERROR: {}'.format(address))
        lat = 0
        lng = 0
    return lat, lng

# Replace YOUR_API_KEY with your actual Google Maps API Key
df['Lat'], df['Lng'] = zip(*df.apply(lambda row: get_lat_lng('AIzaSyA_BhlTupRdBPBhRptQuR6pYorMVYQnRMA', ', '.join([str(row['Address1']), str(row['Address2']), str(row['City']), str(row['State']), str(row['Zip'])])), axis=1))

print(df)


In [None]:
## Look up the lat and long for each school


In [None]:
## Script to take in the high school baseball field kml file and matching it up to MHSAA School data

# Want to tie each field into a school and so I can tie it to the mhsaa info data

# also want a solid location to use for every field and school to do some interesting geopgraphical analysis


# Load the file with the fields geographic and plotting info (post the processing of the kml file)






In [None]:


## Load the MHSAA school data schoolId info - that will be the ultimate key
## school data also has two variations of the name that both might be usefull
# popularName and properName

## Address data is also in the school data - important because we may use it to try to match up the fields

## also want to use it for a cacluation about how far the field is from the school (maybe) and how many games a team gets to play based on the latitude and longitude of the school

mhsaa_df = pd.read_csv('TEMP\clean_tables\MHSAA_School_Data.csv')


