# Distance Notebook
### Calculate the distance each team will need to travel over the course of the season

In [27]:
# Dependencies
import os
import sys
import pandas as pd
import numpy as np


In [28]:
## Paths to source data
# SCHEDULE FILE
schedule_path = os.path.join('..', 'data', 'schedule', 'current_2024.csv')
schedule_data = pd.read_csv(schedule_path)
# ARENA INFO FILE
arena_path = os.path.join('..', 'data', 'arena_school_info.csv')
arena_data = pd.read_csv(arena_path)

# Display data
# schedule_data.head()
# arena_data.head()

### Merge the arena info into schedule table

In [29]:
# Merge the schedule data with the arena data to include home and away team locations

# First, ensure team names match between datasets
# We will merge on the 'Team' column in the arena data and 'Home_Team'/'Away_Team' in the schedule data
merged_data = schedule_data.merge(arena_data[['Team', 'Latitude', 'Longitude']], left_on='Home_Team', right_on='Team', how='left')
merged_data = merged_data.rename(columns={'Latitude': 'Home_Latitude', 'Longitude': 'Home_Longitude'})

# Merge again for the away teams
merged_data = merged_data.merge(arena_data[['Team', 'Latitude', 'Longitude']], left_on='Away_Team', right_on='Team', how='left')
merged_data = merged_data.rename(columns={'Latitude': 'Away_Latitude', 'Longitude': 'Away_Longitude'})

# Drop the unnecessary 'Team' columns from the merged data
merged_data = merged_data.drop(columns=['Team_x', 'Team_y'])

# Display the first few rows of the merged data to verify the result
merged_data.head()

Unnamed: 0,Date,Conference,Game_Notes,Away_Team,Away_Team_Link,Away_Score,Home_Team,Home_Team_Link,Home_Score,OT,Box_Link,Metrics_Link,Day,Game_ID,Home_Latitude,Home_Longitude,Away_Latitude,Away_Longitude
0,2024-10-04,Non-Conference,,Arizona State,/reports/team/Arizona-State/61,,Air Force,/reports/team/Air-Force/1,,,,,Friday,2024-10-04_Air Force_Arizona State,39.013739,-104.883727,33.447156,-111.910867
1,2024-10-05,Non-Conference,,Providence,/reports/team/Providence/46,,Union,/reports/team/Union/54,,,,,Saturday,2024-10-05_Union_Providence,42.818004,-73.924824,41.844005,-71.434748
2,2024-10-05,Non-Conference,,Holy Cross,/reports/team/Holy-Cross/23,,Boston University,/reports/team/Boston-University/10,,,,,Saturday,2024-10-05_Boston University_Holy Cross,42.353838,-71.120653,42.239239,-71.807961
3,2024-10-05,Non-Conference,,Massachusetts,/reports/team/Massachusetts/27,,Bentley,/reports/team/Bentley/8,,,,,Saturday,2024-10-05_Bentley_Massachusetts,42.384852,-71.220488,42.376592,-70.98407
4,2024-10-05,Non-Conference,,Penn State,/reports/team/Penn-State/60,,Alaska,/reports/team/Alaska/4,,,,,Saturday,2024-10-05_Alaska_Penn State,61.190152,-149.827997,40.806567,-77.85701


### Caculate the distance between each school
- Using Haversine equation to calculate the straight line distance between two sets of lat/lon coodinates

In [30]:
# Function to calculate the haversine distance (numpy) between two points (lat1, lon1) and (lat2, lon2)
def haversine(lat1, lon1, lat2, lon2):
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Radius of Earth in miles
    r = 3956
    return c * r

# Apply the Haversine function to each row to calculate the distance between the home and away arenas
merged_data['Distance_Miles'] = merged_data.apply(
    lambda row: haversine(row['Home_Latitude'], row['Home_Longitude'], row['Away_Latitude'], row['Away_Longitude']),
    axis=1
)

# Display the updated data with the calculated distance
merged_data[['Home_Team', 'Away_Team', 'Distance_Miles']].head()

Unnamed: 0,Home_Team,Away_Team,Distance_Miles
0,Air Force,Arizona State,548.193384
1,Union,Providence,143.786679
2,Boston University,Holy Cross,35.982031
3,Bentley,Massachusetts,12.071364
4,Alaska,Penn State,3226.074838


### Filter out results to avoid double counting games on weekend series

- Travel_Flag to account for consecutive games played at the same venue within a 3-day span. If a team plays multiple games at the same venue within this period, travel is only counted for the first game.

- The Adjusted_Travel_Distance column reflects the distance a team will travel for each game, considering the consecutive game rule.

In [31]:
# Convert 'Date' column to datetime format for easier manipulation
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

# Sort the data by 'Away_Team' and 'Date' to identify consecutive games at the same venue
merged_data = merged_data.sort_values(by=['Away_Team', 'Date'])

# Initialize a flag column to indicate whether the travel should be counted (1 = yes, 0 = no)
merged_data['Travel_Flag'] = 1

# Iterate through the rows and check for consecutive games at the same venue within a 3-day span
for i in range(1, len(merged_data)):
    current_game = merged_data.iloc[i]
    previous_game = merged_data.iloc[i - 1]
    
    # Check if the away team is the same, and the venue (home team) is the same, and the games are within 3 days
    if (current_game['Away_Team'] == previous_game['Away_Team'] and
        current_game['Home_Team'] == previous_game['Home_Team'] and
        (current_game['Date'] - previous_game['Date']).days <= 3):
        # Set the travel flag to 0 for the current game (no additional travel)
        merged_data.at[i, 'Travel_Flag'] = 0

# Only consider rows where travel flag is 1 for calculating total travel distance
merged_data['Adjusted_Travel_Distance'] = merged_data['Distance_Miles'] * merged_data['Travel_Flag']

# Display the updated data with the travel flag and adjusted distance
merged_data[['Away_Team', 'Home_Team', 'Date', 'Distance_Miles', 'Travel_Flag', 'Adjusted_Travel_Distance']].head()


Unnamed: 0,Away_Team,Home_Team,Date,Distance_Miles,Travel_Flag,Adjusted_Travel_Distance
72,Air Force,UNO/UMass,2024-10-12,,1,
154,Air Force,American Int'l,2024-10-25,1698.93605,0,0.0
182,Air Force,American Int'l,2024-10-26,1698.93605,1,1698.93605
217,Air Force,Colorado College,2024-11-01,12.012403,1,12.012403
364,Air Force,Bentley,2024-11-22,1765.750555,1,1765.750555


## Aggregate Total Travel Distance for Each Team AND
## Calculate and store the Trip Count and the Average trip distance

In [33]:
# Only consider rows where travel flag is 1 for calculating total travel distance
merged_data['Adjusted_Travel_Distance'] = merged_data['Distance_Miles'] * merged_data['Travel_Flag']

# Calculate the total travel distance per team
team_travel_distances = merged_data.groupby('Away_Team')['Adjusted_Travel_Distance'].sum().reset_index()
team_travel_distances.columns = ['Team', 'Total_Travel_Distance']

# Step 4: Adding Trip Count and Average Trip Distance

# Calculate the number of trips for each team
trip_count = merged_data[merged_data['Travel_Flag'] == 1].groupby('Away_Team').size().reset_index(name='Trip_Count')

# Merge trip count with travel distances
team_travel_data = pd.merge(team_travel_distances, trip_count, left_on='Team', right_on='Away_Team', how='left').drop(columns='Away_Team')

# Calculate average trip distance
team_travel_data['Average_Trip_Distance'] = team_travel_data['Total_Travel_Distance'] / team_travel_data['Trip_Count']

# Display the top 5 teams with the highest average trip distance
team_travel_data = team_travel_data.sort_values(by='Average_Trip_Distance', ascending=False)
team_travel_data.head()

Unnamed: 0,Team,Total_Travel_Distance,Trip_Count,Average_Trip_Distance
2,Alaska-Anchorage,39351.052398,14.0,2810.789457
1,Alaska,31813.17712,17.0,1871.36336
59,Penn State,3537.513251,2.0,1768.756626
4,Arizona State,18873.701446,15.0,1258.246763
0,Air Force,14454.442493,12.0,1204.536874


## Output the aggrigate as a csv file (Version 1)
- does not account for nuetral site games, mostly games in tournaments. These games often have teams listed like this 'WMU/Michigan State' or 'Bc/BU' for example

In [34]:
# Output into the TEMP folder
output_path = os.path.join('..', 'TEMP', 'team_travel_distances.csv')
team_travel_distances.to_csv(output_path, index=False)
print(f"Output saved to: {output_path}")

PermissionError: [Errno 13] Permission denied: '..\\TEMP\\team_travel_distances.csv'