## TEMP REWORK NOTEBOOK FOR DISTANCE CALC TABLE

In [1]:
# Dependencies
import os
import sys
import pandas as pd
import numpy as np
import geopy
from geopy.distance import geodesic

In [2]:
## Paths to source data

# schedule_path = os.path.join('..', 'data', 'schedule', '2024_current.csv')
schedule_path = os.path.join('..', 'data', 'schedule', 'CHN_Schedule_First Pass_v1.csv')
schedule_data = pd.read_csv(schedule_path)
raw_schedule_df = schedule_data.copy()

# ARENA INFO FILE
arena_path = os.path.join('..', 'data', 'arena_school_info.csv')
arena_data = pd.read_csv(arena_path)
arena_info_df = arena_data.copy()

# NEUTRAL SITE ARENA INFORMATION FILE
neutral_path = os.path.join('..', 'data', 'neutral_arenas_2024.csv')
neutral_arenas_df = pd.read_csv(neutral_path)

# Display data
# schedule_data.head()
# schedule_data.tail()
# arena_data.head()
# neutral_arenas_df.tail()

In [3]:
# Complete refactored code block with corrections
## Ensure the Data column in the schedule data is in datetime format
raw_schedule_df['Date'] = pd.to_datetime(raw_schedule_df['Date'])

from geopy.distance import geodesic

# ## Remove problem characters (', -, .) from all Team columns
raw_schedule_df['Home_Team'] = raw_schedule_df['Home_Team'].str.replace("'", "")
raw_schedule_df['Away_Team'] = raw_schedule_df['Away_Team'].str.replace("'", "")
raw_schedule_df['Home_Team'] = raw_schedule_df['Home_Team'].str.replace("-", " ")
raw_schedule_df['Away_Team'] = raw_schedule_df['Away_Team'].str.replace("-", " ")
raw_schedule_df['Home_Team'] = raw_schedule_df['Home_Team'].str.replace(".", "")
raw_schedule_df['Away_Team'] = raw_schedule_df['Away_Team'].str.replace(".", "")

# Remove games that are still TBD
# Remove all rows where either team is TBD
raw_schedule_df = raw_schedule_df[~raw_schedule_df['Home_Team'].str.contains('TBD')]
raw_schedule_df = raw_schedule_df[~raw_schedule_df['Away_Team'].str.contains('TBD')]

# Remove any games with / in the team names
raw_schedule_df = raw_schedule_df[~raw_schedule_df['Home_Team'].str.contains('/')]
raw_schedule_df = raw_schedule_df[~raw_schedule_df['Away_Team'].str.contains('/')]

# Helper function to calculate distance between two sets of coordinates
def calculate_distance(coord1, coord2):
    if pd.notna(coord1[0]) and pd.notna(coord2[0]):
        return geodesic(coord1, coord2).miles
    return 0

# Step 1: Filter out exhibition games, games with "TBA", and games with "/" in team names
raw_schedule_filtered = raw_schedule_df[
    (raw_schedule_df['Conference'] != 'Exhibition') &
    (~raw_schedule_df['Away_Team'].str.contains('TBA')) &
    (~raw_schedule_df['Home_Team'].str.contains('TBA')) &
    (~raw_schedule_df['Away_Team'].str.contains('/')) &
    (~raw_schedule_df['Home_Team'].str.contains('/'))
]

# Step 2: Flag neutral site games using the 'Flag' column from the neutral arenas table
def is_neutral_game(row, flags):
    # Check for flags in both Conference and Game_Notes columns
    conference_match = any(flag in str(row['Conference']) for flag in flags)
    notes_match = any(flag in str(row['Game_Notes']) for flag in flags)
    return conference_match or notes_match

neutral_flags = neutral_arenas_df['Flag'].tolist()
raw_schedule_filtered['Is_Neutral_Game'] = raw_schedule_filtered.apply(is_neutral_game, axis=1, flags=neutral_flags)

# Step 3: Merge arena coordinates for home and away teams
# Merge home team coordinates
schedule_with_coords = raw_schedule_filtered.merge(arena_info_df[['Team', 'Latitude', 'Longitude']], 
                                                   left_on='Home_Team', right_on='Team', how='left')
schedule_with_coords = schedule_with_coords.rename(columns={'Latitude': 'Home_Lat', 'Longitude': 'Home_Lon'})

# Merge away team coordinates
schedule_with_coords = schedule_with_coords.merge(arena_info_df[['Team', 'Latitude', 'Longitude']], 
                                                  left_on='Away_Team', right_on='Team', how='left')
schedule_with_coords = schedule_with_coords.rename(columns={'Latitude': 'Away_Lat', 'Longitude': 'Away_Lon'})

# Step 4: Categorize games into on-campus conference, on-campus non-conference, and neutral site
def categorize_game(row):
    if row['Is_Neutral_Game']:
        return 'Neutral'
    elif row['Conference'] == 'Non-Conference':
        return 'On-Campus Non-Conference'
    else:
        return 'On-Campus Conference'

schedule_with_coords['Game_Type'] = schedule_with_coords.apply(categorize_game, axis=1)

# Step 5: Calculate the distance for all games
schedule_with_coords['Distance'] = schedule_with_coords.apply(
    lambda row: calculate_distance((row['Away_Lat'], row['Away_Lon']), (row['Home_Lat'], row['Home_Lon'])), axis=1)

# Adjust the neutral site distance calculation using the neutral arena coordinates for both teams
def calculate_neutral_distance(row, neutral_df):
    if row['Is_Neutral_Game']:
        # Find the neutral site coordinates from the neutral arenas table
        neutral_site = neutral_df[neutral_df['Flag'].apply(lambda x: x in str(row['Conference']) or x in str(row['Game_Notes']))]
        
        if not neutral_site.empty:
            neutral_lat = neutral_site.iloc[0]['latitude']
            neutral_lon = neutral_site.iloc[0]['longitude']
            # Calculate distance from away team to neutral site
            away_to_neutral = calculate_distance((row['Away_Lat'], row['Away_Lon']), (neutral_lat, neutral_lon))
            # Calculate distance from home team to neutral site
            home_to_neutral = calculate_distance((row['Home_Lat'], row['Home_Lon']), (neutral_lat, neutral_lon))
            # Return both distances separately to track for both teams
            return away_to_neutral, home_to_neutral
    # Non-neutral games: apply the same distance for both teams (travel for the away team)
    return row['Distance'], row['Distance']

# Apply the new logic for neutral site games using the correct neutral coordinates
schedule_with_coords[['Away_Distance', 'Home_Distance']] = schedule_with_coords.apply(calculate_neutral_distance, axis=1, result_type='expand', neutral_df=neutral_arenas_df)

### NEW CODE ###
### NEW CODE ###
# Convert 'Date' column to datetime format for easier manipulation
schedule_with_coords['Date'] = pd.to_datetime(schedule_with_coords['Date'])

# Reset the travel flag (initialize to 1 for all games)
schedule_with_coords['Travel_Flag'] = 1

# Sort the data by 'Away_Team', 'Date', and 'Home_Team' to ensure games are grouped correctly
schedule_with_coords = schedule_with_coords.sort_values(by=['Away_Team', 'Date', 'Home_Team'])

# Group by 'Away_Team' to identify consecutive away games
for away_team, group in schedule_with_coords.groupby('Away_Team'):
    group = group.sort_values(by='Date')  # Sort by date within each group

    # Iterate through the group to check for consecutive games
    for i in range(1, len(group)):
        current_game = group.iloc[i]
        previous_game = group.iloc[i - 1]

        # Check if the games are within 3 days and are at the same venue OR if either is a neutral game
        if (current_game['Date'] - previous_game['Date']).days <= 3 and \
           ((current_game['Home_Lat'] == previous_game['Home_Lat'] and \
             current_game['Home_Lon'] == previous_game['Home_Lon']) or \
            current_game['Is_Neutral_Game'] or previous_game['Is_Neutral_Game']):
            # Set the travel flag to 0 for the current game (indicating no new trip)
            schedule_with_coords.loc[current_game.name, 'Travel_Flag'] = 0

########## HOTFIX to adress the Neutral games incorrectly left out of travel stats ##########
## Set Travel_Flag for all Neutral games to 1
schedule_with_coords.loc[schedule_with_coords['Is_Neutral_Game'], 'Travel_Flag'] = 1



# # Only consider rows where Travel_Flag is 1 for calculating total travel distance
schedule_with_coords['Adjusted_Away_Distance'] = schedule_with_coords['Away_Distance'] * schedule_with_coords['Travel_Flag']
schedule_with_coords['Adjusted_Home_Distance'] = schedule_with_coords['Home_Distance'] * schedule_with_coords['Travel_Flag']

# Define a function to get the shortest non-zero distance
def shortest_non_zero(series):
    non_zero_values = series[series > 0]
    if len(non_zero_values) > 0:
        return non_zero_values.min()  # Return the minimum non-zero value
    return 0  # If all distances are zero, return zero

# Filter for on-campus games (non-neutral)
on_campus_games = schedule_with_coords[schedule_with_coords['Is_Neutral_Game'] == False]

# Filter for neutral-site games
neutral_site_games = schedule_with_coords[schedule_with_coords['Is_Neutral_Game'] == True]

# Group by team and game type for AWAY teams for on-campus games only
away_team_on_campus_travel_stats = on_campus_games.groupby(['Away_Team', 'Game_Type']).agg(
    total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
    total_distance=('Adjusted_Away_Distance', 'sum'),
    longest_trip=('Adjusted_Away_Distance', 'max'),
    shortest_trip=('Adjusted_Away_Distance', shortest_non_zero)  # Use custom function to get non-zero shortest trip
).reset_index()

# Group by team and game type for BOTH home and away teams for neutral-site games
neutral_site_travel_stats = neutral_site_games.groupby(['Home_Team', 'Game_Type']).agg(
    total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
    total_distance=('Adjusted_Home_Distance', 'sum'),  # For neutral games, we consider home team distance
    longest_trip=('Adjusted_Home_Distance', 'max'),
    shortest_trip=('Adjusted_Home_Distance', shortest_non_zero)
).reset_index()

# Also consider the away team's travel for neutral site games
away_neutral_site_travel_stats = neutral_site_games.groupby(['Away_Team', 'Game_Type']).agg(
    total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
    total_distance=('Adjusted_Away_Distance', 'sum'),  # Away team distance for neutral games
    longest_trip=('Adjusted_Away_Distance', 'max'),
    shortest_trip=('Adjusted_Away_Distance', shortest_non_zero)
).reset_index()

# Combine away team stats for on-campus and neutral games
travel_stats_combined = pd.concat([
    away_team_on_campus_travel_stats.rename(columns={'Away_Team': 'Team'}),
    away_neutral_site_travel_stats.rename(columns={'Away_Team': 'Team'}),
    neutral_site_travel_stats.rename(columns={'Home_Team': 'Team'})
], axis=0)

# Step 10: Aggregate the final travel stats for each team
# Correct the average calculation by dividing the total_distance by total_trips
team_travel_stats_final = travel_stats_combined.groupby(['Team', 'Game_Type']).agg(
    total_trips=('total_trips', 'sum'),
    total_distance=('total_distance', 'sum'),  # Ensure this correctly sums up unique distances
    longest_trip=('longest_trip', 'max'),
    # Calculate the correct average by dividing total_distance by total_trips (not by number of rows/games)
    average_distance=('total_distance', lambda x: x.sum() / x.count() if x.count() > 0 else 0),
    shortest_trip=('shortest_trip', lambda x: x[x > 0].min() if (x > 0).any() else 0)
).reset_index()

# Now, for the average distance calculation, ensure we're dividing by total_trips
team_travel_stats_final['average_distance'] = team_travel_stats_final.apply(
    lambda row: row['total_distance'] / row['total_trips'] if row['total_trips'] > 0 else 0, axis=1)

# Step 11: Pivot the final travel stats to match the required format
team_travel_summary_final = team_travel_stats_final.pivot_table(
    index='Team',
    columns='Game_Type',
    values=['total_trips', 'total_distance', 'average_distance', 'longest_trip', 'shortest_trip'],
    fill_value=0
)


# # Define a function to get the shortest non-zero distance
# def shortest_non_zero(series):
#     non_zero_values = series[series > 0]
#     if len(non_zero_values) > 0:
#         return non_zero_values.min()  # Return the minimum non-zero value
#     return 0  # If all distances are zero, return zero

# # Filter for on-campus games (non-neutral)
# on_campus_games = schedule_with_coords[schedule_with_coords['Is_Neutral_Game'] == False]

# # Filter for neutral-site games
# neutral_site_games = schedule_with_coords[schedule_with_coords['Is_Neutral_Game'] == True]

# # Group by team and game type for AWAY teams for on-campus games only
# away_team_on_campus_travel_stats = on_campus_games.groupby(['Away_Team', 'Game_Type']).agg(
#     total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
#     total_distance=('Adjusted_Away_Distance', 'sum'),
#     average_distance=('Adjusted_Away_Distance', 'mean'),
#     longest_trip=('Adjusted_Away_Distance', 'max'),
#     shortest_trip=('Adjusted_Away_Distance', shortest_non_zero)  # Use custom function to get non-zero shortest trip
# ).reset_index()

# # Group by team and game type for BOTH home and away teams for neutral-site games
# neutral_site_travel_stats = neutral_site_games.groupby(['Home_Team', 'Game_Type']).agg(
#     total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
#     total_distance=('Adjusted_Home_Distance', 'sum'),  # For neutral games, we consider home team distance
#     average_distance=('Adjusted_Home_Distance', 'mean'),
#     longest_trip=('Adjusted_Home_Distance', 'max'),
#     shortest_trip=('Adjusted_Home_Distance', shortest_non_zero)
# ).reset_index()

# # Also consider the away team's travel for neutral site games
# away_neutral_site_travel_stats = neutral_site_games.groupby(['Away_Team', 'Game_Type']).agg(
#     total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
#     total_distance=('Adjusted_Away_Distance', 'sum'),  # Away team distance for neutral games
#     average_distance=('Adjusted_Away_Distance', 'mean'),
#     longest_trip=('Adjusted_Away_Distance', 'max'),
#     shortest_trip=('Adjusted_Away_Distance', shortest_non_zero)
# ).reset_index()

# # Combine away team stats for on-campus and neutral games
# travel_stats_combined = pd.concat([
#     away_team_on_campus_travel_stats.rename(columns={'Away_Team': 'Team'}),
#     away_neutral_site_travel_stats.rename(columns={'Away_Team': 'Team'}),
#     neutral_site_travel_stats.rename(columns={'Home_Team': 'Team'})
# ], axis=0)

# # Step 10: Aggregate the final travel stats for each team
# team_travel_stats_final = travel_stats_combined.groupby(['Team', 'Game_Type']).agg(
#     total_trips=('total_trips', 'sum'),
#     total_distance=('total_distance', 'sum'),  # Ensure this correctly sums up unique distances
#     average_distance=('average_distance', 'mean'),
#     longest_trip=('longest_trip', 'max'),
#     # get the shortest non-zero trip distance
#     shortest_trip=('shortest_trip', lambda x: x[x > 0].min() if (x > 0).any() else 0)
# ).reset_index()

# # Step 11: Pivot the final travel stats to match the required format
# team_travel_summary_final = team_travel_stats_final.pivot_table(
#     index='Team',
#     columns='Game_Type',
#     values=['total_trips', 'total_distance', 'average_distance', 'longest_trip', 'shortest_trip'],
#     fill_value=0
# )

######### OLD CODE - HANDLES TOTAL DISTANCE INCORRECTLY #########
# # Define a function to get the shortest non-zero distance
# def shortest_non_zero(series):
#     non_zero_values = series[series > 0]
#     if len(non_zero_values) > 0:
#         return non_zero_values.min()  # Return the minimum non-zero value
#     return 0  # If all distances are zero, return zero

# # Group by team and game type for both away and home travel distances
# away_team_travel_stats = schedule_with_coords.groupby(['Away_Team', 'Game_Type']).agg(
#     total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
#     total_distance=('Adjusted_Away_Distance', 'sum'),
#     average_distance=('Adjusted_Away_Distance', 'mean'),
#     longest_trip=('Adjusted_Away_Distance', 'max'),
#     shortest_trip=('Adjusted_Away_Distance', shortest_non_zero)  # Use custom function to get non-zero shortest trip
# ).reset_index()

# # Home teams need separate stats for neutral games
# home_team_travel_stats = schedule_with_coords[schedule_with_coords['Is_Neutral_Game']].groupby(['Home_Team', 'Game_Type']).agg(
#     total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag for neutral games as well
#     total_distance=('Adjusted_Home_Distance', 'sum'),
#     average_distance=('Adjusted_Home_Distance', 'mean'),
#     longest_trip=('Adjusted_Home_Distance', 'max'),
#     shortest_trip=('Adjusted_Home_Distance', shortest_non_zero)  # Use custom function to get non-zero shortest trip
# ).reset_index()

# # Step 9: Combine both the away and home team travel stats into a unified DataFrame
# # Make sure to avoid double-counting distances in the aggregation
# travel_stats_combined = pd.concat([away_team_travel_stats.rename(columns={'Away_Team': 'Team'}), 
#                                    home_team_travel_stats.rename(columns={'Home_Team': 'Team'})], axis=0)

# # Step 10: Aggregate the final travel stats for each team
# team_travel_stats_final = travel_stats_combined.groupby(['Team', 'Game_Type']).agg(
#     total_trips=('total_trips', 'sum'),
#     total_distance=('total_distance', 'sum'),  # Ensure this correctly sums up unique distances
#     average_distance=('average_distance', 'mean'),
#     longest_trip=('longest_trip', 'max'),
#     # get the shortest non-zero trip distance
#     shortest_trip=('shortest_trip', lambda x: x[x > 0].min() if (x > 0).any() else 0)  # Avoid 0 affecting the shortest trip
# ).reset_index()

# # Step 11: Pivot the final travel stats to match the required format
# team_travel_summary_final = team_travel_stats_final.pivot_table(
#     index='Team',
#     columns='Game_Type',
#     values=['total_trips', 'total_distance', 'average_distance', 'longest_trip', 'shortest_trip'],
#     fill_value=0
# )


# Step 12: Flatten the columns for readability
team_travel_summary_final.columns = ['_'.join(col).strip() for col in team_travel_summary_final.columns]

# Step 13: Rename and reorder columns to the desired format
team_travel_summary_reset = team_travel_summary_final.reset_index()

new_column_names_final = {
    'total_trips_Neutral': 'N_trips',
    'total_distance_Neutral': 'N_total_distance',
    'average_distance_Neutral': 'N_AVG',
    'longest_trip_Neutral': 'N_longest',
    'shortest_trip_Neutral': 'N_shortest',
    'total_trips_On-Campus Non-Conference': 'non_con_trips',
    'total_distance_On-Campus Non-Conference': 'non_con_total_distance',
    'average_distance_On-Campus Non-Conference': 'non_con_AVG',
    'longest_trip_On-Campus Non-Conference': 'non_con_longest',
    'shortest_trip_On-Campus Non-Conference': 'non_con_shortest',
    'total_trips_On-Campus Conference': 'con_trips',
    'total_distance_On-Campus Conference': 'con_total_distance',
    'average_distance_On-Campus Conference': 'con_AVG',
    'longest_trip_On-Campus Conference': 'con_longest',
    'shortest_trip_On-Campus Conference': 'con_shortest'
}

team_travel_summary_reset.rename(columns=new_column_names_final, inplace=True)


new_column_names_final = {
    'total_trips_Neutral': 'N_trips',
    'total_distance_Neutral': 'N_total_distance',
    'average_distance_Neutral': 'N_AVG',
    'longest_trip_Neutral': 'N_longest',
    'shortest_trip_Neutral': 'N_shortest',
    'total_trips_On-Campus Non-Conference': 'non_con_trips',
    'total_distance_On-Campus Non-Conference': 'non_con_total_distance',
    'average_distance_On-Campus Non-Conference': 'non_con_AVG',
    'longest_trip_On-Campus Non-Conference': 'non_con_longest',
    'shortest_trip_On-Campus Non-Conference': 'non_con_shortest',
    'total_trips_On-Campus Conference': 'con_trips',
    'total_distance_On-Campus Conference': 'con_total_distance',
    'average_distance_On-Campus Conference': 'con_AVG',
    'longest_trip_On-Campus Conference': 'con_longest',
    'shortest_trip_On-Campus Conference': 'con_shortest'
}

team_travel_summary_reset.rename(columns=new_column_names_final, inplace=True)

# Final columns order
columns_order_final = [
    'Team', 'N_trips', 'N_total_distance', 'N_AVG', 'N_longest', 'N_shortest',
    'non_con_trips', 'non_con_total_distance', 'non_con_AVG', 'non_con_longest', 'non_con_shortest',
    'con_trips', 'con_total_distance', 'con_AVG', 'con_longest', 'con_shortest'
]

team_travel_summary_final_display = team_travel_summary_reset[columns_order_final]

# Display the final table
# team_travel_summary_final_display.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_schedule_filtered['Is_Neutral_Game'] = raw_schedule_filtered.apply(is_neutral_game, axis=1, flags=neutral_flags)


In [4]:
## OUTPUT THE TABLE TO CSV
## IN TEMP FOLDER
output_path = os.path.join('..', 'TEMP', 'team_travel_summary_prelim_v1.csv')

In [5]:
# # Show only Michginan teams to quality check
# team_travel_summary_final_display[team_travel_summary_final_display['Team'].str.contains('Michigan')]

## Show Boston College to quality check
team_travel_summary_final_display[team_travel_summary_final_display['Team'].str.contains('Boston College')]
# Show Boston College schedule (both home and away) to quality check
# schedule_with_coords[(schedule_with_coords['Away_Team'] == 'Boston College') | (schedule_with_coords['Home_Team'] == 'Boston College')]

Unnamed: 0,Team,N_trips,N_total_distance,N_AVG,N_longest,N_shortest,non_con_trips,non_con_total_distance,non_con_AVG,non_con_longest,non_con_shortest,con_trips,con_total_distance,con_AVG,con_longest,con_shortest
10,Boston College,1.0,5.842697,5.842697,5.842697,5.842697,2.0,1843.482623,921.741312,1163.616357,679.866266,11.0,555.820309,50.529119,179.124147,2.857146


## Find The Closest Other Team to Each team as well as how many times they play each other this season

In [6]:
from geopy.distance import geodesic
import numpy as np

# Helper function to calculate the distance between two points (lat, lon)
def calculate_distance(lat1, lon1, lat2, lon2):
    if pd.notnull(lat1) and pd.notnull(lon1) and pd.notnull(lat2) and pd.notnull(lon2):
        return geodesic((lat1, lon1), (lat2, lon2)).miles
    else:
        return None  # Return None if any coordinates are missing

# Function to find the closest team to a specific team in the arena data
def find_closest_team(current_team_row, team_data):
    # Initialize variables to store the closest team and distance
    closest_team = None
    closest_distance = np.inf
    
    # Iterate over each row in the team data
    for _, row in team_data.iterrows():
        # Calculate the distance between the current team and the other team
        distance = calculate_distance(current_team_row['Latitude'], current_team_row['Longitude'], row['Latitude'], row['Longitude'])
        
        # Update the closest team if the distance is smaller
        if distance is not None and distance < closest_distance:
            closest_team = row['Team']
            closest_distance = distance
    
    return closest_team, closest_distance

# Find the closest team to each team in the arena data
closest_teams = []
closest_distances = []

# Iterate over each row in the arena_info_df to find the closest team
for _, row in arena_info_df.iterrows():
    # Exclude the current team from the comparison
    other_teams = arena_info_df[arena_info_df['Team'] != row['Team']]
    closest_team, closest_distance = find_closest_team(row, other_teams)
    
    closest_teams.append(closest_team)
    closest_distances.append(closest_distance)

# Add the closest team and distance to the arena data
arena_info_df['Closest_Team'] = closest_teams
arena_info_df['Closest_Distance'] = closest_distances

# Merge the closest team data with the team travel data
team_travel_summary_reset = team_travel_summary_reset.merge(
    arena_info_df[['Team', 'Closest_Team', 'Closest_Distance']], on='Team', how='left')

# Sort by closest distance to another team (optional, based on your preference)
# team_travel_summary_reset = team_travel_summary_reset.sort_values(by='Closest_Distance')

# Display the updated team travel summary with closest team information
# team_travel_summary_reset.head()


In [7]:
# Rename Schedule dataframe (already using schedule_with_coords)
df_schedule = schedule_with_coords.copy()

# Remove Exhibition games from schedule
df_schedule = df_schedule[df_schedule['Conference'] != 'Exhibition']

# Match the teams in the schedule with their closest team from the travel distance data
# Merging schedule data with closest team info for both home and away teams
df_schedule_merged = df_schedule.merge(
    team_travel_summary_reset[['Team', 'Closest_Team']],
    left_on='Home_Team',
    right_on='Team',
    how='left',
    suffixes=('', '_Closest_Home')
)

df_schedule_merged.rename(columns={'Closest_Team': 'Closest_Team_Home'}, inplace=True)

df_schedule_merged = df_schedule_merged.merge(
    team_travel_summary_reset[['Team', 'Closest_Team']],
    left_on='Away_Team',
    right_on='Team',
    how='left',
    suffixes=('', '_Closest_Away')
)

df_schedule_merged.rename(columns={'Closest_Team': 'Closest_Team_Away'}, inplace=True)

# Now, let's ensure both teams (home and away) are being compared properly
df_schedule_merged['Home_vs_Closest'] = df_schedule_merged['Away_Team'] == df_schedule_merged['Closest_Team_Home']
df_schedule_merged['Away_vs_Closest'] = df_schedule_merged['Home_Team'] == df_schedule_merged['Closest_Team_Away']

# Count how many times each team plays its closest opponent as either home or away
df_closest_match_count_home = df_schedule_merged.groupby('Home_Team').agg({
    'Home_vs_Closest': 'sum'
}).reset_index()

df_closest_match_count_away = df_schedule_merged.groupby('Away_Team').agg({
    'Away_vs_Closest': 'sum'
}).reset_index()

# Merge both home and away counts to ensure consistency for both teams
df_closest_match_total = pd.merge(
    df_closest_match_count_home, 
    df_closest_match_count_away, 
    left_on='Home_Team', 
    right_on='Away_Team', 
    how='outer'
)

# Replace missing values with 0 before summing up
df_closest_match_total['Home_vs_Closest'].fillna(0, inplace=True)
df_closest_match_total['Away_vs_Closest'].fillna(0, inplace=True)

# Calculate the total closest matches by summing up both columns
df_closest_match_total['Total_Closest_Matches'] = df_closest_match_total['Home_vs_Closest'] + df_closest_match_total['Away_vs_Closest']

# Rename columns for clarity and drop unneeded ones
df_closest_match_total = df_closest_match_total[['Home_Team', 'Total_Closest_Matches']].rename(columns={'Home_Team': 'Team'})

# Merge this back into the travel data
team_travel_summary_reset = team_travel_summary_reset.merge(
    df_closest_match_total[['Team', 'Total_Closest_Matches']],
    on='Team',
    how='left'
)

# Fill missing values with 0
team_travel_summary_reset['Total_Closest_Matches'].fillna(0, inplace=True)

# Add Total_Trips column to each team based on the sum of all trip types
team_travel_summary_reset['Total_Trips'] = team_travel_summary_reset['N_trips'] + team_travel_summary_reset['non_con_trips'] + team_travel_summary_reset['con_trips']


# Display the updated travel data with the total closest matches
team_travel_summary_reset.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_closest_match_total['Home_vs_Closest'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_closest_match_total['Away_vs_Closest'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

Unnamed: 0,Team,N_AVG,con_AVG,non_con_AVG,N_longest,con_longest,non_con_longest,N_shortest,con_shortest,non_con_shortest,N_total_distance,con_total_distance,non_con_total_distance,N_trips,con_trips,non_con_trips,Closest_Team,Closest_Distance,Total_Closest_Matches,Total_Trips
0,Air Force,600.521127,1580.446414,888.501485,600.521127,1771.362615,1764.998085,600.521127,1316.604241,12.004884,600.521127,9482.678486,1777.002969,1.0,6.0,2.0,Colorado College,12.004884,2.0,9.0
1,Alaska,2721.985269,0.0,2389.290233,2721.985269,0.0,3275.395086,2721.985269,0.0,260.422806,2721.985269,0.0,28671.482794,1.0,0.0,12.0,Alaska Anchorage,260.422806,6.0,13.0
2,Alaska Anchorage,0.0,0.0,2642.712132,0.0,0.0,3401.368614,0.0,0.0,260.422806,0.0,0.0,31712.545584,0.0,0.0,12.0,Alaska,260.422806,6.0,12.0
3,American Intl,0.0,190.797952,209.194234,0.0,410.16097,565.237531,0.0,39.21897,51.976219,0.0,1335.585666,1045.971171,0.0,7.0,5.0,Massachusetts,18.774338,1.0,12.0
4,Arizona State,1.751265,1059.710428,1464.407292,1.751265,1563.33154,2271.357695,1.751265,543.441609,548.654698,1.751265,6358.262567,4393.221877,1.0,6.0,3.0,Colorado College,543.441609,4.0,10.0


In [8]:
## REORDER COLUMNS FOR READABILITY

# Final column order
final_column_order = [
    'Team', 'Total_Trips', 'N_trips', 'N_total_distance', 'N_AVG', 'N_shortest', 'N_longest',
    'non_con_trips', 'non_con_total_distance', 'non_con_AVG', 'non_con_shortest', 'non_con_longest',
    'con_trips', 'con_total_distance', 'con_AVG', 'con_shortest', 'con_longest',
    'Closest_Team', 'Closest_Distance', 'Total_Closest_Matches'
]

# Reorder the columns of the dataframe
team_travel_summary_reset = team_travel_summary_reset[final_column_order]

# Round all columns to 2 decimal places
team_travel_summary_reset = team_travel_summary_reset.round(2)
# Set the trip count columns to integer type
team_travel_summary_reset[['Total_Trips', 'N_trips', 'non_con_trips', 'con_trips', 'Total_Closest_Matches']] = team_travel_summary_reset[['Total_Trips', 'N_trips', 'non_con_trips', 'con_trips', 'Total_Closest_Matches']].astype(int)




In [9]:
# Clean up the team table a little, it includes some extra teams that are not part of D1

print(f'Original team count: {len(team_travel_summary_reset)}')

## ALL D1 TEAMS
# Define the conference membership mapping
conference_mapping = {
    'atlantic': ['Air Force', "American Intl", 'Army', 'Bentley', 'Canisius', 'Holy Cross', 'Mercyhurst', 
                 'Niagara', 'RIT', 'Robert Morris', 'Sacred Heart'],
    'big_ten': ['Michigan', 'Michigan State', 'Minnesota', 'Notre Dame', 'Ohio State', 'Penn State', 'Wisconsin'],
    'ccha': ['Augustana', 'Bemidji State', 'Bowling Green', 'Ferris State', 'Lake Superior', 'Michigan Tech', 
             'Minnesota State', 'Northern Michigan', 'St Thomas'],
    'ecac': ['Brown', 'Clarkson', 'Colgate', 'Cornell', 'Dartmouth', 'Harvard', 'Princeton', 'Quinnipiac',
             'Rensselaer', 'St Lawrence', 'Union', 'Yale'],
    'hockey_east': ['Boston College', 'Boston University', 'Connecticut', 'Maine', 'Massachusetts', 'Mass Lowell',
                    'Merrimack', 'New Hampshire', 'Northeastern', 'Providence', 'Vermont'],
    'nchc': ['Arizona State', 'Colorado College', 'Denver', 'Miami', 'Minnesota Duluth', 'North Dakota', 'Omaha', 
             'St Cloud State', 'Western Michigan'],
    'independents': ['Alaska Anchorage', 'Alaska', 'Lindenwood', 'Long Island', 'Stonehill']
}

# Create a list of all D1 teams
all_d1_teams = []
for teams in conference_mapping.values():
    all_d1_teams.extend(teams)

# Drop any rows of teams that are not in the list
team_travel_summary_reset = team_travel_summary_reset[team_travel_summary_reset['Team'].isin(all_d1_teams)]

# Display length of final table
print(f'Final team count: {len(team_travel_summary_reset)}')

Original team count: 67
Final team count: 64


In [10]:

# OUTPUT FINISHED TABLE TO data/output FOLDER
output_path = os.path.join('..', 'data', 'output', 'team_travel_summary_final_v1.csv')
team_travel_summary_reset.to_csv(output_path, index=False)

# Display the updated dataframe with the new column order
team_travel_summary_reset.head()

Unnamed: 0,Team,Total_Trips,N_trips,N_total_distance,N_AVG,N_shortest,N_longest,non_con_trips,non_con_total_distance,non_con_AVG,non_con_shortest,non_con_longest,con_trips,con_total_distance,con_AVG,con_shortest,con_longest,Closest_Team,Closest_Distance,Total_Closest_Matches
0,Air Force,9,1,600.52,600.52,600.52,600.52,2,1777.0,888.5,12.0,1765.0,6,9482.68,1580.45,1316.6,1771.36,Colorado College,12.0,2
1,Alaska,13,1,2721.99,2721.99,2721.99,2721.99,12,28671.48,2389.29,260.42,3275.4,0,0.0,0.0,0.0,0.0,Alaska Anchorage,260.42,6
2,Alaska Anchorage,12,0,0.0,0.0,0.0,0.0,12,31712.55,2642.71,260.42,3401.37,0,0.0,0.0,0.0,0.0,Alaska,260.42,6
3,American Intl,12,0,0.0,0.0,0.0,0.0,5,1045.97,209.19,51.98,565.24,7,1335.59,190.8,39.22,410.16,Massachusetts,18.77,1
4,Arizona State,10,1,1.75,1.75,1.75,1.75,3,4393.22,1464.41,548.65,2271.36,6,6358.26,1059.71,543.44,1563.33,Colorado College,543.44,4


## Calculate the Stats by Conference - NOT ACCURATE BASED CON CHECK OF N_AVG IN FINAL TABLE
- may be having the same issue as the other table aggrigation did

In [11]:
## SHow boston college schedule to quality check
# df_schedule_merged[(df_schedule_merged['Away_Team'] == 'Boston College') | (df_schedule_merged['Home_Team'] == 'Boston College')]

# Show BC in final table to quality check
team_travel_summary_reset[team_travel_summary_reset['Team'].str.contains('Boston')]

Unnamed: 0,Team,Total_Trips,N_trips,N_total_distance,N_AVG,N_shortest,N_longest,non_con_trips,non_con_total_distance,non_con_AVG,non_con_shortest,non_con_longest,con_trips,con_total_distance,con_AVG,con_shortest,con_longest,Closest_Team,Closest_Distance,Total_Closest_Matches
10,Boston College,14,1,5.84,5.84,5.84,5.84,2,1843.48,921.74,679.87,1163.62,11,555.82,50.53,2.86,179.12,Boston University,2.86,2
11,Boston University,14,2,2997.6,1498.8,2.99,2994.6,2,1435.92,717.96,117.52,1318.4,10,679.64,67.96,1.76,214.6,Harvard,1.08,1


In [12]:
# Define the conference membership mapping
conference_mapping = {
    'atlantic': ['Air Force', "American Intl", 'Army', 'Bentley', 'Canisius', 'Holy Cross', 'Mercyhurst', 
                 'Niagara', 'RIT', 'Robert Morris', 'Sacred Heart'],
    'big_ten': ['Michigan', 'Michigan State', 'Minnesota', 'Notre Dame', 'Ohio State', 'Penn State', 'Wisconsin'],
    'ccha': ['Augustana', 'Bemidji State', 'Bowling Green', 'Ferris State', 'Lake Superior', 'Michigan Tech', 
             'Minnesota State', 'Northern Michigan', 'St Thomas'],
    'ecac': ['Brown', 'Clarkson', 'Colgate', 'Cornell', 'Dartmouth', 'Harvard', 'Princeton', 'Quinnipiac',
             'Rensselaer', 'St Lawrence', 'Union', 'Yale'],
    'hockey_east': ['Boston College', 'Boston University', 'Connecticut', 'Maine', 'Massachusetts', 'Mass Lowell',
                    'Merrimack', 'New Hampshire', 'Northeastern', 'Providence', 'Vermont'],
    'nchc': ['Arizona State', 'Colorado College', 'Denver', 'Miami', 'Minnesota Duluth', 'North Dakota', 'Omaha', 
             'St Cloud State', 'Western Michigan'],
    'independents': ['Alaska Anchorage', 'Alaska', 'Lindenwood', 'Long Island', 'Stonehill']
}

# Reverse the mapping to map each team to its conference
team_to_conference = {team: conf for conf, teams in conference_mapping.items() for team in teams}

# Add a new column for Conference in the team travel summary
team_travel_summary_reset['Conference'] = team_travel_summary_reset['Team'].map(team_to_conference)

# Group by conference and calculate average values for each travel metric
conference_travel_stats = team_travel_summary_reset.groupby('Conference').agg(
    N_trips_AVG=('N_trips', 'mean'),
    N_total_distance_AVG=('N_total_distance', 'mean'),
    # N_AVG=('N_AVG', 'mean'),
    N_shortest_AVG=('N_shortest', 'mean'),
    N_longest_AVG=('N_longest', 'mean'),
    non_con_trips_AVG=('non_con_trips', 'mean'),
    non_con_total_distance_AVG=('non_con_total_distance', 'mean'),
    # non_con_AVG=('non_con_AVG', 'mean'),
    non_con_shortest_AVG=('non_con_shortest', 'mean'),
    non_con_longest_AVG=('non_con_longest', 'mean'),
    con_trips_AVG=('con_trips', 'mean'),
    con_total_distance_AVG=('con_total_distance', 'mean'),
    # con_AVG=('con_AVG', 'mean'),
    con_shortest_AVG=('con_shortest', 'mean'),
    con_longest_AVG=('con_longest', 'mean')
).reset_index()

# Calulate the average trip distance for each conference and each game type
conference_travel_stats['N_AVG'] = conference_travel_stats['N_total_distance_AVG'] / conference_travel_stats['N_trips_AVG']
conference_travel_stats['non_con_AVG'] = conference_travel_stats['non_con_total_distance_AVG'] / conference_travel_stats['non_con_trips_AVG']
conference_travel_stats['con_AVG'] = conference_travel_stats['con_total_distance_AVG'] / conference_travel_stats['con_trips_AVG']

# Claculate AVG_Total_Trips
conference_travel_stats['AVG_Total_Trips'] = conference_travel_stats['N_trips_AVG'] + conference_travel_stats['non_con_trips_AVG'] + conference_travel_stats['con_trips_AVG']

# Reorder so AVG_Total_Trips is first
columns_order_final = [
    'Conference', 'AVG_Total_Trips',
    'N_trips_AVG', 'N_total_distance_AVG', 'N_AVG', 'N_shortest_AVG', 'N_longest_AVG',
    'non_con_trips_AVG', 'non_con_total_distance_AVG', 'non_con_AVG', 'non_con_shortest_AVG', 'non_con_longest_AVG',
    'con_trips_AVG', 'con_total_distance_AVG', 'con_AVG', 'con_shortest_AVG', 'con_longest_AVG'
]

conference_travel_stats = conference_travel_stats[columns_order_final]

# Round all columns to 2 decimal places for better readability
conference_travel_stats = conference_travel_stats.round(2)

# Output the aggregated conference travel stats the data/output folder
output_path = os.path.join('..', 'data', 'output', 'conference_travel_stats_v1.csv')
conference_travel_stats.to_csv(output_path, index=False)

# Display the aggregated results for each conference
conference_travel_stats.head(6)


Unnamed: 0,Conference,AVG_Total_Trips,N_trips_AVG,N_total_distance_AVG,N_AVG,N_shortest_AVG,N_longest_AVG,non_con_trips_AVG,non_con_total_distance_AVG,non_con_AVG,non_con_shortest_AVG,non_con_longest_AVG,con_trips_AVG,con_total_distance_AVG,con_AVG,con_shortest_AVG,con_longest_AVG
0,atlantic,11.18,0.36,465.46,1280.03,465.46,465.46,3.0,792.11,264.04,127.38,502.69,7.82,3071.72,392.89,162.28,1244.12
1,big_ten,10.29,2.0,952.02,476.01,95.84,884.62,2.14,1379.59,643.81,327.15,1021.88,6.14,2032.64,330.89,153.12,616.01
2,ccha,10.0,0.78,293.35,377.16,88.23,250.01,2.56,1449.15,567.06,169.91,915.1,6.67,2173.39,326.01,124.21,582.66
3,ecac,14.25,0.75,458.19,610.92,30.7,433.56,2.33,424.15,181.78,80.92,245.56,11.17,1769.76,158.49,38.22,253.96
4,hockey_east,13.73,1.45,1468.11,1009.32,574.98,1183.25,2.18,828.59,379.77,140.65,649.66,10.09,864.14,85.64,44.87,215.91
5,independents,12.4,0.2,544.4,2721.99,544.4,544.4,12.2,15564.55,1275.78,196.6,2380.2,0.0,0.0,,0.0,0.0


## CHECKING AND VERIFICATION

In [13]:
## Show Boston COllege to validate
team_travel_summary_reset[team_travel_summary_reset['Team'].str.contains('Boston College')]

# # Show full Boston COllege schedule to validate
# df_schedule_merged[(df_schedule_merged['Away_Team'] == 'Boston College') | (df_schedule_merged['Home_Team'] == 'Boston College')]

# # Show Rows on the schedule from the Beanpot Tournament
df_schedule_merged[df_schedule_merged['Conference'].str.contains('Beanpot')]
# Show Great Lakes Invitational
# df_schedule_merged[df_schedule_merged['Conference'].str.contains('Great Lakes Invitational')]

Unnamed: 0,Date,Conference,Game_Notes,Away_Team,Away_Team_Link,Away_Score,Home_Team,Home_Team_Link,Home_Score,OT,...,Home_Distance,Travel_Flag,Adjusted_Away_Distance,Adjusted_Home_Distance,Team,Closest_Team_Home,Team_Closest_Away,Closest_Team_Away,Home_vs_Closest,Away_vs_Closest
382,2025-02-03,"Beanpot (at TD Garden, Boston)",,Harvard,/reports/team/Harvard/22,,Boston University,/reports/team/Boston-University/10,,,...,2.992651,1,3.215753,2.992651,Boston University,Harvard,Harvard,Boston University,True,True
719,2025-02-03,"Beanpot (at TD Garden, Boston)",,Northeastern,/reports/team/Northeastern/41,,Boston College,/reports/team/Boston-College/9,,,...,5.842697,1,2.245693,5.842697,Boston College,Boston University,Northeastern,Boston University,False,False


In [14]:
# Filter to show all neutral games in schedule
# df_schedule_merged[df_schedule_merged['Is_Neutral_Game']]
# Only Show Date, Conference , Game Notes, Away_Team, Home_Team, and Travel Flag
# df_schedule_merged[df_schedule_merged['Is_Neutral_Game']][['Date', 'Conference', 'Game_Notes', 'Away_Team', 'Home_Team', 'Travel_Flag']]