# Distance Notebook
### Calculate the distance each team will need to travel over the course of the season

#### Note

In [1]:
# Dependencies
import os
import sys
import pandas as pd
import numpy as np
import geopy
from geopy.distance import geodesic



In [2]:
## Paths to source data
# SCHEDULE FILE
schedule_path = os.path.join('..', 'data', 'schedule', '2024_current.csv')
schedule_data = pd.read_csv(schedule_path)
# ARENA INFO FILE
arena_path = os.path.join('..', 'data', 'arena_school_info.csv')
arena_data = pd.read_csv(arena_path)

# NEUTRAL SITE ARENA INFORMATION FILE
neutral_path = os.path.join('..', 'data', 'neutral_arenas_2024.csv')
neutral_arenas_df = pd.read_csv(neutral_path)

# Display data
# schedule_data.head()
# schedule_data.tail()
# arena_data.head()
# neutral_arenas_df.tail()

### Initial Transformation
- remove Exhibition games

In [3]:
## Drop Exhibition games from schedule
# If 'Exhibition' in Conference column, drop row

# Print Schedule length
print(f"Schedule length before dropping Exhibition games: {len(schedule_data)}")

# Drop rows with 'Exhibition' in Conference column
schedule_data = schedule_data[schedule_data['Conference'] != 'Exhibition']

# Print Schedule length
print(f"Schedule length after dropping Exhibition games: {len(schedule_data)}")


Schedule length before dropping Exhibition games: 1130
Schedule length after dropping Exhibition games: 1081


## Account for neutral site games

In [4]:
# Identify rows in the schedule table that involve neutral site games
# by checking if the 'Conference' or 'Game_Notes' contains a match with the 'Flag' in the neutral arenas table.

# Helper function to find if any flag appears in the Conference or Game_Notes columns
def is_neutral_game(row, flags):
    # Check for flags in both Conference and Game_Notes, ensuring correct handling of NaN
    conference_match = any(flag in str(row['Conference']) for flag in flags)
    notes_match = any(flag in str(row['Game_Notes']) for flag in flags)
    return conference_match or notes_match

# Extract the list of flags from the neutral arenas table (assuming already loaded neutral_arenas_df)
neutral_flags = neutral_arenas_df['Flag'].tolist()

# Apply the function to the schedule data to identify neutral site games
schedule_data['Is_Neutral_Game'] = schedule_data.apply(is_neutral_game, axis=1, flags=neutral_flags)

# Filter the schedule for neutral site games
neutral_site_games = schedule_data[schedule_data['Is_Neutral_Game']]

### Calculate the distance to neutral site locations

In [5]:
### refactor the code to include the distance between the two teams


# Helper function to calculate the distance between two points (lat, lon)
def calculate_distance(lat1, lon1, lat2, lon2):
    if pd.notnull(lat1) and pd.notnull(lon1) and pd.notnull(lat2) and pd.notnull(lon2):
        return geodesic((lat1, lon1), (lat2, lon2)).miles
    else:
        return None  # Return None if any coordinates are missing

# Function to merge team location data and neutral arenas
def merge_team_and_arena_data(schedule_df, team_df, arena_df):
    # Merge team locations (away and home teams)
    schedule_df = schedule_df.merge(
        team_df[['Team', 'Latitude', 'Longitude']], 
        left_on='Away_Team', right_on='Team', how='left', suffixes=('', '_away')
    )
    schedule_df = schedule_df.merge(
        team_df[['Team', 'Latitude', 'Longitude']], 
        left_on='Home_Team', right_on='Team', how='left', suffixes=('', '_home')
    )

    # Merge on 'Conference' column first
    schedule_df = schedule_df.merge(
        arena_df[['Flag', 'latitude', 'longitude']], 
        left_on='Conference', right_on='Flag', how='left'
    )

    # Handle rows where the Conference merge did not work by checking 'Game_Notes'
    missing_coords_df = schedule_df[schedule_df['latitude'].isnull()].copy()

    def match_flag_in_game_notes(row, arena_df):
        for _, flag_row in arena_df.iterrows():
            if flag_row['Flag'] in str(row['Game_Notes']):
                return flag_row['latitude'], flag_row['longitude']
        return None, None

    # Apply partial matching function for missing coordinates
    missing_coords_df[['latitude', 'longitude']] = missing_coords_df.apply(
        lambda row: match_flag_in_game_notes(row, arena_df), axis=1, result_type="expand"
    )

    # Fill missing latitude/longitude
    schedule_df.loc[schedule_df['latitude'].isnull(), ['latitude', 'longitude']] = missing_coords_df[['latitude', 'longitude']]

    return schedule_df

# Output results
# output_path = '../TEMP/neutral_site_games_distances_TEST2.csv'
# neutral_site_games.to_csv(output_path, index=False)

## PRINT DF INFOR BEFORE THE CLEANING
# print(neutral_site_games.info())

In [6]:
### CLEANING BASED ON LOOK AT OUTPUT FROM CELL ABOVE


# Remove Rows that have TBD or a / in one of the team columns
neutral_site_games = neutral_site_games[~neutral_site_games['Home_Team'].str.contains('/')]
neutral_site_games = neutral_site_games[~neutral_site_games['Away_Team'].str.contains('/')]
neutral_site_games = neutral_site_games[~neutral_site_games['Home_Team'].str.contains('TBD')]
neutral_site_games = neutral_site_games[~neutral_site_games['Away_Team'].str.contains('TBD')]

# Print DF INFO AFTER CLEANING
# print(neutral_site_games.info())

# OUTPUT CSV FOR CHECKING INTO TEMP FOLDER
# output_path = '../TEMP/neutral_site_games.csv'
# neutral_site_games.to_csv(output_path, index=False)

In [7]:
# neutral_site_games.head()

In [8]:
## New Approach - create a new table and structure for the neutral site games - add them to the agg count at the end

## Assign a location to each game - based on the FLag column from neutral_arenas_df

# Drop the columns that are not needed
neutral_site_games = neutral_site_games.drop(columns=['Away_Team_Link', 'Away_Score', 'Home_Team_Link', 'Home_Score', 'OT', 'Box_Link', 'Metrics_Link'])

# Reinex the DF
neutral_site_games.reset_index(drop=True, inplace=True)

# If Game_Notes is NaN fill with Conference
neutral_site_games['Game_Notes'] = neutral_site_games['Game_Notes'].fillna(neutral_site_games['Conference'])

# Assign a location to each game
# Look for Game_Notes that contain the Flag from neutral_arenas_df
# If there is a match, assign the location to the game

# Helper function to assign latitude and longitude to each game
def assign_location(row, arena_df):
    for _, flag_row in arena_df.iterrows():
        if flag_row['Flag'] in str(row['Game_Notes']):
            return flag_row['latitude'], flag_row['longitude']
    return None, None

# Extract the list of flags from the neutral arenas table (assuming already loaded neutral_arenas_df)
neutral_flags = neutral_arenas_df['Flag'].tolist()

# Apply the function to the schedule data to identify neutral site games
neutral_site_games[['latitude', 'longitude']] = neutral_site_games.apply(assign_location, axis=1, arena_df=neutral_arenas_df, result_type="expand")

# If Gama_Notes is empty or an empty string, fill with Conference value
# neutral_site_games['Game_Notes'] = neutral_site_games['Game_Notes'].replace('', neutral_site_games['Conference']) # NOT WORKING AT ALL - THROWING ERROR
# neutral_site_games['Game_Notes'] = neutral_site_games['Game_Notes'].fillna(neutral_site_games['Conference']) # Not working must be empty string not NaN


# neutral_site_games.head()
# neutral_site_games.tail()
# neutral_site_games

In [9]:
### Calculate the distance between the two teams for each game

# Helper function to calculate the distance between two points (lat, lon)
def calculate_distance(lat1, lon1, lat2, lon2):
    if pd.notnull(lat1) and pd.notnull(lon1) and pd.notnull(lat2) and pd.notnull(lon2):
        return geodesic((lat1, lon1), (lat2, lon2)).miles
    else:
        return None  # Return None if any coordinates are missing

# Function to calculate distances to neutral site
def calculate_team_distances(schedule_df):
    schedule_df['Away_Distance'] = schedule_df.apply(
        lambda row: calculate_distance(row['Latitude'], row['Longitude'], row['latitude'], row['longitude']), axis=1
    )
    schedule_df['Home_Distance'] = schedule_df.apply(
        lambda row: calculate_distance(row['Latitude_home'], row['Longitude_home'], row['latitude'], row['longitude']), axis=1
    )

    return schedule_df

# Refactor into steps
neutral_site_games = merge_team_and_arena_data(schedule_data, arena_data, neutral_arenas_df)
neutral_site_games = calculate_team_distances(neutral_site_games)

# Filter out rows with missing distances
neutral_site_games = neutral_site_games.dropna(subset=['Away_Distance', 'Home_Distance'])


# Output results
# output_path = '../TEMP/neutral_site_games_distances_TESTv3.csv'
# neutral_site_games.to_csv(output_path, index=False)

# neutral_site_games.head()
# neutral_site_games.tail()
# neutral_site_games.info()

In [10]:
### DESIRED OUTPUT
# Date, Game_ID, Game_Notes, Team, Distance for each team in each game

# Create a new DataFrame to store the results
neutral_site_games_agg = pd.DataFrame(columns=['Date', 'Game_ID', 'Game_Notes', 'Team', 'N_Distance'])

# Iterate through the neutral_site_games DataFrame and add the data to the new DataFrame
rows = []  # Use a list to accumulate rows for better performance
for index, row in neutral_site_games.iterrows():
    # Add the Away Team data
    rows.append({
        'Date': row['Date'],
        'Game_ID': row['Game_ID'],
        'Game_Notes': row['Game_Notes'],
        'Team': row['Away_Team'],
        'N_Distance': row['Away_Distance']
    })
    
    # Add the Home Team data
    rows.append({
        'Date': row['Date'],
        'Game_ID': row['Game_ID'],
        'Game_Notes': row['Game_Notes'],
        'Team': row['Home_Team'],
        'N_Distance': row['Home_Distance']
    })

# Convert the list of rows into a DataFrame and concatenate
neutral_site_games_agg_1 = pd.concat([neutral_site_games_agg, pd.DataFrame(rows)], ignore_index=True)

# Output results
output_path = '../TEMP/neutral_site_games_agg.csv'
neutral_site_games_agg_1.to_csv(output_path, index=False)

# Display the first and last few rows of the DataFrame
neutral_site_games_agg_1.head()
# neutral_site_games_agg_1.tail()

  neutral_site_games_agg_1 = pd.concat([neutral_site_games_agg, pd.DataFrame(rows)], ignore_index=True)


Unnamed: 0,Date,Game_ID,Game_Notes,Team,N_Distance
0,2024-10-11,2024-10-11_Massachusetts_Omaha,,Omaha,1093.593499
1,2024-10-11,2024-10-11_Massachusetts_Omaha,,Massachusetts,2304.523378
2,2024-10-11,2024-10-11_Air Force_Minnesota,,Minnesota,1302.776665
3,2024-10-11,2024-10-11_Air Force_Minnesota,,Air Force,600.521127
4,2024-10-12,2024-10-12_North Dakota_Providence,US Hockey Hall of Fame game,Providence,1319.532097


## NON NEUTRAL GAMES

In [11]:
# Merge the schedule data with the arena data to include home and away team locations

# First, ensure team names match between datasets
# We will merge on the 'Team' column in the arena data and 'Home_Team'/'Away_Team' in the schedule data
merged_data = schedule_data.merge(arena_data[['Team', 'Latitude', 'Longitude']], left_on='Home_Team', right_on='Team', how='left')
merged_data = merged_data.rename(columns={'Latitude': 'Home_Latitude', 'Longitude': 'Home_Longitude'})

# Merge again for the away teams
merged_data = merged_data.merge(arena_data[['Team', 'Latitude', 'Longitude']], left_on='Away_Team', right_on='Team', how='left')
merged_data = merged_data.rename(columns={'Latitude': 'Away_Latitude', 'Longitude': 'Away_Longitude'})

# Drop the unnecessary 'Team' columns from the merged data
merged_data = merged_data.drop(columns=['Team_x', 'Team_y'])

# Display the first few rows of the merged data to verify the result
# merged_data.head()

### Calculate the distance between each school
- Using Haversine equation to calculate the straight line distance between two sets of lat/lon coodinates

In [12]:
## VERSION 2 USING GEOPY
# Function to calculate distance with NaN check
def calculate_distance(row):
    # Check for NaN values in lat/long coordinates
    if (np.isnan(row['Home_Latitude']) or np.isnan(row['Home_Longitude']) or
        np.isnan(row['Away_Latitude']) or np.isnan(row['Away_Longitude'])):
        return np.nan  # Return NaN if any of the coordinates are missing
    else:
        # Calculate the distance if all coordinates are present
        return geodesic((row['Home_Latitude'], row['Home_Longitude']), 
                        (row['Away_Latitude'], row['Away_Longitude'])).miles

# Apply the function to calculate the distance between the home and away arenas
merged_data['Distance_Miles'] = merged_data.apply(calculate_distance, axis=1)

# Display the updated data with the calculated distance
merged_data[['Home_Team', 'Away_Team', 'Distance_Miles']].head()


Unnamed: 0,Home_Team,Away_Team,Distance_Miles
0,Lake Superior,Michigan State,259.792866
1,Michigan,Minnesota State,534.090775
2,Air Force,Arizona State,548.654698
3,Union,Providence,144.15153
4,Boston University,Holy Cross,36.229678


### Filter out results to avoid double counting games on weekend series

- Travel_Flag to account for consecutive games played at the same venue within a 3-day span. If a team plays multiple games at the same venue within this period, travel is only counted for the first game.

- The Adjusted_Travel_Distance column reflects the distance a team will travel for each game, considering the consecutive game rule.

In [13]:
### VERSION 2
# Update logic to handle non-consecutive rows by grouping first

# Convert 'Date' column to datetime format for easier manipulation
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

# Reset the travel flag
merged_data['Travel_Flag'] = 1

# Sort the data by 'Away_Team', 'Home_Team', and 'Date' to ensure games are grouped correctly
merged_data = merged_data.sort_values(by=['Away_Team', 'Home_Team', 'Date'])

# Group by 'Away_Team' and 'Home_Team', then iterate through each group to set the travel flag
for (away_team, home_team), group in merged_data.groupby(['Away_Team', 'Home_Team']):
    group = group.sort_values(by='Date')  # Sort by date within each group
    
    # Iterate through the group to check for consecutive games
    for i in range(1, len(group)):
        current_game = group.iloc[i]
        previous_game = group.iloc[i - 1]
        
        # Check if the games are within 3 days
        if (current_game['Date'] - previous_game['Date']).days <= 3:
            # Set the travel flag to 0 for the current game
            merged_data.loc[current_game.name, 'Travel_Flag'] = 0

# Only consider rows where travel flag is 1 for calculating total travel distance
merged_data['Adjusted_Travel_Distance'] = merged_data['Distance_Miles'] * merged_data['Travel_Flag']

# Display the updated data with the travel flag and adjusted distance
merged_data[['Away_Team', 'Home_Team', 'Date', 'Distance_Miles', 'Travel_Flag', 'Adjusted_Travel_Distance']].head()

# OUTPUT TABLE TO TEMP FILE FOR TESTING
output_path = os.path.join('..', 'TEMP', 'schedule_w_distance.csv')
merged_data.to_csv(output_path, index=False)

schedule_w_distance = merged_data.copy()

## Aggregate Total Travel Distance for Each Team AND
## Calculate and store the Trip Count and the Average trip distance

In [14]:
# Only consider rows where travel flag is 1 for calculating total travel distance
merged_data['Adjusted_Travel_Distance'] = merged_data['Distance_Miles'] * merged_data['Travel_Flag']

# Calculate the total travel distance per team
team_travel_distances = merged_data.groupby('Away_Team')['Adjusted_Travel_Distance'].sum().reset_index()
team_travel_distances.columns = ['Team', 'Total_Travel_Distance']

# Step 4: Adding Trip Count and Average Trip Distance

# Calculate the number of trips for each team
trip_count = merged_data[merged_data['Travel_Flag'] == 1].groupby('Away_Team').size().reset_index(name='Trip_Count')

# Merge trip count with travel distances
team_travel_data = pd.merge(team_travel_distances, trip_count, left_on='Team', right_on='Away_Team', how='left').drop(columns='Away_Team')

# Calculate average trip distance
team_travel_data['Average_Trip_Distance'] = team_travel_data['Total_Travel_Distance'] / team_travel_data['Trip_Count']

# Display the top 5 teams with the highest average trip distance
team_travel_data = team_travel_data.sort_values(by='Average_Trip_Distance', ascending=False)
team_travel_data.head()

Unnamed: 0,Team,Total_Travel_Distance,Trip_Count,Average_Trip_Distance
1,Alaska,25759.663688,12,2146.638641
0,Air Force,9555.349393,9,1061.705488
4,Arizona State,8102.296536,9,900.255171
54,Penn State,6464.519475,9,718.279942
23,Denver,7047.934161,10,704.793416


## Find The Closest Other Team to Each team and Store

In [15]:
from geopy.distance import geodesic
import pandas as pd

# Helper function to calculate the distance between two points (lat, lon)
def calculate_distance(lat1, lon1, lat2, lon2):
    if pd.notnull(lat1) and pd.notnull(lon1) and pd.notnull(lat2) and pd.notnull(lon2):
        return geodesic((lat1, lon1), (lat2, lon2)).miles
    else:
        return None  # Return None if any coordinates are missing

# Function to find the closest team to a specific team in the arena data
def find_closest_team(current_team_row, team_data):
    # Initialize variables to store the closest team and distance
    closest_team = None
    closest_distance = np.inf
    
    # Iterate over each row in the team data
    for _, row in team_data.iterrows():
        # Calculate the distance between the current team and the other team
        distance = calculate_distance(current_team_row['Latitude'], current_team_row['Longitude'], row['Latitude'], row['Longitude'])
        
        # Update the closest team if the distance is smaller
        if distance is not None and distance < closest_distance:
            closest_team = row['Team']
            closest_distance = distance
    
    return closest_team, closest_distance

# Find the closest team to each team in the arena data
closest_teams = []
closest_distances = []

# Iterate over each row in the arena data to find the closest team
for _, row in arena_data.iterrows():
    # Exclude the current team from the comparison
    other_teams = arena_data[arena_data['Team'] != row['Team']]
    closest_team, closest_distance = find_closest_team(row, other_teams)
    
    closest_teams.append(closest_team)
    closest_distances.append(closest_distance)

# Add the closest team and distance to the arena data
arena_data['Closest_Team'] = closest_teams
arena_data['Closest_Distance'] = closest_distances

# Merge the closest team data with the team travel data
team_travel_data = team_travel_data.merge(arena_data[['Team', 'Closest_Team', 'Closest_Distance']], on='Team', how='left')

# Sort by closest distance to another team
team_travel_data = team_travel_data.sort_values(by='Closest_Distance')

# Display the top 5 teams with the highest average trip distance and closest team information
# team_travel_data.tail(25)

# Add The neutral site data to the aggrigated results

In [16]:
## REFACTOR

# Group by team to calculate total neutral site distances and count neutral site games

neutral_site_games_agg = neutral_site_games_agg_1.groupby('Team').agg({'N_Distance': 'sum', 'Game_ID': 'nunique'}).reset_index()
neutral_site_games_agg.rename(columns={'Game_ID': 'Neutral_Site_Trips'}, inplace=True)

# Merge the neutral site data (distances and game counts) with the team travel data
team_travel_data_refactored = pd.merge(team_travel_data, neutral_site_games_agg, on='Team', how='left')

# Fill missing values for teams without neutral site games
team_travel_data_refactored['N_Distance'] = team_travel_data_refactored['N_Distance'].fillna(0)
team_travel_data_refactored['Neutral_Site_Trips'] = team_travel_data_refactored['Neutral_Site_Trips'].fillna(0)

# Calculate regular trip stats (excluding neutral site games)
team_travel_data_refactored['Reg_Distance'] = team_travel_data_refactored['Total_Travel_Distance']
team_travel_data_refactored['Reg_Trips'] = team_travel_data_refactored['Trip_Count']
team_travel_data_refactored['Reg_AVG'] = team_travel_data_refactored['Reg_Distance'] / team_travel_data_refactored['Reg_Trips']

# Calculate total distance and average with neutral site trips included
team_travel_data_refactored['Total_Distance'] = team_travel_data_refactored['Reg_Distance'] + team_travel_data_refactored['N_Distance']
team_travel_data_refactored['N_AVG'] = team_travel_data_refactored['N_Distance'] / team_travel_data_refactored['Neutral_Site_Trips']
team_travel_data_refactored['Overall_AVG'] = team_travel_data_refactored['Total_Distance'] / (team_travel_data_refactored['Reg_Trips'] + team_travel_data_refactored['Neutral_Site_Trips'])

# Select and reorder the columns
team_travel_data_refactored = team_travel_data_refactored[[
    'Team', 
    'Reg_Distance', 
    'Reg_Trips', 
    'Reg_AVG', 
    'N_Distance', 
    'Neutral_Site_Trips', 
    'N_AVG', 
    'Total_Distance', 
    'Overall_AVG'
]]

# Add the Closest Team and CTeam_Distance columns back at the end of the table
team_travel_data_refactored = pd.merge(team_travel_data_refactored, 
                                       team_travel_data[['Team', 'Closest_Team', 'Closest_Distance']], 
                                       on='Team', how='left')

# Reorder the columns to place Closest Team and CTeam_Distance at the end
team_travel_data_refactored = team_travel_data_refactored[[
    'Team', 
    'Reg_Distance', 
    'Reg_Trips', 
    'Reg_AVG', 
    'N_Distance', 
    'Neutral_Site_Trips', 
    'N_AVG', 
    'Total_Distance', 
    'Overall_AVG', 
    'Closest_Team', 
    'Closest_Distance'
]]

# Drop rows with 0 regular travel distance
team_travel_data = team_travel_data_refactored[team_travel_data_refactored['Reg_Distance'] != 0]


## Find How many times each team plays their closest rival

In [17]:
## Rename Schedule dataframe

df_schedule = schedule_data.copy()

# Remove Exhibition games from schedule
df_schedule = df_schedule[df_schedule['Conference'] != 'Exhibition']


# Match the teams in the schedule with their closest team from the travel distance data
# Extract the teams from the schedule and cross-check against the closest team

# Merging schedule data with closest team info for both home and away teams
df_schedule_merged = schedule_data.merge(
    team_travel_data[['Team', 'Closest_Team']],
    left_on='Home_Team',
    right_on='Team',
    how='left',
    suffixes=('', '_Closest_Home')
)

df_schedule_merged.rename(columns={'Closest_Team': 'Closest_Team_Home'}, inplace=True)

df_schedule_merged = df_schedule_merged.merge(
    team_travel_data[['Team', 'Closest_Team']],
    left_on='Away_Team',
    right_on='Team',
    how='left',
    suffixes=('', '_Closest_Away')
)

df_schedule_merged.rename(columns={'Closest_Team': 'Closest_Team_Away'}, inplace=True)

# Now, let's ensure both teams (home and away) are being compared properly
df_schedule_merged['Home_vs_Closest'] = df_schedule_merged['Away_Team'] == df_schedule_merged['Closest_Team_Home']
df_schedule_merged['Away_vs_Closest'] = df_schedule_merged['Home_Team'] == df_schedule_merged['Closest_Team_Away']

# Count how many times each team plays its closest opponent as either home or away
df_closest_match_count_home = df_schedule_merged.groupby('Home_Team').agg({
    'Home_vs_Closest': 'sum'
}).reset_index()

df_closest_match_count_away = df_schedule_merged.groupby('Away_Team').agg({
    'Away_vs_Closest': 'sum'
}).reset_index()

# Merge both home and away counts to ensure consistency for both teams
df_closest_match_total = pd.merge(
    df_closest_match_count_home, 
    df_closest_match_count_away, 
    left_on='Home_Team', 
    right_on='Away_Team', 
    how='outer'
)

# Replace missing values with 0 before summing up
df_closest_match_total['Home_vs_Closest'].fillna(0, inplace=True)
df_closest_match_total['Away_vs_Closest'].fillna(0, inplace=True)

# Calculate the total closest matches by summing up both columns
df_closest_match_total['Total_Closest_Matches'] = df_closest_match_total['Home_vs_Closest'] + df_closest_match_total['Away_vs_Closest']

# Rename columns for clarity and drop unneeded ones
df_closest_match_total = df_closest_match_total[['Home_Team', 'Total_Closest_Matches']].rename(columns={'Home_Team': 'Team'})

# Merge this back into the travel data
team_travel_data_final = team_travel_data.merge(
    df_closest_match_total[['Team', 'Total_Closest_Matches']],
    on='Team',
    how='left'
)

# Fill missing values with 0
team_travel_data_final['Total_Closest_Matches'].fillna(0, inplace=True)


# # Merge the schedule with the closest team data for both home and away teams
# df_schedule_merged = df_schedule.merge(team_travel_data[['Team', 'Closest_Team']], 
#                                        left_on='Home_Team', right_on='Team', how='left')

# # Now, let's check how many games each team plays against their closest team (both home and away)
# df_schedule_merged['Home_vs_Closest'] = df_schedule_merged['Away_Team'] == df_schedule_merged['Closest_Team']

# # Do the same for the away teams
# df_schedule_merged = df_schedule_merged.merge(team_travel_data[['Team', 'Closest_Team']], 
#                                               left_on='Away_Team', right_on='Team', how='left', suffixes=('_home', '_away'))

# df_schedule_merged['Away_vs_Closest'] = df_schedule_merged['Home_Team'] == df_schedule_merged['Closest_Team_away']

# # Now count how many times each team plays against their closest team, either as home or away
# df_closest_match_count = df_schedule_merged.groupby('Team_home').agg({
#     'Home_vs_Closest': 'sum',
#     'Away_vs_Closest': 'sum'
# }).reset_index()

# df_closest_match_count['Total_Closest_Matches'] = df_closest_match_count['Home_vs_Closest'] + df_closest_match_count['Away_vs_Closest']

# # Merge this back into the original travel data
# team_travel_data = team_travel_data.merge(df_closest_match_count[['Team_home', 'Total_Closest_Matches']],
#                                           left_on='Team', right_on='Team_home', how='left').drop(columns=['Team_home'])

team_travel_data_final.head()

## OUTPUT TO TEMP DIRECTORY
output_path = os.path.join('..', 'TEMP', 'team_travel_data_test_new_v3.csv')
team_travel_data_final.to_csv(output_path, index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_closest_match_total['Home_vs_Closest'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_closest_match_total['Away_vs_Closest'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

In [18]:
# team_travel_data_expanded_cleaned

## Find Each Teams Longest Trip of the Year
- and add to dataframe

In [19]:
# For regular season games, capture Away team, Home team, and distance
away_team_info = schedule_w_distance[schedule_w_distance['Travel_Flag'] == 1][['Away_Team', 'Home_Team', 'Distance_Miles']]
away_team_info['Game_Type'] = 'Regular'

# For neutral site games, capture both teams and distance, along with the Game_Type
neutral_team_info = neutral_site_games_agg_1[['Game_ID', 'Team', 'N_Distance']]

# Create a dataframe with both Team 1 and Team 2 as opponents for neutral site games
neutral_team_info['Opponent'] = neutral_team_info['Game_ID'].str.split('_').str[2]  # Assuming the other team is in the Game_ID

# Duplicate rows to handle both teams in neutral site games
neutral_team_info_team1 = neutral_team_info[['Game_ID', 'Team', 'Opponent', 'N_Distance']].rename(columns={'N_Distance': 'Distance'})
neutral_team_info_team2 = neutral_team_info[['Game_ID', 'Opponent', 'Team', 'N_Distance']].rename(columns={'N_Distance': 'Distance', 'Opponent': 'Team', 'Team': 'Opponent'})

# Combine both team datasets for neutral site games
neutral_combined = pd.concat([neutral_team_info_team1, neutral_team_info_team2])
neutral_combined['Game_Type'] = 'Neutral'

# Regular season away games: Home_Team is the opponent
away_team_info = away_team_info.rename(columns={'Away_Team': 'Team', 'Home_Team': 'Opponent', 'Distance_Miles': 'Distance'})

# Combine both datasets (regular season and neutral site games)
combined_info = pd.concat([away_team_info[['Team', 'Opponent', 'Distance', 'Game_Type']], 
                           neutral_combined[['Team', 'Opponent', 'Distance', 'Game_Type']]])

# Clean the combined dataset by dropping rows with missing values in 'Team' or 'Distance'
combined_info_cleaned = combined_info.dropna(subset=['Team', 'Distance'])

# **Hotfix: Filter out games where the distance is 0**
combined_info_cleaned = combined_info_cleaned[combined_info_cleaned['Distance'] > 0]

## If Both Team and Opponent match drop the row
combined_info_cleaned = combined_info_cleaned[combined_info_cleaned['Team'] != combined_info_cleaned['Opponent']]
# Drop any rows with a /
combined_info_cleaned = combined_info_cleaned[~combined_info_cleaned['Opponent'].str.contains('/')]
combined_info_cleaned = combined_info_cleaned[~combined_info_cleaned['Team'].str.contains('/')]

# Now find the longest trip for each team, including the opponent and game type
longest_trip_info_cleaned = combined_info_cleaned.loc[combined_info_cleaned.groupby('Team')['Distance'].idxmax()].reset_index(drop=True)

# Merge the cleaned longest trip information back into the team_travel_data
team_travel_data_expanded_cleaned = pd.merge(team_travel_data_final, longest_trip_info_cleaned, on='Team', how='left')

# Drop the duplicate rows that were created
team_travel_data_expanded_cleaned = team_travel_data_expanded_cleaned.drop_duplicates(subset=['Team'])
# Reindex the DataFrame
team_travel_data_expanded_cleaned.reset_index(drop=True, inplace=True)

# team_travel_data_expanded_cleaned.head(20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neutral_team_info['Opponent'] = neutral_team_info['Game_ID'].str.split('_').str[2]  # Assuming the other team is in the Game_ID


In [20]:
# Sort by Distance

team_travel_data_expanded_cleaned.head(5)


Unnamed: 0,Team,Reg_Distance,Reg_Trips,Reg_AVG,N_Distance,Neutral_Site_Trips,N_AVG,Total_Distance,Overall_AVG,Closest_Team,Closest_Distance,Total_Closest_Matches,Opponent,Distance,Game_Type
0,Harvard,2495.793588,17,146.811388,2994.249148,1.0,2994.249148,5490.042736,305.002374,Boston University,1.075804,1.0,Notre Dame,3606.351007,Neutral
1,Boston University,2092.731396,13,160.979338,2994.6045,1.0,2994.6045,5087.335896,363.381135,Harvard,1.075804,1.0,Merrimack,2994.6045,Neutral
2,Northeastern,2610.852411,16,163.178276,111.693419,1.0,111.693419,2722.54583,160.149755,Boston University,1.756704,2.0,Denver,1768.308428,Regular
3,Brown,1675.803811,13,128.907985,0.0,0.0,,1675.803811,128.907985,Providence,2.043651,1.0,Clarkson,267.072502,Regular
4,Providence,2121.166102,15,141.411073,1666.883994,3.0,555.627998,3788.050096,210.447228,Brown,2.043651,1.0,North Dakota,1319.532097,Regular


### Added data clean and transform steps

In [21]:
## Some added transformation steps
# Fill Nan Values with 0
team_travel_data_expanded_cleaned = team_travel_data_expanded_cleaned.fillna(0)

# Set Neutral Site Trips to integer
team_travel_data_expanded_cleaned['Neutral_Site_Trips'] = team_travel_data_expanded_cleaned['Neutral_Site_Trips'].astype(int)
team_travel_data_expanded_cleaned['Total_Closest_Matches'] = team_travel_data_expanded_cleaned['Total_Closest_Matches'].astype(int)

# Round the floats to 2 decimal places
team_travel_data_expanded_cleaned = team_travel_data_expanded_cleaned.round({'Reg_Distance': 2, 'Reg_Trips': 2, 'Reg_AVG': 2,
                                                                          'N_Distance': 2, 'Neutral_Site_Trips': 2, 'N_AVG': 2,
                                                                          'Total_Distance': 2, 'Overall_AVG': 2, 'Closest_Distance': 2,
                                                                          'Total_Closest_Matches': 2, 'Distance': 2})

## Rename some Columns
# Opponent to Longest_Trip_Opponent
# Distance_Longest_Trip
# Game_Type to Game_Type_Longest_Trip
team_travel_data_expanded_cleaned = team_travel_data_expanded_cleaned.rename(columns={'Opponent': 'Longest_Trip_Opponent',
                                                                                    'Distance': 'Distance_Longest_Trip',
                                                                                    'Game_Type': 'Game_Type_Longest_Trip'})

# Display The Resulting DataFrame
team_travel_data_expanded_cleaned.head(10)

Unnamed: 0,Team,Reg_Distance,Reg_Trips,Reg_AVG,N_Distance,Neutral_Site_Trips,N_AVG,Total_Distance,Overall_AVG,Closest_Team,Closest_Distance,Total_Closest_Matches,Longest_Trip_Opponent,Distance_Longest_Trip,Game_Type_Longest_Trip
0,Harvard,2495.79,17,146.81,2994.25,1,2994.25,5490.04,305.0,Boston University,1.08,1,Notre Dame,3606.35,Neutral
1,Boston University,2092.73,13,160.98,2994.6,1,2994.6,5087.34,363.38,Harvard,1.08,1,Merrimack,2994.6,Neutral
2,Northeastern,2610.85,16,163.18,111.69,1,111.69,2722.55,160.15,Boston University,1.76,2,Denver,1768.31,Regular
3,Brown,1675.8,13,128.91,0.0,0,0.0,1675.8,128.91,Providence,2.04,1,Clarkson,267.07,Regular
4,Providence,2121.17,15,141.41,1666.88,3,555.63,3788.05,210.45,Brown,2.04,1,North Dakota,1319.53,Regular
5,Boston College,1212.77,13,93.29,0.0,0,0.0,1212.77,93.29,Boston University,2.86,2,Michigan State,679.87,Regular
6,Bentley,3584.03,12,298.67,0.0,0,0.0,3584.03,298.67,Boston College,4.32,0,Air Force,1771.36,Regular
7,Yale,1185.82,11,107.8,33.72,1,33.72,1219.53,101.63,Quinnipiac,6.74,2,Clarkson,253.88,Regular
8,Quinnipiac,1945.51,16,121.59,100.03,2,50.02,2045.54,113.64,Yale,6.74,2,Maine,322.28,Regular
9,Minnesota,3993.51,11,363.05,1302.78,1,1302.78,5296.29,441.36,St Thomas,9.02,0,Penn State,830.01,Regular


# Output the Final Table to CSV

In [22]:
# # TEMP FOLDER Output
# output_path = os.path.join('..', 'TEMP', 'FINAL_OUT_team_travel_data_v2.csv')
# team_travel_data_expanded_cleaned.to_csv(output_path, index=False)
# print(f"Output saved to: {output_path}")

# OUTPUT INTO DATA OUTPUT FOLDER
output_path = os.path.join('..', 'data', 'output', 'Team_Travel_Information_v1.csv')
team_travel_data_expanded_cleaned.to_csv(output_path, index=False)
print(f"Output saved to: {output_path}")

Output saved to: ..\data\output\Team_Travel_Information_v1.csv
