## TEMP REWORK NOTEBOOK FOR DISTANCE CALC TABLE

In [68]:
# Dependencies
import os
import sys
import pandas as pd
import numpy as np
import geopy
from geopy.distance import geodesic

In [69]:
## Paths to source data

# schedule_path = os.path.join('..', 'data', 'schedule', '2024_current.csv')
schedule_path = os.path.join('..', 'data', 'schedule', 'CHN_Schedule_First Pass_v1.csv')
schedule_data = pd.read_csv(schedule_path)
raw_schedule_df = schedule_data.copy()

# ARENA INFO FILE
arena_path = os.path.join('..', 'data', 'arena_school_info.csv')
arena_data = pd.read_csv(arena_path)
arena_info_df = arena_data.copy()

# NEUTRAL SITE ARENA INFORMATION FILE
neutral_path = os.path.join('..', 'data', 'neutral_arenas_2024.csv')
neutral_arenas_df = pd.read_csv(neutral_path)

# Display data
# schedule_data.head()
# schedule_data.tail()
# arena_data.head()
# neutral_arenas_df.tail()

In [70]:
# Complete refactored code block with corrections
## Ensure the Data column in the schedule data is in datetime format
raw_schedule_df['Date'] = pd.to_datetime(raw_schedule_df['Date'])

from geopy.distance import geodesic

# ## Remove ' from all Team columns
raw_schedule_df['Home_Team'] = raw_schedule_df['Home_Team'].str.replace("'", "")
raw_schedule_df['Away_Team'] = raw_schedule_df['Away_Team'].str.replace("'", "")

# Helper function to calculate distance between two sets of coordinates
def calculate_distance(coord1, coord2):
    if pd.notna(coord1[0]) and pd.notna(coord2[0]):
        return geodesic(coord1, coord2).miles
    return 0

# Step 1: Filter out exhibition games, games with "TBA", and games with "/" in team names
raw_schedule_filtered = raw_schedule_df[
    (raw_schedule_df['Conference'] != 'Exhibition') &
    (~raw_schedule_df['Away_Team'].str.contains('TBA')) &
    (~raw_schedule_df['Home_Team'].str.contains('TBA')) &
    (~raw_schedule_df['Away_Team'].str.contains('/')) &
    (~raw_schedule_df['Home_Team'].str.contains('/'))
]

# Step 2: Flag neutral site games using the 'Flag' column from the neutral arenas table
def is_neutral_game(row, flags):
    # Check for flags in both Conference and Game_Notes columns
    conference_match = any(flag in str(row['Conference']) for flag in flags)
    notes_match = any(flag in str(row['Game_Notes']) for flag in flags)
    return conference_match or notes_match

neutral_flags = neutral_arenas_df['Flag'].tolist()
raw_schedule_filtered['Is_Neutral_Game'] = raw_schedule_filtered.apply(is_neutral_game, axis=1, flags=neutral_flags)

# Step 3: Merge arena coordinates for home and away teams
# Merge home team coordinates
schedule_with_coords = raw_schedule_filtered.merge(arena_info_df[['Team', 'Latitude', 'Longitude']], 
                                                   left_on='Home_Team', right_on='Team', how='left')
schedule_with_coords = schedule_with_coords.rename(columns={'Latitude': 'Home_Lat', 'Longitude': 'Home_Lon'})

# Merge away team coordinates
schedule_with_coords = schedule_with_coords.merge(arena_info_df[['Team', 'Latitude', 'Longitude']], 
                                                  left_on='Away_Team', right_on='Team', how='left')
schedule_with_coords = schedule_with_coords.rename(columns={'Latitude': 'Away_Lat', 'Longitude': 'Away_Lon'})

# Step 4: Categorize games into on-campus conference, on-campus non-conference, and neutral site
def categorize_game(row):
    if row['Is_Neutral_Game']:
        return 'Neutral'
    elif row['Conference'] == 'Non-Conference':
        return 'On-Campus Non-Conference'
    else:
        return 'On-Campus Conference'

schedule_with_coords['Game_Type'] = schedule_with_coords.apply(categorize_game, axis=1)

# Step 5: Calculate the distance for all games
schedule_with_coords['Distance'] = schedule_with_coords.apply(
    lambda row: calculate_distance((row['Away_Lat'], row['Away_Lon']), (row['Home_Lat'], row['Home_Lon'])), axis=1)

# Adjust the neutral site distance calculation using the neutral arena coordinates for both teams
def calculate_neutral_distance(row, neutral_df):
    if row['Is_Neutral_Game']:
        # Find the neutral site coordinates from the neutral arenas table
        neutral_site = neutral_df[neutral_df['Flag'].apply(lambda x: x in str(row['Conference']) or x in str(row['Game_Notes']))]
        
        if not neutral_site.empty:
            neutral_lat = neutral_site.iloc[0]['latitude']
            neutral_lon = neutral_site.iloc[0]['longitude']
            # Calculate distance from away team to neutral site
            away_to_neutral = calculate_distance((row['Away_Lat'], row['Away_Lon']), (neutral_lat, neutral_lon))
            # Calculate distance from home team to neutral site
            home_to_neutral = calculate_distance((row['Home_Lat'], row['Home_Lon']), (neutral_lat, neutral_lon))
            # Return both distances separately to track for both teams
            return away_to_neutral, home_to_neutral
    # Non-neutral games: apply the same distance for both teams (travel for the away team)
    return row['Distance'], row['Distance']

# Apply the new logic for neutral site games using the correct neutral coordinates
schedule_with_coords[['Away_Distance', 'Home_Distance']] = schedule_with_coords.apply(calculate_neutral_distance, axis=1, result_type='expand', neutral_df=neutral_arenas_df)

### NEW CODE ###
### NEW CODE ###
# Convert 'Date' column to datetime format for easier manipulation
schedule_with_coords['Date'] = pd.to_datetime(schedule_with_coords['Date'])

# Reset the travel flag (initialize to 1 for all games)
schedule_with_coords['Travel_Flag'] = 1

# Sort the data by 'Away_Team', 'Date', and 'Home_Team' to ensure games are grouped correctly
schedule_with_coords = schedule_with_coords.sort_values(by=['Away_Team', 'Date', 'Home_Team'])

# Group by 'Away_Team' to identify consecutive away games
for away_team, group in schedule_with_coords.groupby('Away_Team'):
    group = group.sort_values(by='Date')  # Sort by date within each group

    # Iterate through the group to check for consecutive games
    for i in range(1, len(group)):
        current_game = group.iloc[i]
        previous_game = group.iloc[i - 1]

        # Check if the games are within 3 days and are at the same venue OR if either is a neutral game
        if (current_game['Date'] - previous_game['Date']).days <= 3 and \
           ((current_game['Home_Lat'] == previous_game['Home_Lat'] and \
             current_game['Home_Lon'] == previous_game['Home_Lon']) or \
            current_game['Is_Neutral_Game'] or previous_game['Is_Neutral_Game']):
            # Set the travel flag to 0 for the current game (indicating no new trip)
            schedule_with_coords.loc[current_game.name, 'Travel_Flag'] = 0

# If the 'Distance' is 0, make sure 'Travel_Flag' is also 0
# schedule_with_coords.loc[schedule_with_coords['Distance'] == 0, 'Travel_Flag'] = 0

# # Only consider rows where Travel_Flag is 1 for calculating total travel distance
schedule_with_coords['Adjusted_Away_Distance'] = schedule_with_coords['Away_Distance'] * schedule_with_coords['Travel_Flag']
schedule_with_coords['Adjusted_Home_Distance'] = schedule_with_coords['Home_Distance'] * schedule_with_coords['Travel_Flag']


#####################################
# Step 8: Group by team and game type for both away and home travel distances

# Define a function to get the shortest non-zero distance
def shortest_non_zero(series):
    non_zero_values = series[series > 0]
    if len(non_zero_values) > 0:
        return non_zero_values.min()  # Return the minimum non-zero value
    return 0  # If all distances are zero, return zero

# Group by team and game type for both away and home travel distances
away_team_travel_stats = schedule_with_coords.groupby(['Away_Team', 'Game_Type']).agg(
    total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag to count only trips where it's 1
    total_distance=('Adjusted_Away_Distance', 'sum'),
    average_distance=('Adjusted_Away_Distance', 'mean'),
    longest_trip=('Adjusted_Away_Distance', 'max'),
    shortest_trip=('Adjusted_Away_Distance', shortest_non_zero)  # Use custom function to get non-zero shortest trip
).reset_index()

# Home teams need separate stats for neutral games
home_team_travel_stats = schedule_with_coords[schedule_with_coords['Is_Neutral_Game']].groupby(['Home_Team', 'Game_Type']).agg(
    total_trips=('Travel_Flag', 'sum'),  # Sum the Travel_Flag for neutral games as well
    total_distance=('Adjusted_Home_Distance', 'sum'),
    average_distance=('Adjusted_Home_Distance', 'mean'),
    longest_trip=('Adjusted_Home_Distance', 'max'),
    shortest_trip=('Adjusted_Home_Distance', shortest_non_zero)  # Use custom function to get non-zero shortest trip
).reset_index()


# Group by team and game type for both away and home travel distances
# away_team_travel_stats = schedule_with_coords.groupby(['Away_Team', 'Game_Type']).agg(
#     total_trips=('Game_ID', 'count'),
#     total_distance=('Adjusted_Away_Distance', 'sum'),
#     average_distance=('Adjusted_Away_Distance', 'mean'),
#     longest_trip=('Adjusted_Away_Distance', 'max'),
#     shortest_trip=('Adjusted_Away_Distance', 'min')
# ).reset_index()

# # Home teams need separate stats for neutral games
# home_team_travel_stats = schedule_with_coords[schedule_with_coords['Is_Neutral_Game']].groupby(['Home_Team', 'Game_Type']).agg(
#     total_trips=('Game_ID', 'count'),
#     total_distance=('Adjusted_Home_Distance', 'sum'),
#     average_distance=('Adjusted_Home_Distance', 'mean'),
#     longest_trip=('Adjusted_Home_Distance', 'max'),
#     shortest_trip=('Adjusted_Home_Distance', 'min')
# ).reset_index()

# Step 9: Combine both the away and home team travel stats into a unified DataFrame
travel_stats_combined = pd.concat([away_team_travel_stats.rename(columns={'Away_Team': 'Team'}),
                                   home_team_travel_stats.rename(columns={'Home_Team': 'Team'})], axis=0)

# Step 10: Aggregate the final travel stats for each team
team_travel_stats_final = travel_stats_combined.groupby(['Team', 'Game_Type']).agg(
    total_trips=('total_trips', 'sum'),
    total_distance=('total_distance', 'sum'),
    average_distance=('average_distance', 'mean'),
    longest_trip=('longest_trip', 'max'),
    # get the shortest non-zero trip distance
    shortest_trip=('shortest_trip', lambda x: x[x > 0].min())
).reset_index()

# Step 11: Pivot the final travel stats to match the required format
team_travel_summary_final = team_travel_stats_final.pivot_table(
    index='Team',
    columns='Game_Type',
    values=['total_trips', 'total_distance', 'average_distance', 'longest_trip', 'shortest_trip'],
    fill_value=0
)

# Step 12: Flatten the columns for readability
team_travel_summary_final.columns = ['_'.join(col).strip() for col in team_travel_summary_final.columns]

# Step 13: Rename and reorder columns to the desired format
team_travel_summary_reset = team_travel_summary_final.reset_index()

new_column_names_final = {
    'total_trips_Neutral': 'N_trips',
    'total_distance_Neutral': 'N_total_distance',
    'average_distance_Neutral': 'N_AVG',
    'longest_trip_Neutral': 'N_longest',
    'shortest_trip_Neutral': 'N_shortest',
    'total_trips_On-Campus Non-Conference': 'non_con_trips',
    'total_distance_On-Campus Non-Conference': 'non_con_total_distance',
    'average_distance_On-Campus Non-Conference': 'non_con_AVG',
    'longest_trip_On-Campus Non-Conference': 'non_con_longest',
    'shortest_trip_On-Campus Non-Conference': 'non_con_shortest',
    'total_trips_On-Campus Conference': 'con_trips',
    'total_distance_On-Campus Conference': 'con_total_distance',
    'average_distance_On-Campus Conference': 'con_AVG',
    'longest_trip_On-Campus Conference': 'con_longest',
    'shortest_trip_On-Campus Conference': 'con_shortest'
}

team_travel_summary_reset.rename(columns=new_column_names_final, inplace=True)


new_column_names_final = {
    'total_trips_Neutral': 'N_trips',
    'total_distance_Neutral': 'N_total_distance',
    'average_distance_Neutral': 'N_AVG',
    'longest_trip_Neutral': 'N_longest',
    'shortest_trip_Neutral': 'N_shortest',
    'total_trips_On-Campus Non-Conference': 'non_con_trips',
    'total_distance_On-Campus Non-Conference': 'non_con_total_distance',
    'average_distance_On-Campus Non-Conference': 'non_con_AVG',
    'longest_trip_On-Campus Non-Conference': 'non_con_longest',
    'shortest_trip_On-Campus Non-Conference': 'non_con_shortest',
    'total_trips_On-Campus Conference': 'con_trips',
    'total_distance_On-Campus Conference': 'con_total_distance',
    'average_distance_On-Campus Conference': 'con_AVG',
    'longest_trip_On-Campus Conference': 'con_longest',
    'shortest_trip_On-Campus Conference': 'con_shortest'
}

team_travel_summary_reset.rename(columns=new_column_names_final, inplace=True)

# Final columns order
columns_order_final = [
    'Team', 'N_trips', 'N_total_distance', 'N_AVG', 'N_longest', 'N_shortest',
    'non_con_trips', 'non_con_total_distance', 'non_con_AVG', 'non_con_longest', 'non_con_shortest',
    'con_trips', 'con_total_distance', 'con_AVG', 'con_longest', 'con_shortest'
]

team_travel_summary_final_display = team_travel_summary_reset[columns_order_final]

# Display the final table


team_travel_summary_final_display.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_schedule_filtered['Is_Neutral_Game'] = raw_schedule_filtered.apply(is_neutral_game, axis=1, flags=neutral_flags)


Unnamed: 0,Team,N_trips,N_total_distance,N_AVG,N_longest,N_shortest,non_con_trips,non_con_total_distance,non_con_AVG,non_con_longest,non_con_shortest,con_trips,con_total_distance,con_AVG,con_longest,con_shortest
0,Air Force,1.0,600.521127,600.521127,600.521127,600.521127,2.0,1777.002969,592.334323,1764.998085,12.004884,6.0,9482.678486,790.223207,1771.362615,1316.604241
1,Alaska,1.0,2721.985269,2721.985269,2721.985269,2721.985269,12.0,28671.482794,1246.586208,3275.395086,260.422806,0.0,0.0,0.0,0.0,0.0
2,Alaska Anchorage,0.0,0.0,0.0,0.0,0.0,12.0,31712.545584,1585.627279,3401.368614,260.422806,0.0,0.0,0.0,0.0,0.0
3,American Intl,0.0,0.0,0.0,0.0,0.0,5.0,1045.971171,174.328529,565.237531,51.976219,7.0,1335.585666,102.737359,410.16097,39.21897
4,Arizona State,1.0,1.751265,1.751265,1.751265,1.751265,3.0,4393.221877,732.203646,2271.357695,548.654698,7.0,5099.497015,424.958085,1563.33154,543.441609


In [71]:
## OUTPUT THE TABLE TO CSV
## IN TEMP FOLDER
output_path = os.path.join('..', 'TEMP', 'team_travel_summary_prelim_v1.csv')

In [72]:
# # Show only Michginan teams to quality check
# team_travel_summary_final_display[team_travel_summary_final_display['Team'].str.contains('Michigan')]

## Find The Closest Other Team to Each team as well as how many times they play each other this season

In [73]:
from geopy.distance import geodesic
import numpy as np

# Helper function to calculate the distance between two points (lat, lon)
def calculate_distance(lat1, lon1, lat2, lon2):
    if pd.notnull(lat1) and pd.notnull(lon1) and pd.notnull(lat2) and pd.notnull(lon2):
        return geodesic((lat1, lon1), (lat2, lon2)).miles
    else:
        return None  # Return None if any coordinates are missing

# Function to find the closest team to a specific team in the arena data
def find_closest_team(current_team_row, team_data):
    # Initialize variables to store the closest team and distance
    closest_team = None
    closest_distance = np.inf
    
    # Iterate over each row in the team data
    for _, row in team_data.iterrows():
        # Calculate the distance between the current team and the other team
        distance = calculate_distance(current_team_row['Latitude'], current_team_row['Longitude'], row['Latitude'], row['Longitude'])
        
        # Update the closest team if the distance is smaller
        if distance is not None and distance < closest_distance:
            closest_team = row['Team']
            closest_distance = distance
    
    return closest_team, closest_distance

# Find the closest team to each team in the arena data
closest_teams = []
closest_distances = []

# Iterate over each row in the arena_info_df to find the closest team
for _, row in arena_info_df.iterrows():
    # Exclude the current team from the comparison
    other_teams = arena_info_df[arena_info_df['Team'] != row['Team']]
    closest_team, closest_distance = find_closest_team(row, other_teams)
    
    closest_teams.append(closest_team)
    closest_distances.append(closest_distance)

# Add the closest team and distance to the arena data
arena_info_df['Closest_Team'] = closest_teams
arena_info_df['Closest_Distance'] = closest_distances

# Merge the closest team data with the team travel data
team_travel_summary_reset = team_travel_summary_reset.merge(
    arena_info_df[['Team', 'Closest_Team', 'Closest_Distance']], on='Team', how='left')

# Sort by closest distance to another team (optional, based on your preference)
# team_travel_summary_reset = team_travel_summary_reset.sort_values(by='Closest_Distance')

# Display the updated team travel summary with closest team information
# team_travel_summary_reset.head()


In [74]:
# Rename Schedule dataframe (already using schedule_with_coords)
df_schedule = schedule_with_coords.copy()

# Remove Exhibition games from schedule
df_schedule = df_schedule[df_schedule['Conference'] != 'Exhibition']

# Match the teams in the schedule with their closest team from the travel distance data
# Merging schedule data with closest team info for both home and away teams
df_schedule_merged = df_schedule.merge(
    team_travel_summary_reset[['Team', 'Closest_Team']],
    left_on='Home_Team',
    right_on='Team',
    how='left',
    suffixes=('', '_Closest_Home')
)

df_schedule_merged.rename(columns={'Closest_Team': 'Closest_Team_Home'}, inplace=True)

df_schedule_merged = df_schedule_merged.merge(
    team_travel_summary_reset[['Team', 'Closest_Team']],
    left_on='Away_Team',
    right_on='Team',
    how='left',
    suffixes=('', '_Closest_Away')
)

df_schedule_merged.rename(columns={'Closest_Team': 'Closest_Team_Away'}, inplace=True)

# Now, let's ensure both teams (home and away) are being compared properly
df_schedule_merged['Home_vs_Closest'] = df_schedule_merged['Away_Team'] == df_schedule_merged['Closest_Team_Home']
df_schedule_merged['Away_vs_Closest'] = df_schedule_merged['Home_Team'] == df_schedule_merged['Closest_Team_Away']

# Count how many times each team plays its closest opponent as either home or away
df_closest_match_count_home = df_schedule_merged.groupby('Home_Team').agg({
    'Home_vs_Closest': 'sum'
}).reset_index()

df_closest_match_count_away = df_schedule_merged.groupby('Away_Team').agg({
    'Away_vs_Closest': 'sum'
}).reset_index()

# Merge both home and away counts to ensure consistency for both teams
df_closest_match_total = pd.merge(
    df_closest_match_count_home, 
    df_closest_match_count_away, 
    left_on='Home_Team', 
    right_on='Away_Team', 
    how='outer'
)

# Replace missing values with 0 before summing up
df_closest_match_total['Home_vs_Closest'].fillna(0, inplace=True)
df_closest_match_total['Away_vs_Closest'].fillna(0, inplace=True)

# Calculate the total closest matches by summing up both columns
df_closest_match_total['Total_Closest_Matches'] = df_closest_match_total['Home_vs_Closest'] + df_closest_match_total['Away_vs_Closest']

# Rename columns for clarity and drop unneeded ones
df_closest_match_total = df_closest_match_total[['Home_Team', 'Total_Closest_Matches']].rename(columns={'Home_Team': 'Team'})

# Merge this back into the travel data
team_travel_summary_reset = team_travel_summary_reset.merge(
    df_closest_match_total[['Team', 'Total_Closest_Matches']],
    on='Team',
    how='left'
)

# Fill missing values with 0
team_travel_summary_reset['Total_Closest_Matches'].fillna(0, inplace=True)

# Display the updated travel data with the total closest matches
team_travel_summary_reset.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_closest_match_total['Home_vs_Closest'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_closest_match_total['Away_vs_Closest'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

Unnamed: 0,Team,N_AVG,con_AVG,non_con_AVG,N_longest,con_longest,non_con_longest,N_shortest,con_shortest,non_con_shortest,N_total_distance,con_total_distance,non_con_total_distance,N_trips,con_trips,non_con_trips,Closest_Team,Closest_Distance,Total_Closest_Matches
0,Air Force,600.521127,790.223207,592.334323,600.521127,1771.362615,1764.998085,600.521127,1316.604241,12.004884,600.521127,9482.678486,1777.002969,1.0,6.0,2.0,Colorado College,12.004884,2.0
1,Alaska,2721.985269,0.0,1246.586208,2721.985269,0.0,3275.395086,2721.985269,0.0,260.422806,2721.985269,0.0,28671.482794,1.0,0.0,12.0,Alaska Anchorage,260.422806,6.0
2,Alaska Anchorage,0.0,0.0,1585.627279,0.0,0.0,3401.368614,0.0,0.0,260.422806,0.0,0.0,31712.545584,0.0,0.0,12.0,Alaska,260.422806,6.0
3,American Intl,0.0,102.737359,174.328529,0.0,410.16097,565.237531,0.0,39.21897,51.976219,0.0,1335.585666,1045.971171,0.0,7.0,5.0,Massachusetts,18.774338,1.0
4,Arizona State,1.751265,424.958085,732.203646,1.751265,1563.33154,2271.357695,1.751265,543.441609,548.654698,1.751265,5099.497015,4393.221877,1.0,7.0,3.0,Colorado College,543.441609,4.0
