In [3]:
# Notebook to explore and visualize the results tables
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import tqdm

# Read the two results tables: seasons 1901-2001 (part1) and seasons 2002-2022 (part2).
part1_df, part2_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '..//data//year_over_year/results_table_1901_2001_new.csv',
        '..//data/year_over_year/results_table_2002_2022_new.csv',
    )
)

# Row count of each table, one per line.
print(len(part1_df), len(part2_df), sep='\n')

# Rebuild Game_ID in part1 as '<Date>_<Home_Team>_<Away_Team>' so it follows the part2 format,
# e.g. '2002-10-01_Windsor_Bowling Green'.
part1_df['Game_ID'] = (
    part1_df['Date'].astype(str)
    .add('_')
    .add(part1_df['Home_Team'])
    .add('_')
    .add(part1_df['Away_Team'])
)

part1_df.head(1)

29705
24022


Unnamed: 0,Date,Conference,Home_Team,Home_Team_Link,Home_Score,Away_Team,Away_Team_Link,Away_Score,OT,Game_Notes,Game_ID,Day
0,1902-01-15,Intercollegiate,Princeton,/reports/team/Princeton/45,0.0,Yale,/reports/team/Yale/59,7.0,,"at St. Nicholas Rink, New York",1902-01-15_Princeton_Yale,Wednesday


In [4]:


# Link columns present in part2 that the analysis does not need.
drop_cols = ['Metrics_Link', 'Box_Link']

# Rebind instead of dropping in place, and tolerate columns that are already gone,
# so re-running this cell does not raise a KeyError.
part2_df = part2_df.drop(columns=drop_cols, errors='ignore')

part2_df.head(1)

Unnamed: 0,Date,Conference,Game_Notes,Home_Team,Home_Team_Link,Home_Score,Away_Team,Away_Team_Link,Away_Score,OT,Day,Game_ID
0,2002-10-01,Exhibition,,Windsor,,2.0,Bowling Green,/reports/team/Bowling-Green/11,5.0,,Tuesday,2002-10-01_Windsor_Bowling Green


In [5]:
## Combine the two dataframes into a single dataframe.
# ignore_index=True gives every game its own row label; without it both source tables
# contribute their original labels, so the combined frame carries duplicate labels.
df = pd.concat([part1_df, part2_df], ignore_index=True)

## About 75 games have null values in the score columns - most (60) are from the end of
## the cancelled 2019-2020 season. Drop these rows.
df = df.dropna(subset=['Home_Score', 'Away_Score'])

print(len(df))

# Untouched copy of the combined table; the yearly win-percentage cell further down reads it.
df_orig = df.copy()

# Save the resulting dataframe to a csv file - This is the cleaned all_time results table
# df.to_csv('..//data//results_table_all_time.csv', index=False)

53652


In [6]:
# Number of games recorded under each Conference label, most frequent first
conference_counts = df['Conference'].value_counts()

# The 30 Conference labels with the most games
top_conf = conference_counts.head(30).index.tolist()

# # Print a report on the conferences with the most games played
# for conf in top_conf:
#     print(conf)
#     print(df[df['Conference'] == conf]['Home_Score'].describe())
#     print('\n')

# Print list of top conferences
print(top_conf)

# Labels treated as real conferences; every other label (tournaments, exhibitions,
# "Non-Conf v. ..." variants, etc.) is excluded from masked_df below.
real_conferences = ['WCHA', 'ECAC', 'CCHA', 'Hockey East', 'Atlantic Hockey',
                    'NCHC', 'Big Ten', 'MAAC', 'CHA', 'Ivy League',
                    'Intercollegiate', 'Northeast League', 'Non-Conference']

# Keep only games whose Conference is in the list. The trailing .copy() makes masked_df an
# independent frame, so adding columns to it later does not trigger SettingWithCopyWarning.
masked_df = df[df['Conference'].isin(real_conferences)].copy()





['Non-Conference', 'WCHA', 'ECAC', 'CCHA', 'Hockey East', 'Atlantic Hockey', 'Exhibition', 'Non-Conference v. D3', 'NCHC', 'Big Ten', 'ECAC Tournament', 'MAAC', 'CHA', 'CCHA Tournament', 'HEA Tournament', 'Ivy League', 'Non-Conf v. non-vars', 'AHA Tournament', 'Non-Conf v. Canadian', 'Beanpot (at Boston Garden, Boston)', 'Non-Conf v. NAIA', 'NCAA Tournament', 'CCHA Tournament (at Joe Louis Arena, Detroit)', 'Great Lakes Invitational (at Joe Louis Arena, Detroit)', 'ECAC Tournament (at Boston Garden, Boston)', 'NEIHL', 'WCHA Tournament (at Ralph Engelstad Arena, Grand Forks, N.D.)', 'Intercollegiate', 'Northeast League', 'Mariucci Classic (at Mariucci Arena, Minneapolis)']


## Create and save a table with the conference assignments for each team each year



In [7]:

# Pre-Step - Create a Season_Year column (season lasts from Sept to June - Season_Year is the year the season starts)
def get_season_year(date):
    """Return the starting year of the season that a 'YYYY-MM-...' date string falls in.

    The string is split on '-'; the first field is read as the year and the
    second as the month. Months before September (1-8) are assigned to the
    season that started in the previous calendar year.
    """
    fields = date.split('-')
    start_year, month = int(fields[0]), int(fields[1])
    return start_year - 1 if month < 9 else start_year

# Tag every game with the starting year of its season.
masked_df['Season_Year'] = masked_df['Date'].map(get_season_year)

# Narrow the frame to the columns the conference tally needs.
filtered_df = masked_df.loc[:, ['Season_Year', 'Conference', 'Home_Team', 'Away_Team']]

# One (Season_Year, Team, Conference) row per game for the home side and one for the away side.
home_teams, away_teams = (
    filtered_df[['Season_Year', side_col, 'Conference']].rename(columns={side_col: 'Team'})
    for side_col in ('Home_Team', 'Away_Team')
)
all_teams = pd.concat([home_teams, away_teams])

# Count how many games each team played under each Conference label in each season.
conference_game_counts = all_teams.groupby(['Season_Year', 'Team', 'Conference']).size()
grouped = conference_game_counts.reset_index(name='Count')

# Step 4: For each group, pick the conference that appears most frequently (ignoring 'Non-Conference' if other options are available)
def choose_most_frequent(group):
    """Pick the Conference with the highest Count in ``group``, preferring real conferences.

    ``group`` needs 'Conference' and 'Count' columns. Rows labelled
    'Non-Conference' are ignored; if those were the only rows, the string
    'None' is returned.
    """
    ranked = group.sort_values('Count', ascending=False)
    league_rows = ranked[ranked['Conference'] != 'Non-Conference']
    # Nothing left after removing 'Non-Conference' rows -> the team only has non-conference games.
    if league_rows.empty and not ranked.empty:
        return 'None'
    return league_rows['Conference'].iloc[0]

# Destination of the per-season conference assignments
output_path_final_automatic = '../data/team_conference_by_year.csv'

# Reduce the tallies to a single Conference per (Season_Year, Team) and write them out.
season_team_groups = grouped.groupby(['Season_Year', 'Team'])
most_frequent_conference = (
    season_team_groups
    .apply(choose_most_frequent)
    .reset_index(name='Conference')
)
most_frequent_conference.to_csv(output_path_final_automatic, index=False)

# Show where the file was written
output_path_final_automatic


'../data/team_conference_by_year.csv'

In [8]:

## Scores are stored as ints: discard games missing either score, then cast both columns.
int_cols = ['Home_Score', 'Away_Score']
df.dropna(subset=int_cols, inplace=True)
for score_col in int_cols:
    df[score_col] = df[score_col].astype(int)

## Parse the Date column into datetime values
df['Date'] = pd.to_datetime(df['Date'], errors='raise')
## Store the season_year in a separate column as an int
## Games played from September through August of the following year are assigned to the same season
## Example: 2019-2020 season is stored as 2019

# simple function to assign the season year to each game
def season_year(date):
    """Return the starting year of the season containing ``date``.

    ``date`` must expose ``.month`` and ``.year``. September-December map to
    the same calendar year; January-August map to the previous one.
    """
    years_back = 0 if date.month >= 9 else 1
    return date.year - years_back
    
# Tag each game with its season's starting year, stored as an int column.
df['Season_Year'] = df['Date'].map(season_year).astype(int)

# Save the cleaned all_time results table (written before exhibition games are removed below).
df.to_csv('..//data//results_table_all_time.csv', index=False)

# Exclude exhibition games from everything that follows.
df = df.loc[df['Conference'].ne('Exhibition')]

# Every team name that appears as a home or an away side, de-duplicated, then sorted.
team_list = list(set(df['Home_Team'].unique()).union(df['Away_Team'].unique()))
full_team_list = sorted(team_list)

print('Total number of teams in the data set: ', len(full_team_list))
print('Total number of games in the data set: ', len(df))



Total number of teams in the data set:  613
Total number of games in the data set:  51768


## Transform

### Take the raw results table to create a dataframe with the year by year breakdown



In [9]:
from tqdm import tqdm

# Helper function to calculate wins, loses, ties, winning percentage, available points, points, and point percentage goals scored per game, and goals allowed per game
def calculate_stats(df, team_column, score_column, opponent_score_column):
    """Summarise the games in ``df`` from one side's point of view.

    Args:
        df: games to summarise; must contain ``score_column`` and ``opponent_score_column``.
        team_column: accepted for interface compatibility; not read by this function.
        score_column: column with the goals scored by the side being summarised.
        opponent_score_column: column with the goals scored by the other side.

    Returns:
        Tuple ``(wins, loses, ties, win_pct, goals_scored_per_game, goals_allowed_per_game)``.
        The three rates are 0 when no game is counted.
    """
    goals_for = df[score_column]
    goals_against = df[opponent_score_column]

    # Count rows by outcome; int() keeps the counts as plain Python ints.
    wins = int((goals_for > goals_against).sum())
    loses = int((goals_for < goals_against).sum())
    ties = int((goals_for == goals_against).sum())
    total_games = wins + loses + ties

    # No countable games: report zero rates instead of dividing by zero.
    if not total_games > 0:
        return wins, loses, ties, 0, 0, 0

    return (
        wins,
        loses,
        ties,
        wins / total_games,
        goals_for.sum() / total_games,
        goals_against.sum() / total_games,
    )


# Integrate tqdm into the function for progress tracking
def calculate_team_stats_by_season_with_tqdm(df, unique_teams):
    """Summarise every team's record for every season found in ``df``.

    One output row is produced for each (season, team) combination, including
    teams with no games in a season (their counts and rates are all 0).

    Args:
        df: game results with the columns Season_Year, Home_Team, Away_Team,
            Home_Score, Away_Score, Conference and Game_Notes.
        unique_teams: sequence of team names to report on (iterated once per season).

    Returns:
        DataFrame with Season_Year, Team and, for each of the Home, Away,
        Overall, Non_Conference, Conference and Overtime scenarios, the columns
        <scenario>_Wins, _Loses, _Ties, _Win_Pct, _Goals_Scored_Per_Game and
        _Goals_Allowed_Per_Game, plus Overall_Points, Possible_Points and
        Point_Pct (2 points per win, 1 per tie).
    """
    # Order of the values returned by calculate_stats
    stat_suffixes = ('Wins', 'Loses', 'Ties', 'Win_Pct',
                     'Goals_Scored_Per_Game', 'Goals_Allowed_Per_Game')

    summary_list = []
    unique_seasons = sorted(df['Season_Year'].unique())

    # Wrap the outer loop with tqdm for progress tracking
    for season in tqdm(unique_seasons, desc="Processing seasons"):
        season_data = df[df['Season_Year'] == season]

        for team in unique_teams:
            team_data_home = season_data[season_data['Home_Team'] == team]
            team_data_away = season_data[season_data['Away_Team'] == team]

            stats = {'Season_Year': season, 'Team': team}

            # Home and away splits: the team's own score column differs per side.
            for label, games, score_column, opponent_score_column in [
                ('Home', team_data_home, 'Home_Score', 'Away_Score'),
                ('Away', team_data_away, 'Away_Score', 'Home_Score'),
            ]:
                values = calculate_stats(games, f'{label}_Team', score_column, opponent_score_column)
                stats.update({f'{label}_{suffix}': value for suffix, value in zip(stat_suffixes, values)})

            # Overall counts are the sum of the home and away counts.
            stats['Overall_Wins'] = stats['Home_Wins'] + stats['Away_Wins']
            stats['Overall_Loses'] = stats['Home_Loses'] + stats['Away_Loses']
            stats['Overall_Ties'] = stats['Home_Ties'] + stats['Away_Ties']
            total_games = stats['Overall_Wins'] + stats['Overall_Loses'] + stats['Overall_Ties']

            # Point system: 2 points per win, 1 per tie, out of 2 per game played.
            stats['Overall_Points'] = stats['Overall_Wins'] * 2 + stats['Overall_Ties']
            stats['Possible_Points'] = total_games * 2
            stats['Point_Pct'] = stats['Overall_Points'] / stats['Possible_Points'] if stats['Possible_Points'] > 0 else 0

            stats['Overall_Win_Pct'] = stats['Overall_Wins'] / total_games if total_games > 0 else 0
            goals_for = team_data_home['Home_Score'].sum() + team_data_away['Away_Score'].sum()
            goals_against = team_data_home['Away_Score'].sum() + team_data_away['Home_Score'].sum()
            stats['Overall_Goals_Scored_Per_Game'] = goals_for / total_games if total_games > 0 else 0
            stats['Overall_Goals_Allowed_Per_Game'] = goals_against / total_games if total_games > 0 else 0

            # Re-express every game from this team's point of view before mixing home and
            # away rows. Scoring the combined rows as Home_Score vs Away_Score would count
            # the team's away wins as losses and swap its goals for/against on those rows.
            team_games = pd.concat([
                team_data_home.assign(Team_Score=team_data_home['Home_Score'],
                                      Opponent_Score=team_data_home['Away_Score']),
                team_data_away.assign(Team_Score=team_data_away['Away_Score'],
                                      Opponent_Score=team_data_away['Home_Score']),
            ])
            is_non_conference = team_games['Conference'] == 'Non-Conference'
            # NOTE(review): 'ot|OT' matches any note containing those letters, not only
            # overtime markers; the separate OT column may be the intended signal - confirm.
            is_overtime = team_games['Game_Notes'].str.contains('ot|OT', na=False)

            for label, games in [
                ('Non_Conference', team_games[is_non_conference]),
                ('Conference', team_games[~is_non_conference]),
                ('Overtime', team_games[is_overtime]),
            ]:
                values = calculate_stats(games, 'Team', 'Team_Score', 'Opponent_Score')
                stats.update({f'{label}_{suffix}': value for suffix, value in zip(stat_suffixes, values)})

            summary_list.append(stats)

    summary_df = pd.DataFrame(summary_list)
    return summary_df



# Pass a copy of the games table to the summary function
new_df = df.copy()
# Run the summary on the full dataset (tqdm shows per-season progress)


# NOTE: slow - the recorded run below took 08:41 for 122 seasons
complete_summary_df_with_tqdm = calculate_team_stats_by_season_with_tqdm(new_df, full_team_list)




Processing seasons: 100%|██████████| 122/122 [08:41<00:00,  4.27s/it]


# New Adjustment

## trying to calculate the points for the season 

Want to calculate this for each scenario.

In [10]:
complete_summary_df_with_tqdm.head(1)   

Unnamed: 0,Season_Year,Team,Home_Wins,Home_Loses,Home_Ties,Home_Win_Pct,Home_Goals_Scored_Per_Game,Home_Goals_Allowed_Per_Game,Away_Wins,Away_Loses,...,Conference_Ties,Conference_Win_Pct,Conference_Goals_Scored_Per_Game,Conference_Goals_Allowed_Per_Game,Overtime_Wins,Overtime_Loses,Overtime_Ties,Overtime_Win_Pct,Overtime_Goals_Scored_Per_Game,Overtime_Goals_Allowed_Per_Game
0,1901,10,0,0,0,0.0,0.0,0.0,0,0,...,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0


In [11]:
### Save the results to a csv file
complete_summary_df_with_tqdm.to_csv('..//data//tableau//team_stats_by_season_all_time.csv', index=False)

# Load the results from the csv file
# complete_summary_df_with_tqdm = pd.read_csv('..//data//team_stats_by_season_all_time.csv')


In [12]:
## Clean the data by dropping unnecessary / empty rows.

## The summary holds a row for every (season, team) combination, so a team that did not
## play in a season has nothing but 0s in the stats columns; those rows are dropped here.

# .assign() returns a new frame. A plain `working_df = complete_summary_df_with_tqdm` would
# only alias the summary table, so adding Total_Games would silently modify it as well.
working_df = complete_summary_df_with_tqdm.assign(
    Total_Games=lambda stats: stats['Overall_Wins'] + stats['Overall_Loses'] + stats['Overall_Ties']
)

print(len(working_df))
# If a row has no total games, drop the row
working_df = working_df[working_df['Total_Games'] > 0]

print(len(working_df))



## Save the results to a csv file CLEANED
working_df.to_csv('..//data//VER2_CLEANED_team_stats_by_season_all_time.csv', index=False)





74786
6360


In [13]:
## Create a dataframe of the location of each school using google places api
# This will allow us to do some cool map visualizations

# Unique school names appearing in either the home or the away column
schools = list(set(df['Home_Team']).union(df['Away_Team']))

# Games played per team = home appearances + away appearances.
# fill_value=0 matters: a plain `+` aligns on team name and yields NaN for any team that
# appears on only one side, which would push it to the bottom of the ranking regardless
# of how many games it actually played.
team_games = (
    df['Home_Team'].value_counts()
    .add(df['Away_Team'].value_counts(), fill_value=0)
    .astype(int)
    .sort_values(ascending=False)
)

# # create a histogram of the number of games played by each team
# fig, ax = plt.subplots(figsize=(12, 8))
# ax.hist(team_games, bins=50)

# create a list of the 100 teams that have played the most games
team_list = team_games.head(100).index.tolist()
# alternative: keep teams that have played at least a minimum number of games
# team_list = team_games[team_games >= 50].index.tolist()
len(team_list)
## print a sample of the schools
# team_list[10:]

100

In [14]:
### Create a dictionary of the location of each school using google places api
import sys

import googlemaps
from datetime import datetime
sys.path.append("../TEMP") # add TEMP folder to path to import api key

from keys import API_KEY # import the api key

# print(API_KEY)


ModuleNotFoundError: No module named 'googlemaps'

In [None]:
####### BLOCK THAT LOOKS UP TEAM LOCATIONS - TURNED OFF TO REDUCE THE NUMBER OF API CALLS

### THE RESULTS OF THE API CALLS ARE SAVED IN THE FILE THAT IS LOADED BELOW

# import requests
# import json
# import pandas as pd  # Importing pandas for DataFrame
# from tqdm import tqdm  # Importing tqdm for the progress bar

# # team_list = ["Harvard", "Yale", "MIT"]  # Replace with your actual list
# additional_term = "college hockey"

# def fetch_places(school, term, api_key):
#     query = f"{school} {term}"
#     url = f"https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input={query}&inputtype=textquery&fields=formatted_address,name,geometry&key={api_key}"
#     response = requests.get(url)
#     if response.status_code == 200:
#         return response.json()
#     else:
#         return None

# # List to store results
# coordinates_list = []

# # Adding a progress bar with tqdm
# for school in tqdm(team_list, desc="Fetching Places"):
#     results = fetch_places(school, additional_term, API_KEY)
#     if results and results['candidates']:
#         best_result = results['candidates'][0]  # Assuming the first result is the best
#         lat = best_result['geometry']['location']['lat']
#         lng = best_result['geometry']['location']['lng']
#         coordinates_list.append({
#             'School': school,
#             'Latitude': lat,
#             'Longitude': lng
#         })

# # Create DataFrame
# places_df = pd.DataFrame(coordinates_list)

# # Save DataFrame to CSV
# places_df.to_csv("school_coordinates.csv", index=False)

# # Show the DataFrame (Optional)
# print(places_df)




In [None]:

# save the team locations table to a csv file in the data folder
# places_df.to_csv('..//data//top100_team_locations.csv', index=False)

## Load the team locations table from the csv file from previous step
places_df = pd.read_csv('..//data//top100_team_locations.csv')

In [None]:
print(len(df))

df.sample(10)

In [None]:
## Save teh complete summary table to a csv file for review
# complete_summary_df.to_csv('..//data//VER3_complete_summary_table.csv', index=False)

In [None]:
filtered_data_no_exhibition = df.copy()

In [None]:
# Per-team tallies: overall, as the home side only, and as the away side only
overall_record = defaultdict(lambda: {'Wins': 0, 'Losses': 0, 'Ties': 0, 'OT': 0})
home_record = defaultdict(lambda: {'Wins': 0, 'Losses': 0, 'Ties': 0, 'OT': 0})
away_record = defaultdict(lambda: {'Wins': 0, 'Losses': 0, 'Ties': 0, 'OT': 0})

# Walk the games once; zip over the needed columns avoids building a Series per row.
game_columns = zip(
    filtered_data_no_exhibition['Home_Team'],
    filtered_data_no_exhibition['Home_Score'],
    filtered_data_no_exhibition['Away_Team'],
    filtered_data_no_exhibition['Away_Score'],
    filtered_data_no_exhibition['OT'],
)
for home_team, home_score, away_team, away_score, is_ot in game_columns:
    # Decide the outcome once, then apply it to the overall and the home/away tallies.
    if home_score > away_score:
        home_result, away_result = 'Wins', 'Losses'
    elif home_score < away_score:
        home_result, away_result = 'Losses', 'Wins'
    else:
        home_result, away_result = 'Ties', 'Ties'

    overall_record[home_team][home_result] += 1
    overall_record[away_team][away_result] += 1
    home_record[home_team][home_result] += 1
    away_record[away_team][away_result] += 1

    # Any non-null OT value marks the game as an overtime game for both teams
    if pd.notna(is_ot):
        overall_record[home_team]['OT'] += 1
        overall_record[away_team]['OT'] += 1
        home_record[home_team]['OT'] += 1
        away_record[away_team]['OT'] += 1


def _records_to_frame(records, prefix):
    """Turn a {team: tallies} mapping into a frame with a 'Team' column followed by
    '<prefix>_'-prefixed tallies, '<prefix>_Total_Games' and '<prefix>_Win_Percent'."""
    frame = pd.DataFrame.from_dict(records, orient='index').add_prefix(f'{prefix}_')
    frame[f'{prefix}_Total_Games'] = frame[[f'{prefix}_Wins', f'{prefix}_Losses', f'{prefix}_Ties']].sum(axis=1)
    frame[f'{prefix}_Win_Percent'] = (frame[f'{prefix}_Wins'] / frame[f'{prefix}_Total_Games']) * 100
    return frame.rename_axis('Team').reset_index()


# Building the frames inside a helper keeps its loop/local names out of the notebook's
# globals; a top-level `for df, prefix in ...` loop would rebind `df`, the games table.
overall_df = _records_to_frame(overall_record, 'Overall')
home_df = _records_to_frame(home_record, 'Home')
away_df = _records_to_frame(away_record, 'Away')

# Merge DataFrames for a comprehensive view (inner joins: a team needs home and away games)
final_df = overall_df.merge(home_df, on='Team').merge(away_df, on='Team')

# Filter the final_df to only include teams in the top 100
final_df = final_df[final_df['Team'].isin(team_list)]

# Sort by overall winning percentage
final_df_sorted = final_df.sort_values('Overall_Win_Percent', ascending=False)


final_df_sorted.head()


In [None]:
## Keep only teams that have at least one overall win
final_df_sorted = final_df_sorted.loc[final_df_sorted['Overall_Wins'] > 0]

## Bar chart of the first ten (green) and last ten (red) rows of final_df_sorted,
## i.e. the two ends of the ranking by overall win percentage.
fig, ax = plt.subplots(figsize=(20, 10))
for end_rows, bar_color in ((final_df_sorted.head(10), 'green'), (final_df_sorted.tail(10), 'red')):
    ax.bar(end_rows['Team'], end_rows['Overall_Win_Percent'], color=bar_color)

# Labels, title and horizontal grid lines
ax.set_ylabel('Win Percentage')
ax.set_xlabel('Team')
ax.set_title('Top 10 and Bottom 10 Teams by Overall Win Percentage')
ax.grid(axis='y')

plt.show()

In [None]:
## Distribution of overall win percentage across the teams in final_df_sorted
fig, ax = plt.subplots(figsize=(20, 10))

# 20 equal-width bins
ax.hist(final_df_sorted['Overall_Win_Percent'], bins=20, color='blue')

# Labels, title and horizontal grid lines
ax.set_ylabel('Frequency')
ax.set_xlabel('Win Percentage')
ax.set_title('Histogram of Overall Win Percentage')
ax.grid(axis='y')

plt.show()

In [None]:
##### CODE TO MAKE AND PLOT A ROLLING 4 YEAR AVERAGE OF THE WIN PERCENTAGE
# Calendar year of each game (the year of its Date, not the Season_Year)
df_orig['Year'] = pd.to_datetime(df_orig['Date']).dt.year

# Exhibition games are left out of the yearly tallies
filtered_df = df_orig.loc[df_orig['Conference'].ne('Exhibition')]

# team -> year -> number of wins / number of games
annual_wins = defaultdict(lambda: defaultdict(int))
annual_games = defaultdict(lambda: defaultdict(int))

# One pass over the games; ties add to the game counts but to neither team's wins.
for home_team, home_score, away_team, away_score, year in zip(
    filtered_df['Home_Team'],
    filtered_df['Home_Score'],
    filtered_df['Away_Team'],
    filtered_df['Away_Score'],
    filtered_df['Year'],
):
    annual_games[home_team][year] += 1
    annual_games[away_team][year] += 1

    if home_score > away_score:
        annual_wins[home_team][year] += 1
    elif home_score < away_score:
        annual_wins[away_team][year] += 1

# Every (team, year) with at least one game, and the matching win percentage
team_years = [(team, year) for team, years in annual_games.items() for year in years]
win_percentages = [
    (annual_wins[team].get(year, 0) / annual_games[team][year]) * 100
    for team, year in team_years
]

# Assemble the table and order it by Team, then Year
annual_win_df = pd.DataFrame(team_years, columns=['Team', 'Year'])
annual_win_df['Win_Percentage'] = win_percentages
annual_win_df = annual_win_df.sort_values(['Team', 'Year'])

# Display the first few rows to verify the calculations
annual_win_df.head()


In [None]:
## Calculate and add the rolling 4 year average to the DataFrame

# Rolling mean of Win_Percentage over each team's current and previous three rows.
# The window counts rows (years in which the team has games), not calendar years.
# transform() returns values aligned to annual_win_df's index, so they assign directly.
annual_win_df['4Yr_Rolling_Avg'] = (
    annual_win_df.groupby('Team')['Win_Percentage']
    .transform(lambda team_pct: team_pct.rolling(window=4).mean())
)

# Display the first few rows to verify the calculations
# annual_win_df.head(10)


In [None]:
## Create a plot of the rolling average for big ten teams

import matplotlib.pyplot as plt

# Line colour for each team drawn in the chart
team_colors = {
    'Michigan State': 'green',
    'Michigan': 'blue',
    'Minnesota': 'maroon',
    'Penn State': 'navy',
    'Wisconsin': 'red',
    'Notre Dame': 'gold',
    'Ohio State': 'gray'
}

# The teams of interest are exactly the teams that have a colour assigned
teams_of_interest = list(team_colors)
filtered_annual_win_df = annual_win_df.loc[annual_win_df['Team'].isin(teams_of_interest)]

# One line per team, drawn in the order the colours are listed
fig, ax = plt.subplots(figsize=(14, 8))
for team, color in team_colors.items():
    team_data = filtered_annual_win_df.loc[filtered_annual_win_df['Team'] == team]
    ax.plot(team_data['Year'], team_data['4Yr_Rolling_Avg'], label=team, color=color)

ax.set_title('4-Year Rolling Average of Win Percentage')
ax.set_xlabel('Year')
ax.set_ylabel('4-Year Rolling Average Win %')
ax.legend()
ax.grid(True)
plt.show()
