In [6]:
import pandas as pd
import numpy as np
import os

# Read in the data
df = pd.read_csv('../data/cfb_scrape_transformed.csv')

# game_data = df
# # change column names to lowercase
# game_data.columns = map(str.lower, game_data.columns)


  df = pd.read_csv('../data/cfb_scrape_transformed.csv')


In [7]:
# Adjusting the transformation code to align with the current column names and structure

# Calculate the point difference for each game
df['point_diff'] = df['winner_pts'] - df['loser_pts']

# Extracting data for winners and losers
winner_data = df[['season', 'winner', 'winner_conference', 'loc_ind', 'game_type', 'point_diff']].copy()
winner_data.columns = ['season', 'team', 'conference', 'location', 'game_type', 'point_diff']
winner_data['result'] = 'win'
loser_data = df[['season', 'loser', 'loser_conference', 'loc_ind', 'game_type', 'point_diff']].copy()
loser_data.columns = ['season', 'team', 'conference', 'location', 'game_type', 'point_diff']
loser_data['result'] = 'loss'
loser_data['point_diff'] = -loser_data['point_diff']
game_data = pd.concat([winner_data, loser_data], axis=0).reset_index(drop=True)

# Handling ties (where point_diff == 0)
ties = df[df['point_diff'] == 0][['season', 'winner', 'winner_conference', 'loc_ind', 'game_type', 'point_diff']].copy()
ties.columns = ['season', 'team', 'conference', 'location', 'game_type', 'point_diff']
ties['result'] = 'tie'
game_data = pd.concat([game_data, ties], axis=0).reset_index(drop=True)

# Aggregated transformations
team_conference = game_data[['season', 'team', 'conference']].drop_duplicates()
overall_stats = game_data.groupby(['season', 'team', 'result']).size().unstack(fill_value=0)
overall_stats.columns = ['overall_' + col for col in overall_stats.columns]
overall_stats.reset_index(inplace=True)
home_stats = game_data[game_data['location'] == ''].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)
home_stats.columns = ['home_' + col for col in home_stats.columns]
away_stats = game_data[game_data['location'] == '@'].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)
away_stats.columns = ['away_' + col for col in away_stats.columns]
location_stats = pd.concat([home_stats, away_stats], axis=1).fillna(0).reset_index()
non_conference_stats = game_data[game_data['game_type'] == 'Non-Conference'].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)
non_conference_stats.columns = ['non_conference_' + col for col in non_conference_stats.columns]
game_type_stats = non_conference_stats.fillna(0).reset_index()
blowout_wins = game_data[game_data['point_diff'] >= 21].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)['win'].reset_index()
blowout_wins.columns = ['season', 'team', 'blowout_wins']
blowout_losses = game_data[game_data['point_diff'] <= -21].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)['loss'].reset_index()
blowout_losses.columns = ['season', 'team', 'blowout_losses']
close_wins = game_data[(game_data['point_diff'] <= 7) & (game_data['point_diff'] > 0)].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)['win'].reset_index()
close_wins.columns = ['season', 'team', 'close_wins']
close_losses = game_data[(game_data['point_diff'] >= -7) & (game_data['point_diff'] < 0)].groupby(['season', 'team', 'result']).size().unstack(fill_value=0)['loss'].reset_index()
close_losses.columns = ['season', 'team', 'close_losses']
point_margin_stats = pd.merge(blowout_wins, blowout_losses, on=['season', 'team'], how='outer')
point_margin_stats = pd.merge(point_margin_stats, close_wins, on=['season', 'team'], how='outer')
point_margin_stats = pd.merge(point_margin_stats, close_losses, on=['season', 'team'], how='outer')
point_margin_stats = point_margin_stats.fillna(0)

# Merging all dataframes to get the final aggregated dataframe with conference info
final_df = pd.merge(team_conference, overall_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, location_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, game_type_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, point_margin_stats, on=['season', 'team'], how='outer')
final_df = final_df.fillna(0)

final_df.head()


Unnamed: 0,season,team,conference,overall_loss,overall_tie,overall_win,away_loss,away_tie,away_win,non_conference_loss,non_conference_tie,non_conference_win,blowout_wins,blowout_losses,close_wins,close_losses
0,1869,Rutgers,Independent/Unknown,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1869,Princeton,Independent/Unknown,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1870,Rutgers,Independent/Unknown,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
3,1870,Princeton,Independent/Unknown,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1872,Rutgers,Independent/Unknown,1.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0


In [8]:
final_df.sample(10)

# save the final dataframe to a csv file for use in Tableau
final_df.to_csv('../data/cfb_yearly_summary_stats.csv', index=False)

In [9]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25381 entries, 0 to 25380
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   season               25381 non-null  int64  
 1   team                 25381 non-null  object 
 2   conference           25381 non-null  object 
 3   overall_loss         25381 non-null  float64
 4   overall_tie          25381 non-null  float64
 5   overall_win          25381 non-null  float64
 6   away_loss            25381 non-null  float64
 7   away_tie             25381 non-null  float64
 8   away_win             25381 non-null  float64
 9   non_conference_loss  25381 non-null  float64
 10  non_conference_tie   25381 non-null  float64
 11  non_conference_win   25381 non-null  float64
 12  blowout_wins         25381 non-null  float64
 13  blowout_losses       25381 non-null  float64
 14  close_wins           25381 non-null  float64
 15  close_losses         25381 non-null 

In [None]:
df.head(20)

In [None]:
cfb_data = df

# Determine wins, losses, and ties for each team in each season
# Wins
win_counts = cfb_data.groupby(['winner', 'season']).size().reset_index(name='Win')
# Losses
loss_counts = cfb_data.groupby(['loser', 'season']).size().reset_index(name='Loss')
# Merging wins and losses
season_summary = pd.merge(win_counts, loss_counts, left_on=['winner', 'season'], right_on=['loser', 'season'], how='outer')
season_summary['Team'] = season_summary['winner'].combine_first(season_summary['loser'])
season_summary.drop(columns=['winner', 'loser'], inplace=True)
# Filling NaN values with 0
season_summary.fillna(0, inplace=True)

season_summary.head()


In [None]:
# Calculate the total number of games played
season_summary['GamesPlayed'] = season_summary['Win'] + season_summary['Loss']

# Calculate win percentages
season_summary['Win_Pct'] = season_summary['Win'] / season_summary['GamesPlayed']

# Calculate total points scored and allowed by each team in each season
points_scored = cfb_data.groupby(['winner', 'season'])['winner_pts'].sum().reset_index()
points_allowed = cfb_data.groupby(['loser', 'season'])['loser_pts'].sum().reset_index()

# Merging points scored and points allowed
season_summary = pd.merge(season_summary, points_scored, left_on=['Team', 'season'], right_on=['winner', 'season'], how='left')
season_summary = pd.merge(season_summary, points_allowed, left_on=['Team', 'season'], right_on=['loser', 'season'], how='left')
season_summary.drop(columns=['winner', 'loser'], inplace=True)

# Rename columns
season_summary.rename(columns={'winner_pts': 'PointsScored', 'loser_pts': 'PointsAllowed'}, inplace=True)

# Fill NaN values with 0
season_summary.fillna(0, inplace=True)

# Calculate average points scored and allowed
season_summary['AVGPointsScored'] = season_summary['PointsScored'] / season_summary['GamesPlayed']
season_summary['AVGPointsAllowed'] = season_summary['PointsAllowed'] / season_summary['GamesPlayed']

# Calculate point differentials
season_summary['PointDiff'] = season_summary['PointsScored'] - season_summary['PointsAllowed']
season_summary['AVGPointDiff'] = season_summary['PointDiff'] / season_summary['GamesPlayed']

season_summary.head()


In [None]:
season_summary.sample(20)