In [6]:
import pandas as pd
import numpy as np
import os

# Read in the data
df = pd.read_csv('../data/cfb_scrape_transformed.csv')

# game_data = df
# # change column names to lowercase
# game_data.columns = map(str.lower, game_data.columns)


  df = pd.read_csv('../data/cfb_scrape_transformed.csv')


In [7]:
import pandas as pd

# Load data and initial transformations

df['point_diff'] = df['winner_pts'] - df['loser_pts']

# Extract data for winners and losers
winner_data = df[['season', 'winner', 'winner_conference', 'loc_ind', 'game_type', 'point_diff', 'winner_pts', 'loser_pts']].copy()
winner_data.columns = ['season', 'team', 'conference', 'location', 'game_type', 'point_diff', 'team_pts', 'opponent_pts']
winner_data['result'] = 'win'

loser_data = df[['season', 'loser', 'loser_conference', 'loc_ind', 'game_type', 'point_diff', 'loser_pts', 'winner_pts']].copy()
loser_data.columns = ['season', 'team', 'conference', 'location', 'game_type', 'point_diff', 'team_pts', 'opponent_pts']
loser_data['result'] = 'loss'
loser_data['point_diff'] = -loser_data['point_diff']

game_data = pd.concat([winner_data, loser_data], axis=0).reset_index(drop=True)

# Handling ties
ties = df[df['point_diff'] == 0][['season', 'winner', 'winner_conference', 'loc_ind', 'game_type', 'point_diff', 'winner_pts', 'loser_pts']].copy()
ties.columns = ['season', 'team', 'conference', 'location', 'game_type', 'point_diff', 'team_pts', 'opponent_pts']
ties['result'] = 'tie'
game_data = pd.concat([game_data, ties], axis=0).reset_index(drop=True)

# Define function to calculate stats
def calculate_stats(data, prefix=''):
    stats = data.groupby(['season', 'team', 'result']).size().unstack(fill_value=0)
    stats.columns = [prefix + col for col in stats.columns]
    stats.reset_index(inplace=True)
    
    # Calculate win percentage
    total_games = stats[[col for col in stats.columns if col.endswith(('win', 'loss', 'tie'))]].sum(axis=1)
    stats[prefix+'win_percentage'] = (stats.get(prefix+'win', 0) / total_games) * 100
    
    return stats

# Calculate stats for various situations
overall_stats = calculate_stats(game_data, 'overall_')
home_stats = calculate_stats(game_data[game_data['location'].isna()], 'home_')
away_stats = calculate_stats(game_data[game_data['location'] == '@'], 'away_')
conference_stats = calculate_stats(game_data[game_data['conference'] == game_data['game_type']], 'conference_')
non_conference_stats = calculate_stats(game_data[game_data['conference'] != game_data['game_type']], 'non_conference_')
blowout_stats = calculate_stats(game_data[game_data['point_diff'].abs() >= 21], 'blowout_')
close_stats = calculate_stats(game_data[game_data['point_diff'].abs() <= 7], 'close_')

# Merging all stats to form the final dataframe
team_conference = game_data[['season', 'team', 'conference']].drop_duplicates()
final_df = pd.merge(team_conference, overall_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, home_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, away_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, conference_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, non_conference_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, blowout_stats, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, close_stats, on=['season', 'team'], how='outer')

# Filling NaN values with 0
final_df = final_df.fillna(0)

final_df.head()


Unnamed: 0,season,team,conference,overall_loss,overall_tie,overall_win,overall_win_percentage,home_loss,home_win,home_win_percentage,...,non_conference_tie,non_conference_win,non_conference_win_percentage,blowout_loss,blowout_win,blowout_win_percentage,close_loss,close_tie,close_win,close_win_percentage
0,1869,Rutgers,Independent/Unknown,1.0,0.0,1.0,50.0,1.0,1.0,50.0,...,0.0,1.0,50.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0
1,1869,Princeton,Independent/Unknown,1.0,0.0,1.0,50.0,1.0,1.0,50.0,...,0.0,1.0,50.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1870,Rutgers,Independent/Unknown,1.0,0.0,1.0,50.0,1.0,1.0,50.0,...,0.0,1.0,50.0,0.0,0.0,0.0,1.0,0.0,1.0,50.0
3,1870,Princeton,Independent/Unknown,0.0,0.0,1.0,100.0,0.0,1.0,100.0,...,0.0,1.0,100.0,0.0,0.0,0.0,0.0,0.0,1.0,100.0
4,1872,Rutgers,Independent/Unknown,1.0,1.0,2.0,50.0,1.0,1.0,50.0,...,1.0,2.0,50.0,0.0,0.0,0.0,1.0,1.0,2.0,50.0


In [8]:
# Define function to calculate points metrics
def calculate_points(data, prefix=''):
    total_points_scored = data.groupby(['season', 'team'])['team_pts'].sum().reset_index()
    total_points_scored.columns = ['season', 'team', prefix + 'total_pts_scored']
    
    total_points_allowed = data.groupby(['season', 'team'])['opponent_pts'].sum().reset_index()
    total_points_allowed.columns = ['season', 'team', prefix + 'total_pts_allowed']
    
    avg_points_scored = data.groupby(['season', 'team'])['team_pts'].mean().reset_index()
    avg_points_scored.columns = ['season', 'team', prefix + 'avg_pts_scored']
    
    avg_points_allowed = data.groupby(['season', 'team'])['opponent_pts'].mean().reset_index()
    avg_points_allowed.columns = ['season', 'team', prefix + 'avg_pts_allowed']
    
    merged = pd.merge(total_points_scored, total_points_allowed, on=['season', 'team'], how='outer')
    merged = pd.merge(merged, avg_points_scored, on=['season', 'team'], how='outer')
    merged = pd.merge(merged, avg_points_allowed, on=['season', 'team'], how='outer')
    
    return merged

# Calculate points metrics for various situations
overall_points = calculate_points(game_data, 'overall_')
home_points = calculate_points(game_data[game_data['location'].isna()], 'home_')
away_points = calculate_points(game_data[game_data['location'] == '@'], 'away_')
conference_points = calculate_points(game_data[game_data['conference'] == game_data['game_type']], 'conference_')
non_conference_points = calculate_points(game_data[game_data['conference'] != game_data['game_type']], 'non_conference_')
blowout_points = calculate_points(game_data[game_data['point_diff'].abs() >= 21], 'blowout_')
close_points = calculate_points(game_data[game_data['point_diff'].abs() <= 7], 'close_')

# Merging points metrics to the final dataframe
final_df = pd.merge(final_df, overall_points, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, home_points, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, away_points, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, conference_points, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, non_conference_points, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, blowout_points, on=['season', 'team'], how='outer')
final_df = pd.merge(final_df, close_points, on=['season', 'team'], how='outer')

# Filling NaN values with 0
final_df = final_df.fillna(0)

final_df.head()


Unnamed: 0,season,team,conference,overall_loss,overall_tie,overall_win,overall_win_percentage,home_loss,home_win,home_win_percentage,...,non_conference_avg_pts_scored,non_conference_avg_pts_allowed,blowout_total_pts_scored,blowout_total_pts_allowed,blowout_avg_pts_scored,blowout_avg_pts_allowed,close_total_pts_scored,close_total_pts_allowed,close_avg_pts_scored,close_avg_pts_allowed
0,1869,Rutgers,Independent/Unknown,1.0,0.0,1.0,50.0,1.0,1.0,50.0,...,3.0,6.0,0.0,0.0,0.0,0.0,6.0,4.0,6.0,4.0
1,1869,Princeton,Independent/Unknown,1.0,0.0,1.0,50.0,1.0,1.0,50.0,...,6.0,3.0,0.0,0.0,0.0,0.0,4.0,6.0,4.0,6.0
2,1870,Rutgers,Independent/Unknown,1.0,0.0,1.0,50.0,1.0,1.0,50.0,...,4.0,4.5,0.0,0.0,0.0,0.0,8.0,9.0,4.0,4.5
3,1870,Princeton,Independent/Unknown,0.0,0.0,1.0,100.0,0.0,1.0,100.0,...,6.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,6.0,2.0
4,1872,Rutgers,Independent/Unknown,1.0,1.0,2.0,50.0,1.0,1.0,50.0,...,2.0,2.25,0.0,0.0,0.0,0.0,8.0,9.0,2.0,2.25


In [9]:
final_df.columns

# save to csv
final_df.to_csv('../data/cfb_yearly_summary_stats.csv', index=False)