In [47]:
# Created by Ian Cox | 2/24/2023
# AHEAD | March Madness 2023 Bracket Challenge
# Tournament Prediction | Data prep 2
# data formatting in order to have a single csv file to use for ml
# create aggregate season stats for each team for every season

#data: MRegularSeasonDetailedResults.csv

import pandas as pd
import os
import glob

In [10]:
os.chdir('C:\\Users\\IanCox\\OneDrive - AHEAD\\Documents\\python\\march_madness\\mens-march-mania-2022\\MDataFiles_Stage1')

In [69]:
df = pd.read_csv('MRegularSeasonDetailedResults.csv')
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100418,2022,98,1400,79,1242,76,H,0,28,67,...,13,15,23,5,24,10,15,3,5,21
100419,2022,98,1411,66,1126,63,A,0,24,59,...,21,15,24,5,23,10,19,13,2,23
100420,2022,98,1422,68,1441,49,A,0,23,56,...,24,8,11,10,18,5,16,8,2,12
100421,2022,98,1438,69,1181,68,A,0,31,65,...,17,18,22,11,25,14,14,3,9,11


In [35]:
df.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

In [36]:
# check which seasons we have data for
df.Season.unique()

array([2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022], dtype=int64)

In [66]:
# function to aggregate points data for each team for a given year, export that data as a csv
def team_season_points(year):
    df = pd.read_csv('MRegularSeasonDetailedResults.csv')
    df = df[df.Season == year]
    
    points = df.groupby('WTeamID')['WScore'].sum() + df.groupby('LTeamID')['LScore'].sum()
    games = df.groupby('WTeamID').size() + df.groupby('LTeamID').size()
    avg_points = points / games
    
    
    result = pd.concat([points, avg_points], axis=1, keys=['Total Points', 'Average Points']).sort_values(by='Total Points', ascending=False)
    result = result.rename_axis('TeamID').reset_index()
    result['year'] = year
    result = result[['TeamID', 'Total Points', 'Average Points', 'year']]
    fstring = str('team_season_points_%s.csv' % year)
    result.to_csv(fstring, index=False)
    return result

In [67]:
# testing the output
team_season_points(2003)

Unnamed: 0,TeamID,Total Points,Average Points,year
0,1332,2631,82.218750,2003
1,1166,2615,79.242424,2003
2,1323,2483,80.096774,2003
3,1246,2481,77.531250,2003
4,1181,2459,81.966667,2003
...,...,...,...,...
322,1289,1446,60.250000,2003
323,1311,1442,53.407407,2003
324,1340,1426,57.040000,2003
325,1119,1342,53.680000,2003


In [70]:
# function to loop through each season, and create a df and csv for each season
for year in df.Season.unique():
    team_season_points(year)

In [71]:
# get all the csv files in the directory
all_files = glob.glob('team_season_points_*.csv')

# create an empty list to store dataframes
dfs = []

# loop through all the files and read them into dataframes
for file in all_files:
    df = pd.read_csv(file)
    dfs.append(df)

# concatenate all the dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

In [72]:
combined_df

Unnamed: 0,TeamID,Total Points,Average Points,year
0,1332,2631.0,82.218750,2003
1,1166,2615.0,79.242424,2003
2,1323,2483.0,80.096774,2003
3,1246,2481.0,77.531250,2003
4,1181,2459.0,81.966667,2003
...,...,...,...,...
6887,1167,909.0,60.600000,2022
6888,1271,893.0,63.785714,2022
6889,1175,,,2022
6890,1237,,,2022


In [73]:
# export the seasonal aggregate points into a single csv
combined_df.to_csv('team_yearly_points.csv',index=False)

## Summary

* In this script we use MRegularSeasonDetailedResults.csv from the kaggle data. This file contains many kpis from regular season play for each team for every season.

* The above script aggregates the stats for each team for every year, then outputs that year to a csv e.g. team_season_points_.csv.

* After it creates csvs for each year on record, it reads back in all of the csvs into dataframes, then stacks all of the dataframes together.

* After the dataframes are stacked the big dataframe containing all team aggreagated stats for all years are written out to a csv


### BE WARNED

* <b>We only aggreagated POINTS for each team.</b> 

* There is a TON more information in the MRegularSeasonDetailedResults.csv file that we could have used to create a more detailed history of each teams' seasons. 

* If you are looking to improve your model, spend much more time with MRegularSeasonDetailedResults.csv...