In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Load the detailed regular-season results, the detailed NCAA tournament
# results, and the Massey ordinal rankings from the Stage 1 data directory.
regularseason = pd.read_csv('MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')
postseason = pd.read_csv('MDataFiles_Stage1/MNCAATourneyDetailedResults.csv')
rankings = pd.read_csv('MDataFiles_Stage1/MMasseyOrdinals.csv')
# Keep only ranking rows whose SystemName is 'POM' and whose RankingDayNum is
# at least 128 (presumably the late-season Ken Pomeroy rankings -- confirm).
kenpom = rankings[(rankings['SystemName'] == 'POM') & (rankings['RankingDayNum'] >= 128)]

In [3]:
def add_rate_cols(df):
    """Add derived rate statistics for a team and its opponent to ``df``.

    The frame is modified in place (24 new columns are appended in a fixed
    order) and is also returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Score, FGM, FGA, FGM3, FGA3, FTA, OR, DR,
        Ast, TO, Stl, Blk and the same names prefixed with ``Opp_``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Zero denominators are not guarded; pandas yields ``inf``/``NaN`` there.
    NOTE(review): 'possessions' halves a one-sided estimate; the 0.5 factor
    usually belongs to the team+opponent average -- confirm this is intended.
    """
    # Shot attempts with free-throw attempts weighted by 0.475.
    own_shots = df['FGA'] + .475 * df['FTA']
    opp_shots = df['Opp_FGA'] + .475 * df['Opp_FTA']

    # The same quantity with turnovers added on top.
    own_plays = own_shots + df['TO']
    opp_plays = opp_shots + df['Opp_TO']

    own_boards = df['OR'] + df['DR']
    opp_boards = df['Opp_OR'] + df['Opp_DR']

    # Insertion order of this dict is the order the columns are appended in.
    derived = {
        'possessions': .5 * (own_shots - df['OR'] + df['TO']),
        'eFG': (df['FGM'] + .5 * df['FGM3']) / df['FGA'],
        'TS%': df['Score'] / (2 * own_shots),
        'FTr': df['FTA'] / df['FGA'],
        '3PAr': df['FGA3'] / own_shots,
        'OR%': df['OR'] / (df['OR'] + df['Opp_DR']),
        'DR%': df['DR'] / (df['DR'] + df['Opp_OR']),
        'REB%': own_boards / (df['OR'] + df['Opp_DR'] + df['Opp_OR'] + df['DR']),
        'TR': own_boards,
        'ATOr': df['Ast'] / df['TO'],
        'Ast%': df['Ast'] / df['FGM'],
        'Stl%': df['Stl'] / opp_plays,
        'Blk%': df['Blk'] / df['Opp_FGA'],
        'TO_r': df['TO'] / own_plays,
        'Opp_eFG': (df['Opp_FGM'] + .5 * df['Opp_FGM3']) / df['Opp_FGA'],
        'Opp_TSpct': df['Opp_Score'] / (2 * opp_shots),
        'Opp_FTr': df['Opp_FTA'] / df['Opp_FGA'],
        'Opp_3Pr': df['Opp_FGA3'] / opp_shots,
        'Opp_TR': opp_boards,
        'Opp_ATOr': df['Opp_Ast'] / df['Opp_TO'],
        'Opp_Ast_%': df['Opp_Ast'] / df['Opp_FGM'],
        'Opp_Stl_%': df['Opp_Stl'] / own_plays,
        'Opp_Blk_%': df['Opp_Blk'] / df['FGA'],
        'Opp_TO_r': df['Opp_TO'] / opp_plays,
    }

    for column, values in derived.items():
        df[column] = values

    return df

In [4]:
def _team_side_stats(games, prefix):
    """Return the ``prefix``-side ('W' or 'L') columns of ``games`` with the prefix removed."""
    # Match on the leading character only. A substring match would also pick
    # up 'WLoc' on the 'L' side; 'WLoc' is a string game attribute rather than
    # a box-score stat, so it is excluded from both sides.
    cols = [c for c in games.columns if c.startswith(prefix) and c != 'WLoc']
    return games[cols].rename(columns=lambda c: c[len(prefix):])


def team_regular_season(team_id, Season=None, DayNum=None, average=True):
    """Collect a team's regular-season box scores together with its opponents'.

    Rows are taken from the module-level ``regularseason`` frame. Games the
    team won contribute their ``W*`` columns as the team's stats and their
    ``L*`` columns as the opponent's stats; games it lost contribute the
    reverse. Opponent columns are prefixed with ``Opp_``. Rate columns are
    appended by ``add_rate_cols``.

    Parameters
    ----------
    team_id : int
        Team identifier matched against ``WTeamID`` / ``LTeamID``.
    Season : int, optional
        Only games from this season are used. ``None`` (or the builtin
        ``all``, the historical default) applies no season filter.
    DayNum : int, optional
        Only games with ``DayNum`` strictly less than this value are used.
        ``None`` (or the builtin ``all``) applies no day filter.
    average : bool, default True
        If truthy, return one row of per-game means for the team; otherwise
        return one row per game.

    Returns
    -------
    pandas.DataFrame
        Team stats, ``Opp_``-prefixed opponent stats and the derived rate
        columns. Empty (zero rows) if no game matches the filters.

    Notes
    -----
    Earlier versions selected columns with ``filter(like=...)``, which leaked
    'Loc'/'WLoc' string columns into the ``average=False`` result and made the
    group mean fail on pandas versions that no longer drop non-numeric
    columns silently. Those columns are no longer returned.
    """
    def _is_unset(value):
        # `all` was the old default value; comparing against it never worked,
        # so it is treated as "no filter" for backward compatibility.
        return value is None or value is all

    in_scope = pd.Series(True, index=regularseason.index)
    if not _is_unset(Season):
        in_scope &= regularseason['Season'] == Season
    if not _is_unset(DayNum):
        in_scope &= regularseason['DayNum'] < DayNum

    teamwins = regularseason[in_scope & (regularseason['WTeamID'] == team_id)]
    teamlosses = regularseason[in_scope & (regularseason['LTeamID'] == team_id)]

    teamoffense = pd.concat([_team_side_stats(teamwins, 'W'),
                             _team_side_stats(teamlosses, 'L')])
    teamopponents = pd.concat([_team_side_stats(teamwins, 'L'),
                               _team_side_stats(teamlosses, 'W')])

    # Both halves keep the original row labels, so axis=1 lines up each game
    # with its opponent's numbers.
    teamtotal = pd.concat([teamoffense, teamopponents.add_prefix('Opp_')], axis=1)
    teamtotal = teamtotal.drop(columns='Opp_TeamID')

    if average:
        teamtotal = teamtotal.groupby('TeamID').mean().reset_index()

    # Rates are computed after averaging, i.e. from the mean counting stats.
    return add_rate_cols(teamtotal)


In [5]:
# Time a single averaged team lookup (team 1266, season 2020, games before day 132).
start = time.time()
marquette = team_regular_season(1266, 2020, 132, True)
end = time.time()
print('Execution time = %.6f seconds' % (end-start))

Execution time = 0.056832 seconds


In [6]:
# Show the column layout: team stats, Opp_-prefixed opponent stats, then the derived rates.
marquette.columns

Index(['TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR',
       'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'Opp_Score', 'Opp_FGM',
       'Opp_FGA', 'Opp_FGM3', 'Opp_FGA3', 'Opp_FTM', 'Opp_FTA', 'Opp_OR',
       'Opp_DR', 'Opp_Ast', 'Opp_TO', 'Opp_Stl', 'Opp_Blk', 'Opp_PF',
       'possessions', 'eFG', 'TS%', 'FTr', '3PAr', 'OR%', 'DR%', 'REB%', 'TR',
       'ATOr', 'Ast%', 'Stl%', 'Blk%', 'TO_r', 'Opp_eFG', 'Opp_TSpct',
       'Opp_FTr', 'Opp_3Pr', 'Opp_TR', 'Opp_ATOr', 'Opp_Ast_%', 'Opp_Stl_%',
       'Opp_Blk_%', 'Opp_TO_r'],
      dtype='object')

In [7]:
def get_KenPom(team_id, Year):
    """Look up a team's ordinal rank in the module-level ``kenpom`` table.

    Parameters
    ----------
    team_id : int
        Value matched against the ``TeamID`` column.
    Year : int
        Value matched against the ``Season`` column.

    Returns
    -------
    pandas.Series
        The matching ``OrdinalRank`` values, re-indexed from 0. It is empty
        when nothing matches and holds several entries when ``kenpom`` has
        more than one row for that team and season.
    """
    is_match = (kenpom['TeamID'] == team_id) & (kenpom['Season'] == Year)
    return kenpom.loc[is_match, 'OrdinalRank'].reset_index(drop=True)

# Example lookup: rank of team 1181 for the 2020 season.
get_KenPom(1181, 2020)

0    5
Name: OrdinalRank, dtype: int64

In [8]:
def single_game(team1_id, team2_id, Year = 2020, DayNum = 132):
    """Build the feature row for one matchup.

    Parameters
    ----------
    team1_id, team2_id : int
        Team identifiers. team1's columns get the ``W_`` prefix and team2's
        the ``L_`` prefix.
    Year : int, default 2020
        Season passed to ``team_regular_season`` and ``get_KenPom``.
    DayNum : int, default 132
        Only games strictly before this day feed the averaged stats.

    Returns
    -------
    pandas.DataFrame
        ``W_``-prefixed averages for team1, ``L_``-prefixed averages for
        team2, and a final ``fav_win`` column that is True where team1's
        ordinal rank is numerically lower than team2's. Stats are NaN when a
        team has no games before ``DayNum``. If ``get_KenPom`` returns several
        ranks, the extra rows carry NaN stats.
    """
    team1 = team_regular_season(team1_id, Year, DayNum)
    team2 = team_regular_season(team2_id, Year, DayNum)

    team1Pom = get_KenPom(team1_id, Year)
    team2Pom = get_KenPom(team2_id, Year)

    # Name the label column directly rather than renaming by a hard-coded
    # position, so it stays correct if the number of stat columns changes.
    fav_win = team1Pom.lt(team2Pom).rename('fav_win')

    return pd.concat([team1.add_prefix('W_'), team2.add_prefix('L_'), fav_win], axis = 1)


In [52]:
# DayNum=0: no games precede day 0, so the averaged stats come back NaN and
# only fav_win is populated.
duke_unc = single_game(1181, 1314, 2020, 0)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win
0,,,,,,,,,,,...,,,,,,,,,,True


In [53]:
# DayNum=2: team 1181 already has game data (W_ columns filled) while the
# L_ columns shown for team 1314 are still NaN.
duke_unc = single_game(1181, 1314, 2020, 2)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win
0,1181,68,23,64,8,24,14,23,11,19,...,,,,,,,,,,True


In [55]:
# DayNum=25: both teams have prior games, so both sides hold averaged values.
duke_unc = single_game(1181, 1314, 2020, 25)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win
0,1181,83.857143,29.714286,66.285714,7.0,21.0,17.428571,26.285714,16.428571,27.857143,...,0.465617,0.184896,0.40936,32.0,1.085714,0.527778,0.078417,0.03866,0.143524,True


In [10]:
# Same matchup using the default Year (2020) and only games before day 100.
duke_unc = single_game(1181, 1314, DayNum = 100)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win
0,1181,82.583333,30.125,62.791667,6.958333,19.833333,15.375,21.958333,12.875,26.958333,...,0.517736,0.287568,0.358779,34.916667,1.122807,0.528053,0.082791,0.063984,0.146231,True


In [44]:
def create_season_df(df, season, min_day = 12):
    """Build one feature row per game of ``season`` found in ``df``.

    For every game on or after ``min_day``, ``single_game`` is called with the
    winner as team1, the loser as team2 and the game's own ``DayNum``, so only
    games played strictly earlier feed the averaged stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Game results with ``Season``, ``DayNum``, ``WTeamID``, ``LTeamID``
        and ``WLoc`` columns.
    season : int
        Season to extract.
    min_day : int, default 12
        Games with ``DayNum`` below this value are skipped.

    Returns
    -------
    pandas.DataFrame
        ``w_loc`` followed by the ``single_game`` columns, with rows that
        contain any NaN removed and the index renumbered from 0.

    Notes
    -----
    ``w_loc`` is attached to each game before the NaN rows are dropped.
    Inserting it afterwards by index paired every row that followed a dropped
    game with another game's ``WLoc``.

    This is slow: ``single_game`` is called once per game (the recorded runs
    took about 650 seconds for one regular season).
    """
    season_df = df[(df['Season'] == season) & (df['DayNum'] >= min_day)]

    season_games = []
    matchups = zip(season_df.WTeamID, season_df.LTeamID,
                   season_df.Season, season_df.DayNum, season_df.WLoc)
    for w_team, l_team, year, day, w_loc in matchups:
        game = single_game(w_team, l_team, Year = year, DayNum = day)
        # The scalar is broadcast to every row of this game's frame, so the
        # location stays with its own game through dropna().
        game.insert(0, "w_loc", w_loc)
        season_games.append(game)

    result = pd.concat(season_games, axis = 0).dropna()
    return result.reset_index(drop = True)

In [29]:
# NOTE: slow cell -- the recorded run below took about 658 seconds (~11 minutes).
start_time = time.time()

regularseason17 = create_season_df(regularseason, 2017)

end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))

Execution time = 658.176695 seconds


In [37]:
# Rows = games kept after dropna(); columns = w_loc + the single_game columns.
regularseason17.shape

(5184, 108)

In [45]:
# Build one matchup table per tournament season, 2012 through 2019.
postseason12 = create_season_df(postseason, 2012)
postseason13 = create_season_df(postseason, 2013)
postseason14 = create_season_df(postseason, 2014)
postseason15 = create_season_df(postseason, 2015)
postseason16 = create_season_df(postseason, 2016)
postseason17 = create_season_df(postseason, 2017)
postseason18 = create_season_df(postseason, 2018)
postseason19 = create_season_df(postseason, 2019)


In [46]:
# Build and time the 2012 regular-season matchup table (slow: ~627 seconds in the recorded run).
start_time = time.time()

regularseason12 = create_season_df(regularseason, 2012)

end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))

Execution time = 627.307349 seconds


In [47]:
# Build the 2013 regular-season matchup table (slow; each season is a full per-game loop).
regularseason13 = create_season_df(regularseason, 2013)

In [48]:
# Build the 2014 regular-season matchup table (slow; each season is a full per-game loop).
regularseason14 = create_season_df(regularseason, 2014)

In [49]:
# Build the 2015 regular-season matchup table (slow; each season is a full per-game loop).
regularseason15 = create_season_df(regularseason, 2015)

In [50]:
# Persist the tournament tables under ./data/. to_csv is called with its
# defaults, so the row index is written as the first (unnamed) column.
# NOTE(review): ./data presumably has to exist before this cell runs -- confirm.
postseason12.to_csv('./data/postseason12.csv')
postseason13.to_csv('./data/postseason13.csv')
postseason14.to_csv('./data/postseason14.csv')
postseason15.to_csv('./data/postseason15.csv')
postseason16.to_csv('./data/postseason16.csv')
postseason17.to_csv('./data/postseason17.csv')
postseason18.to_csv('./data/postseason18.csv')
postseason19.to_csv('./data/postseason19.csv')

In [51]:
# Persist the 2012-2015 regular-season tables under ./data/ (index included,
# as with the tournament files). regularseason17 is not written in this cell.
regularseason12.to_csv('./data/regularseason12.csv')
regularseason13.to_csv('./data/regularseason13.csv')
regularseason14.to_csv('./data/regularseason14.csv')
regularseason15.to_csv('./data/regularseason15.csv')
