In [1]:
import os
import time

import numpy as np
import pandas as pd

In [2]:
# Raw inputs: detailed game results for the regular season and the tournament,
# plus the ordinal rankings table.
regularseason = pd.read_csv('MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')
postseason = pd.read_csv('MDataFiles_Stage1/MNCAATourneyDetailedResults.csv')
rankings = pd.read_csv('MDataFiles_Stage1/MMasseyOrdinals.csv')

# Keep only the 'POM' ranking system, restricted to rankings published on
# day 128 or later.
kenpom = rankings.loc[
    rankings['SystemName'].eq('POM') & rankings['RankingDayNum'].ge(128)
]

In [3]:
def add_rate_cols(df):
    """Append derived rate/efficiency columns to a box-score frame.

    The frame must already hold the team's own counting stats (``Score``,
    ``FGM``, ``FGA``, ``FGM3``, ``FGA3``, ``FTA``, ``OR``, ``DR``, ``Ast``,
    ``TO``, ``Stl``, ``Blk``) and the same stats for the opponent under an
    ``Opp_`` prefix.

    The 24 new columns are written into ``df`` itself (the frame is mutated)
    and the same object is returned. Zero denominators are not special-cased;
    pandas yields ``inf``/``NaN`` for them.
    """
    # Field-goal attempts plus weighted free-throw attempts; this quantity is
    # a denominator (or part of one) in several of the rates below.
    own_attempts = df['FGA'] + (.475 * df['FTA'])
    opp_attempts = df['Opp_FGA'] + (.475 * df['Opp_FTA'])

    own_boards = df['OR'] + df['DR']
    opp_boards = df['Opp_OR'] + df['Opp_DR']

    # Every value is computed from the raw input columns only, so the whole
    # mapping can be built up front and attached in declaration order.
    derived = {
        # --- the team's own rates ---
        'possessions': .5 * (own_attempts - df['OR'] + df['TO']),
        'eFG': (df['FGM'] + (.5 * df['FGM3'])) / df['FGA'],
        'TS%': df['Score'] / (2 * own_attempts),
        'FTr': df['FTA'] / df['FGA'],
        '3PAr': df['FGA3'] / own_attempts,
        'OR%': df['OR'] / (df['OR'] + df['Opp_DR']),
        'DR%': df['DR'] / (df['DR'] + df['Opp_OR']),
        'REB%': own_boards / (df['OR'] + df['Opp_DR'] + df['Opp_OR'] + df['DR']),
        'TR': own_boards,
        'ATOr': df['Ast'] / df['TO'],
        'Ast%': df['Ast'] / df['FGM'],
        'Stl%': df['Stl'] / (opp_attempts + df['Opp_TO']),
        'Blk%': df['Blk'] / df['Opp_FGA'],
        'TO_r': df['TO'] / (own_attempts + df['TO']),
        # --- the same rates from the opponent's side ---
        'Opp_eFG': (df['Opp_FGM'] + (.5 * df['Opp_FGM3'])) / df['Opp_FGA'],
        'Opp_TSpct': df['Opp_Score'] / (2 * opp_attempts),
        'Opp_FTr': df['Opp_FTA'] / df['Opp_FGA'],
        'Opp_3Pr': df['Opp_FGA3'] / opp_attempts,
        'Opp_TR': opp_boards,
        'Opp_ATOr': df['Opp_Ast'] / df['Opp_TO'],
        'Opp_Ast_%': df['Opp_Ast'] / df['Opp_FGM'],
        'Opp_Stl_%': df['Opp_Stl'] / (own_attempts + df['TO']),
        'Opp_Blk_%': df['Opp_Blk'] / df['FGA'],
        'Opp_TO_r': df['Opp_TO'] / (opp_attempts + df['Opp_TO']),
    }

    for column_name, values in derived.items():
        df[column_name] = values

    return df

In [4]:
def _side_stats(games, prefix):
    """Return the ``prefix``-side columns of ``games`` with the prefix removed.

    ``prefix`` is ``'W'`` or ``'L'``. Columns are selected by *prefix* (not
    substring) and exactly one leading character is removed from each name.
    ``WLoc`` is skipped: it describes the game rather than one side, it is a
    text column, and it would otherwise be picked up for the ``'W'`` side.
    """
    side_columns = [
        name for name in games.columns
        if name.startswith(prefix) and name != 'WLoc'
    ]
    return games[side_columns].rename(columns=lambda name: name[len(prefix):])


def team_regular_season(team_id,Season = all, DayNum = all, average = True):
    """Collect one team's regular-season box scores, seen from that team's side.

    Reads the module-level ``regularseason`` frame, picks the games the team
    won (``WTeamID``) and lost (``LTeamID``), and lines up the team's own
    stats next to its opponents' stats (prefixed ``Opp_``). Derived rate
    columns are then added with ``add_rate_cols``.

    Parameters
    ----------
    team_id : value compared against ``WTeamID`` / ``LTeamID``.
    Season : season to keep. The builtin ``all`` (the default) or ``None``
        means "do not filter by season".
    DayNum : keep only games with ``DayNum`` strictly below this value. The
        builtin ``all`` (the default) or ``None`` means "no day limit".
    average : if truthy, return the per-``TeamID`` mean of the raw stats (the
        rate columns are computed from those means); otherwise return one row
        per game with rate columns computed game by game.

    Returns
    -------
    pandas.DataFrame with ``TeamID``, the team's stats, the ``Opp_`` stats and
    the columns added by ``add_rate_cols``. The frame has no rows if the team
    has no matching games.
    """
    games = regularseason

    # Build the shared season/day mask once. Comparing a column against the
    # builtin `all` (the historical default) either matches nothing or raises,
    # so the sentinel is translated into "no restriction" here.
    in_scope = pd.Series(True, index=games.index)
    if Season is not all and Season is not None:
        in_scope &= games['Season'] == Season
    if DayNum is not all and DayNum is not None:
        in_scope &= games['DayNum'] < DayNum

    teamwins = games[(games['WTeamID'] == team_id) & in_scope]
    teamlosses = games[(games['LTeamID'] == team_id) & in_scope]

    # The team's own numbers sit in the W* columns of its wins and the L*
    # columns of its losses; its opponents' numbers are the mirror image.
    # Selecting by prefix keeps the text column `WLoc` out of both sides
    # (a substring match on 'W' or 'L' catches it), which would otherwise make
    # the mean below fail on pandas versions that no longer silently drop
    # non-numeric columns.
    teamoffense = pd.concat([_side_stats(teamwins, 'W'), _side_stats(teamlosses, 'L')])
    teamopponents = pd.concat([_side_stats(teamwins, 'L'), _side_stats(teamlosses, 'W')])

    # Both halves keep the original game index, so axis=1 pairs each game's
    # own stats with the opponent's stats from the same game.
    teamtotal = pd.concat([teamoffense, teamopponents.add_prefix('Opp_')], axis = 1)
    teamtotal = teamtotal.drop(columns = 'Opp_TeamID')

    if average:
        teamtotal = teamtotal.groupby('TeamID').mean().reset_index()

    return add_rate_cols(teamtotal)


In [5]:
# Time one averaged team-season lookup: team 1266, season 2020, games before
# day 132.
start = time.time()
marquette = team_regular_season(team_id=1266, Season=2020, DayNum=132, average=True)
end = time.time()
print('Execution time = %.6f seconds' % (end - start))

Execution time = 0.072804 seconds


In [6]:
# Inspect the column layout of the averaged frame returned by
# team_regular_season (raw stats, Opp_ stats, then the derived rate columns).
marquette.columns

Index(['TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR',
       'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'Opp_Score', 'Opp_FGM',
       'Opp_FGA', 'Opp_FGM3', 'Opp_FGA3', 'Opp_FTM', 'Opp_FTA', 'Opp_OR',
       'Opp_DR', 'Opp_Ast', 'Opp_TO', 'Opp_Stl', 'Opp_Blk', 'Opp_PF',
       'possessions', 'eFG', 'TS%', 'FTr', '3PAr', 'OR%', 'DR%', 'REB%', 'TR',
       'ATOr', 'Ast%', 'Stl%', 'Blk%', 'TO_r', 'Opp_eFG', 'Opp_TSpct',
       'Opp_FTr', 'Opp_3Pr', 'Opp_TR', 'Opp_ATOr', 'Opp_Ast_%', 'Opp_Stl_%',
       'Opp_Blk_%', 'Opp_TO_r'],
      dtype='object')

In [7]:
def get_KenPom(team_id, Year):
    """Look up a team's ``OrdinalRank`` in the module-level ``kenpom`` frame.

    Returns a Series named ``OrdinalRank`` with a fresh 0..n-1 index holding
    every row that matches ``team_id`` and ``Year``. The Series is empty when
    nothing matches and may hold more than one value when several ranking
    rows match.
    """
    matches = (kenpom['TeamID'] == team_id) & (kenpom['Season'] == Year)
    return kenpom.loc[matches, 'OrdinalRank'].reset_index(drop=True)

# Example lookup: rank of team 1181 for the 2020 season.
get_KenPom(1181, 2020)

0    5
Name: OrdinalRank, dtype: int64

In [8]:
def single_game(team1_id, team2_id, Year = 2020, DayNum = 132):
    """Build the feature row for a matchup between two teams.

    Each team's averaged stats (games before ``DayNum`` in season ``Year``)
    are fetched with ``team_regular_season``; team1's columns get the prefix
    ``W_`` and team2's the prefix ``L_``. The prefixes only label the two
    argument positions; this function does not check who actually won.

    A boolean ``fav_win`` column is appended: it is True where team1's
    ``OrdinalRank`` from ``get_KenPom`` is strictly lower than team2's.

    Returns a pandas.DataFrame. The three pieces are aligned on their index,
    so any index label present in only some of them yields NaN in the others.
    """
    team1 = team_regular_season(team1_id,Year,DayNum)
    team2 = team_regular_season(team2_id,Year,DayNum)

    team1Pom = get_KenPom(team1_id, Year)
    team2Pom = get_KenPom(team2_id, Year)

    # Name the column directly instead of renaming whatever sits at a fixed
    # position (previously index 106), so the result stays correct if the
    # number of stat columns ever changes.
    fav_win = team1Pom.lt(team2Pom).to_frame('fav_win')

    return pd.concat([team1.add_prefix('W_'), team2.add_prefix('L_'), fav_win], axis = 1)


In [9]:
# Matchup features for teams 1181 and 1314, using only 2020 games played
# before day 10.
duke_unc = single_game(team1_id=1181, team2_id=1314, Year=2020, DayNum=10)
duke_unc


Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win
0,1181,87.333333,34.0,69.0,7.0,21.333333,12.333333,18.333333,10.333333,29.333333,...,0.418107,0.178571,0.388477,39.5,1.647059,0.571429,0.083647,0.0,0.100666,True


In [10]:
# Same matchup with the default season (2020), now using games played before
# day 100. This rebinds `duke_unc`.
duke_unc = single_game(team1_id=1181, team2_id=1314, DayNum=100)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win
0,1181,82.583333,30.125,62.791667,6.958333,19.833333,15.375,21.958333,12.875,26.958333,...,0.517736,0.287568,0.358779,34.916667,1.122807,0.528053,0.082791,0.063984,0.146231,True


In [24]:
def create_season_df(df, season, min_day = 12):
    """Build one feature row per game of ``season`` from a results frame.

    For every game in ``df`` with ``Season == season`` and
    ``DayNum >= min_day``, ``single_game`` is called with the winner as team1,
    the loser as team2 and the game's own ``DayNum`` as the cutoff, so each
    row only uses games played before that day. The game's ``WLoc`` value is
    attached as an extra column.

    Parameters
    ----------
    df : results frame with ``Season``, ``DayNum``, ``WTeamID``, ``LTeamID``
        and ``WLoc`` columns.
    season : season to process.
    min_day : earliest ``DayNum`` to include (default 12, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame of all games stacked together, with any row containing a
    NaN removed. An empty DataFrame is returned when no game matches.
    """
    season_df = df[(df['Season'] == season) & (df['DayNum'] >= min_day)]

    season_games = []
    matchups = zip(season_df.WTeamID, season_df.LTeamID, season_df.Season,
                   season_df.DayNum, season_df.WLoc)

    for winner_id, loser_id, year, day_num, wloc in matchups:
        game = single_game(winner_id, loser_id, Year = year, DayNum = day_num)
        # Attach this game's own location. Concatenating the whole WLoc column
        # onto the game frame aligns on index, which gives every game the
        # location of the season's first row and builds a full-season-sized
        # frame for each game.
        season_games.append(game.assign(WLoc = wloc))

    # pd.concat raises on an empty list, so handle "no games" explicitly.
    if not season_games:
        return pd.DataFrame()

    return pd.concat(season_games, axis = 0).dropna()

In [27]:
# Build the 2018 regular-season feature table and report how long it took.
# (Original note on this cell: "7.5 seconds to run".)
start_time = time.time()

regularseason18 = create_season_df(df=regularseason, season=2018)

end_time = time.time()
print('Execution time = %.6f seconds' % (end_time - start_time))

NameError: name 'regularseason18' is not defined

In [26]:
# Display the 2018 tournament feature table.
# NOTE(review): `postseason18` is only assigned in a later cell
# (`postseason18 = create_season_df(postseason, 2018)`), so this cell raises
# NameError on a top-to-bottom run from a fresh kernel.
postseason18

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_FTr,L_Opp_3Pr,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_%,L_Opp_Stl_%,L_Opp_Blk_%,L_Opp_TO_r,fav_win,WLoc
0,1347.0,66.937500,23.281250,54.843750,7.656250,21.812500,12.718750,17.343750,10.812500,23.000000,...,0.320964,0.297740,36.333333,1.098985,0.470652,0.079447,0.046210,0.141454,True,N
0,1382.0,77.906250,26.375000,57.843750,7.750000,19.468750,17.406250,23.093750,10.343750,25.218750,...,0.295355,0.347356,36.343750,1.286957,0.511521,0.070952,0.041242,0.128880,False,N
0,1393.0,67.545455,22.757576,54.424242,5.848485,18.181818,16.181818,21.848485,12.090909,25.333333,...,0.323529,0.345791,36.580645,1.012959,0.571255,0.064932,0.058351,0.176692,False,N
0,1411.0,77.647059,25.823529,58.500000,7.823529,21.529412,18.176471,25.294118,10.294118,25.882353,...,0.343750,0.327641,33.806452,1.177143,0.542105,0.062899,0.051587,0.143761,True,N
0,1104.0,72.352941,25.294118,55.205882,6.382353,19.676471,15.382353,22.882353,10.029412,26.264706,...,0.287206,0.378633,34.500000,1.060096,0.541769,0.068210,0.066814,0.160478,False,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1242.0,81.500000,30.117647,60.529412,10.147059,25.205882,11.117647,15.882353,9.558824,25.647059,...,0.220179,0.332505,32.727273,1.193955,0.556992,0.068604,0.043542,0.144900,False,N
0,1437.0,87.058824,31.029412,61.529412,11.411765,28.676471,13.588235,17.617647,9.382353,26.205882,...,0.380258,0.328733,31.757576,0.682635,0.476323,0.074531,0.064516,0.192245,True,N
0,1276.0,73.647059,26.647059,57.235294,8.970588,24.705882,11.382353,17.323529,8.647059,24.676471,...,0.257370,0.324300,30.687500,0.881517,0.510288,0.089648,0.059038,0.175713,True,N
0,1437.0,87.058824,31.029412,61.529412,11.411765,28.676471,13.588235,17.617647,9.382353,26.205882,...,0.248095,0.342922,35.147059,0.975113,0.485908,0.070441,0.058795,0.158453,True,N


In [None]:
# One tournament feature table per season, 2012 through 2019.
postseason12 = create_season_df(df=postseason, season=2012)
postseason13 = create_season_df(df=postseason, season=2013)
postseason14 = create_season_df(df=postseason, season=2014)
postseason15 = create_season_df(df=postseason, season=2015)
postseason16 = create_season_df(df=postseason, season=2016)
postseason17 = create_season_df(df=postseason, season=2017)
postseason18 = create_season_df(df=postseason, season=2018)
postseason19 = create_season_df(df=postseason, season=2019)


In [None]:
# Build the 2012 regular-season feature table and report how long it took.
start_time = time.time()

regularseason12 = create_season_df(df=regularseason, season=2012)

end_time = time.time()
print('Execution time = %.6f seconds' % (end_time - start_time))

In [None]:
# Regular-season feature table for 2013.
regularseason13 = create_season_df(df=regularseason, season=2013)

In [None]:
# Regular-season feature table for 2014.
regularseason14 = create_season_df(df=regularseason, season=2014)

In [None]:
# Regular-season feature table for 2015.
regularseason15 = create_season_df(df=regularseason, season=2015)

In [None]:
# Regular-season feature table for 2016.
regularseason16 = create_season_df(df=regularseason, season=2016)

In [None]:
# Regular-season feature table for 2017.
regularseason17 = create_season_df(df=regularseason, season=2017)

In [None]:
# Regular-season feature table for 2018 (the timing cell earlier in the
# notebook assigns the same name with the same call).
regularseason18 = create_season_df(df=regularseason, season=2018)

In [None]:
# Regular-season feature table for 2019.
regularseason19 = create_season_df(df=regularseason, season=2019)

In [None]:
# Regular-season feature table for 2020.
regularseason20 = create_season_df(df=regularseason, season=2020)

In [None]:
# Write each tournament feature table to ./data as CSV.
# to_csv does not create missing directories, so make sure the target folder
# exists before the first write instead of failing after the tables are built.
os.makedirs('./data', exist_ok=True)

postseason12.to_csv('./data/postseason12.csv')
postseason13.to_csv('./data/postseason13.csv')
postseason14.to_csv('./data/postseason14.csv')
postseason15.to_csv('./data/postseason15.csv')
postseason16.to_csv('./data/postseason16.csv')
postseason17.to_csv('./data/postseason17.csv')
postseason18.to_csv('./data/postseason18.csv')
postseason19.to_csv('./data/postseason19.csv')

In [None]:
# Write each regular-season feature table to ./data as CSV.
# to_csv does not create missing directories, so make sure the target folder
# exists before the first write instead of failing after the tables are built.
os.makedirs('./data', exist_ok=True)

regularseason12.to_csv('./data/regularseason12.csv')
regularseason13.to_csv('./data/regularseason13.csv')
regularseason14.to_csv('./data/regularseason14.csv')
regularseason15.to_csv('./data/regularseason15.csv')
regularseason16.to_csv('./data/regularseason16.csv')
regularseason17.to_csv('./data/regularseason17.csv')
regularseason18.to_csv('./data/regularseason18.csv')
regularseason19.to_csv('./data/regularseason19.csv')
regularseason20.to_csv('./data/regularseason20.csv')