In [81]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

In [2]:
# Read the three CSV inputs from the MDataFiles_Stage1 directory into DataFrames.
# `regularseason` and `rankings` are read as module-level globals by the
# functions defined in later cells; `postseason` is not referenced in the
# cells visible here.
regularseason = pd.read_csv('MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')
postseason = pd.read_csv('MDataFiles_Stage1/MNCAATourneyDetailedResults.csv')
rankings = pd.read_csv('MDataFiles_Stage1/MMasseyOrdinals.csv')

In [68]:
def add_rate_cols(df):
    """Add derived rate/ratio columns for a team and its opponent, in place.

    ``df`` must hold the box-score columns ``Score, FGM, FGA, FGM3, FGA3, FTA,
    OR, DR, Ast, TO, Stl, Blk`` plus the same names prefixed with ``Opp_``.
    For each side the same 14 columns are appended (``Opp_``-prefixed for the
    opponent side), own side first:

    ``possessions, eFG, TSpct, FTR, 3PR, ORpct, DRpct, REBpct, TR, ATOr,
    Ast_pct, Stl_pct, Blk_pct, TO_ratio``

    The opponent side previously lacked ``ORpct``/``DRpct``/``REBpct``; both
    sides are now produced by the same code so they cannot drift apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the raw columns listed above. It is mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Zero denominators are not special-cased; pandas yields ``inf``/``NaN``
    for those rows instead of raising.
    """
    ft_weight = .475  # weight applied to FTA wherever shot attempts are combined

    # First pass: the team's own numbers against the opponent's.
    # Second pass: the roles are swapped to fill the Opp_* columns.
    for me, them in (('', 'Opp_'), ('Opp_', '')):
        # "Shooting plays": field-goal attempts plus weighted free-throw attempts.
        my_plays = df[me + 'FGA'] + (ft_weight * df[me + 'FTA'])
        their_plays = df[them + 'FGA'] + ft_weight * df[them + 'FTA']

        my_or, my_dr = df[me + 'OR'], df[me + 'DR']
        their_or, their_dr = df[them + 'OR'], df[them + 'DR']

        df[me + 'possessions'] = .5 * (my_plays - my_or + df[me + 'TO'])
        df[me + 'eFG'] = (df[me + 'FGM'] + (.5 * df[me + 'FGM3'])) / df[me + 'FGA']
        df[me + 'TSpct'] = df[me + 'Score'] / (2 * my_plays)
        df[me + 'FTR'] = df[me + 'FTA'] / df[me + 'FGA']
        df[me + '3PR'] = df[me + 'FGA3'] / my_plays

        # Rebounding shares are relative to the rebounds available at each end.
        df[me + 'ORpct'] = my_or / (my_or + their_dr)
        df[me + 'DRpct'] = my_dr / (my_dr + their_or)
        df[me + 'REBpct'] = (my_or + my_dr) / (my_or + their_dr + their_or + my_dr)
        df[me + 'TR'] = (my_or + my_dr)

        df[me + 'ATOr'] = df[me + 'Ast'] / df[me + 'TO']
        df[me + 'Ast_pct'] = df[me + 'Ast'] / df[me + 'FGM']

        # Steals and blocks are scaled by what the *other* side attempted.
        df[me + 'Stl_pct'] = df[me + 'Stl'] / (their_plays + df[them + 'TO'])
        df[me + 'Blk_pct'] = df[me + 'Blk'] / (df[them + 'FGA'])
        df[me + 'TO_ratio'] = df[me + 'TO'] / (my_plays + df[me + 'TO'])

    return df

In [77]:
def team_regular_season(team_id,Season = all, DayNum = all, average = True):
    """Build one team's regular-season box-score table from ``regularseason``.

    Every game the team played is turned into a row holding the team's own
    numbers (winner columns for wins, loser columns for losses, with the
    ``W``/``L`` prefix stripped) next to the opponent's numbers (prefixed
    ``Opp_``). Rate columns are then appended by ``add_rate_cols``.

    Parameters
    ----------
    team_id : int
        Value matched against ``WTeamID`` / ``LTeamID``.
    Season : int, optional
        Keep only games of this season. The default (the builtin ``all``,
        kept as a sentinel for backward compatibility) or ``None`` means
        "every season". Previously the default was compared against the
        column directly and could never match.
    DayNum : int, optional
        Keep only games with ``DayNum`` strictly below this value. The
        default (``all``) or ``None`` means "no cut-off"; previously the
        default raised ``TypeError`` on the ``<`` comparison.
    average : bool, default True
        If truthy, collapse the games into a single row of per-game means
        (grouped by ``TeamID``) before the rate columns are computed. If
        falsy, keep one row per game.

    Returns
    -------
    pandas.DataFrame
        The per-game or averaged table including the rate columns.
    """
    def _unset(value):
        # `all` is the historical default of this function; treat it like None.
        return value is None or value is all

    games = regularseason
    if not _unset(Season):
        games = games[games['Season'] == Season]
    if not _unset(DayNum):
        games = games[games['DayNum'] < DayNum]

    teamwins = games[games['WTeamID'] == team_id]
    teamlosses = games[games['LTeamID'] == team_id]

    def _side(frame, letter):
        # Columns whose name contains `letter`, with leading `letter`s removed.
        # NOTE: like='L' also matches 'WLoc', which keeps its name because it
        # does not start with 'L'. That quirk is preserved so the returned
        # column set is unchanged for existing callers.
        picked = frame.filter(like = letter)
        return picked.rename(columns = lambda name: name.lstrip(letter))

    teamoffense = pd.concat([_side(teamwins, 'W'), _side(teamlosses, 'L')])
    teamopponents = pd.concat([_side(teamwins, 'L'), _side(teamlosses, 'W')])

    # Rows share the original game index, so the two halves align one-to-one.
    teamtotal = pd.concat([teamoffense, teamopponents.add_prefix('Opp_')], axis = 1)

    if average:
        # numeric_only=True drops the string location columns explicitly;
        # recent pandas raises on them instead of discarding them silently.
        teamtotal = teamtotal.groupby('TeamID').mean(numeric_only = True).reset_index()

    add_rate_cols(teamtotal)

    return teamtotal


In [79]:
# Averaged 2020 profile for team 1266 (presumably Marquette, going by the
# variable name) using games with DayNum < 132; displayed as the cell output.
marquette = team_regular_season(1266, 2020, 132, True)
marquette

Unnamed: 0,TeamID,Score,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,...,Opp_3PR,Opp_ORpct,Opp_DRpct,Opp_REBpct,Opp_TR,Opp_ATOr,Opp_Ast_pct,Opp_Stl_pct,Opp_Blk_pct,Opp_TO_ratio
0,1266,77.766667,25.3,58.866667,10.033333,26.233333,17.133333,23.066667,10.7,29.266667,...,0.298768,0.258446,0.698874,0.467111,35.033333,1.200608,0.521108,0.080951,0.052095,0.132957


In [57]:
# Subset of `rankings`: rows of the 'POM' system for season 2020 with
# RankingDayNum >= 128. Note the season is hard-coded to 2020 here.
kenpom = rankings[(rankings['SystemName'] == 'POM') & (rankings['RankingDayNum'] >= 128) & (rankings['Season'] == 2020)]

def get_KenPom(team_id, Year):
    """Return the late-season 'POM' ordinal rank(s) of a team for one season.

    Parameters
    ----------
    team_id : int
        Value matched against the ``TeamID`` column.
    Year : int
        Season to look up. Previously this argument was ignored and the
        2020-only ``kenpom`` table was always used, so other seasons silently
        received 2020 ranks.

    Returns
    -------
    pandas.Series
        ``OrdinalRank`` values with a fresh 0..n-1 index; empty when the team
        has no matching ranking row.
    """
    # Fast path: the pre-filtered global table, when it covers the season.
    season_ranks = kenpom[kenpom['Season'] == Year]
    if season_ranks.empty:
        # Fall back to the full rankings table with the same filters that
        # define `kenpom` (POM system, RankingDayNum >= 128). This scans the
        # whole table on every call, so it is slower than the fast path.
        season_ranks = rankings[(rankings['SystemName'] == 'POM')
                                & (rankings['RankingDayNum'] >= 128)
                                & (rankings['Season'] == Year)]

    rank = season_ranks[season_ranks['TeamID'] == team_id]
    # reset_index returns a new frame; no in-place mutation of a filtered slice.
    return rank.reset_index(drop = True)['OrdinalRank']

# Spot check: POM ordinal rank series for team 1181.
get_KenPom(1181, 2020)

0    5
Name: OrdinalRank, dtype: int64

In [60]:
def single_game(team1_id, team2_id, Year = 2020, DayNum = 132):
    """Assemble one matchup row: both teams' season profiles plus a rank flag.

    Parameters
    ----------
    team1_id, team2_id : int
        Team identifiers passed to ``team_regular_season`` and ``get_KenPom``.
    Year : int, default 2020
        Season used for both the profiles and the rankings.
    DayNum : int, default 132
        Only games before this day feed the profiles.

    Returns
    -------
    pandas.DataFrame
        ``team1``'s averaged columns, then ``team2``'s (same column names, so
        names repeat), then a final ``fav_win`` column that is True when
        team1's rank is numerically lower than team2's.
    """
    team1 = team_regular_season(team1_id,Year,DayNum)
    team2 = team_regular_season(team2_id,Year,DayNum)

    team1Pom = get_KenPom(team1_id, Year)
    team2Pom = get_KenPom(team2_id, Year)

    # Name the flag before concatenating instead of renaming by a hard-coded
    # position (formerly column 116), which broke whenever the number of
    # profile columns changed.
    fav_win = team1Pom.lt(team2Pom).rename('fav_win')

    game = pd.concat([team1, team2, fav_win], axis = 1)

    return game


In [61]:
# Matchup row for teams 1181 and 1314 (presumably Duke and UNC, going by the
# variable name) built from games before day 10 of the default 2020 season.
duke_unc = single_game(1181, 1314, DayNum = 10)
# Leftover debugging: inspect the column names around the fav_win position.
#print(duke_unc.columns[117])
#print(duke_unc.columns[116])
duke_unc

Unnamed: 0,TeamID,Score,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,...,Opp_ORpct,Opp_DRpct,Opp_REBpct,Opp_TR,Opp_ATOr,Opp_Ast_pct,Opp_Stl_pct,Opp_Blk_pct,Opp_TO_ratio,fav_win
0,1181,87.333333,34.0,69.0,7.0,21.333333,12.333333,18.333333,10.333333,29.333333,...,0.202128,0.666667,0.429348,39.5,1.647059,0.571429,0.083647,0.0,0.100666,True


In [62]:
# Same matchup as the previous cell, now using games before day 100; this
# rebinds `duke_unc`.
duke_unc = single_game(1181, 1314, DayNum = 100)
duke_unc

Unnamed: 0,TeamID,Score,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,...,Opp_ORpct,Opp_DRpct,Opp_REBpct,Opp_TR,Opp_ATOr,Opp_Ast_pct,Opp_Stl_pct,Opp_Blk_pct,Opp_TO_ratio,fav_win
0,1181,82.583333,30.125,62.791667,6.958333,19.833333,15.375,21.958333,12.875,26.958333,...,0.230516,0.64876,0.445982,34.916667,1.122807,0.528053,0.082791,0.063984,0.146231,True


In [63]:
def season_matchups(df, Season):
    """List the games of one season as (WTeamID, LTeamID, DayNum, Season) tuples.

    Only rows of ``df`` whose ``Season`` equals the argument and whose
    ``DayNum`` is at least 12 are included; tuples follow the row order of
    ``df``.
    """
    in_season = df['Season'] == Season
    from_day_12 = df['DayNum'] >= 12

    selected = df[in_season & from_day_12].reset_index()

    return [
        pairing
        for pairing in zip(
            selected['WTeamID'],
            selected['LTeamID'],
            selected['DayNum'],
            selected['Season'],
        )
    ]

In [64]:
# (WTeamID, LTeamID, DayNum, Season) tuples for the 2020 regular-season games
# with DayNum >= 12.
matchups2020 = season_matchups(regularseason, 2020)


In [65]:
# Number of matchups selected for 2020.
len(matchups2020)

4867

In [67]:
### warning: slow cell, roughly 4.5 minutes to run
# Build one single_game row per 2020 matchup and stack them into a DataFrame.
start_time = time.time()
games2020 = []

for i, matchup in enumerate(matchups2020):
    # Tuple layout is (WTeamID, LTeamID, DayNum, Season), so the winner is
    # always passed as team1.
    winner_id, loser_id = matchup[0], matchup[1]
    day_num, season = matchup[2], matchup[3]
    game = single_game(winner_id, loser_id, Year = season, DayNum = day_num)
    games2020.append(game)

# The name is rebound from the list of one-row frames to the stacked frame.
games2020 = pd.concat(games2020, axis=0)


0 0.13563823699951172
1 0.1107034683227539
2 0.12167620658874512
3 0.10073328018188477
4 0.11724519729614258
5 0.15658140182495117
6 0.12385678291320801
7 0.10172867774963379
8 0.1092231273651123
9 0.11371660232543945
10 0.10472369194030762
11 0.12366962432861328
12 0.10671305656433105
13 0.11568975448608398
14 0.12267589569091797
15 0.10073161125183105
16 0.1107029914855957
17 0.1356372833251953
18 0.13962459564208984
19 0.1007235050201416
20 0.11369585990905762
21 0.11726260185241699
22 0.12534737586975098
23 0.11971163749694824
24 0.11266756057739258
25 0.11369609832763672
26 0.10472393035888672
27 0.11269855499267578
28 0.10271620750427246
29 0.12267041206359863
30 0.12765169143676758
31 0.10072970390319824
32 0.12566637992858887
33 0.1216738224029541
34 0.12765955924987793
35 0.10272574424743652
36 0.12167668342590332
37 0.13164615631103516
38 0.16657733917236328
39 0.11369705200195312
40 0.12167978286743164
41 0.10073208808898926
42 0.1017293930053711
43 0.13322043418884277
44 0.

KeyboardInterrupt: 

In [55]:
# Row count of the stacked games table.
len(games2020)

4867