In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

In [2]:
# Load the detailed regular-season and tournament box scores plus the Massey ordinal rankings.
regularseason = pd.read_csv('MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')
postseason = pd.read_csv('MDataFiles_Stage1/MNCAATourneyDetailedResults.csv')
rankings = pd.read_csv('MDataFiles_Stage1/MMasseyOrdinals.csv')

# Keep only the 'POM' ranking system, restricted to rankings published on day 128 or later.
# Every qualifying ranking day is kept, so a team can have more than one row per season.
kenpom = rankings.loc[rankings['SystemName'].eq('POM') & rankings['RankingDayNum'].ge(128)]

In [3]:
def add_rate_cols(df, ft_weight=.475):
    """Add possession and rate statistics to a team/opponent box-score frame.

    The frame is modified in place (24 new columns are appended) and is also
    returned, so both ``add_rate_cols(df)`` and ``df = add_rate_cols(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the team columns ``Score, FGM, FGA, FGM3, FGA3, FTA, OR,
        DR, Ast, TO, Stl, Blk`` and the same columns prefixed with ``Opp_``.
    ft_weight : float, default 0.475
        Weight applied to free-throw attempts wherever attempts are converted
        into shooting possessions.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Zero denominators are not special-cased; pandas yields ``inf``/``NaN``.
    """
    # Shooting possessions (field-goal attempts plus weighted free throws),
    # shared by most of the rates below.
    shots = df['FGA'] + (ft_weight * df['FTA'])
    opp_shots = df['Opp_FGA'] + (ft_weight * df['Opp_FTA'])

    # Possessions are estimated for each side and then averaged. The original
    # code multiplied the team-only estimate by .5, which reported half of the
    # real possession count.
    team_poss = shots - df['OR'] + df['TO']
    opp_poss = opp_shots - df['Opp_OR'] + df['Opp_TO']
    df['possessions'] = .5 * (team_poss + opp_poss)

    # --- team shooting ---
    df['eFG'] = (df['FGM'] + (.5 * df['FGM3'])) / df['FGA']
    df['TSpct'] = df['Score'] / (2 * shots)
    df['FTR'] = df['FTA'] / df['FGA']
    # NOTE(review): the denominator includes weighted free throws rather than
    # FGA alone; kept as-is -- confirm this is the intended definition.
    df['3PAR'] = df['FGA3'] / shots

    # --- team rebounding ---
    df['ORpct'] = df['OR'] / (df['OR'] + df['Opp_DR'])
    df['DRpct'] = df['DR'] / (df['DR'] + df['Opp_OR'])
    df['REBpct'] = (df['OR'] + df['DR']) / (df['OR'] + df['Opp_DR'] + df['Opp_OR'] + df['DR'])
    df['TR'] = (df['OR'] + df['DR'])

    # --- team ball handling and defense ---
    df['ATOr'] = df['Ast'] / df['TO']
    df['Ast_pct'] = df['Ast'] / df['FGM']
    df['Stl_pct'] = df['Stl'] / (opp_shots + df['Opp_TO'])
    df['Blk_pct'] = df['Blk'] / (df['Opp_FGA'])
    df['TO_ratio'] = df['TO'] / (shots + df['TO'])

    # --- opponent mirror of the columns above ---
    df['Opp_eFG'] = (df['Opp_FGM'] + (.5 * df['Opp_FGM3'])) / df['Opp_FGA']
    df['Opp_TSpct'] = df['Opp_Score'] / (2 * opp_shots)
    df['Opp_FTR'] = df['Opp_FTA'] / df['Opp_FGA']
    # Column is named 'Opp_3PR' (not 'Opp_3PAR'); the name is kept because
    # downstream frames already use it.
    df['Opp_3PR'] = df['Opp_FGA3'] / opp_shots
    df['Opp_TR'] = (df['Opp_OR'] + df['Opp_DR'])
    df['Opp_ATOr'] = df['Opp_Ast'] / df['Opp_TO']
    df['Opp_Ast_pct'] = df['Opp_Ast'] / df['Opp_FGM']
    df['Opp_Stl_pct'] = df['Opp_Stl'] / (shots + df['TO'])
    df['Opp_Blk_pct'] = df['Opp_Blk'] / (df['FGA'])
    df['Opp_TO_ratio'] = df['Opp_TO'] / (opp_shots + df['Opp_TO'])

    return df

In [4]:
def team_regular_season(team_id, Season=None, DayNum=None, average=True):
    """Collect one team's regular-season box scores, from that team's point of view.

    Reads the module-level ``regularseason`` frame, gathers every game the team
    won or lost, and lines up the team's own stats next to its opponent's stats
    (opponent columns are prefixed with ``Opp_``). Rate columns are then added
    with ``add_rate_cols``.

    Parameters
    ----------
    team_id : int
        Value matched against ``WTeamID`` / ``LTeamID``.
    Season : int, optional
        Only games from this season are used. ``None`` (or the builtin ``all``,
        which was the historical default) means no season filter.
    DayNum : int, optional
        Only games played strictly before this day are used. ``None`` (or the
        builtin ``all``) means no day filter.
    average : bool, default True
        If true, return a single row holding the mean of every column (rates
        are computed from the averaged counts). Otherwise return one row per
        game with per-game rates.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, the team's stats, ``Opp_TeamID``, the opponent's
        stats, and the rate columns. Empty when the team has no matching games.
    """
    games = regularseason

    # Build the season/day window. The old defaults used the builtin `all` as a
    # sentinel, which made `==` match nothing and `<` raise a TypeError.
    in_window = pd.Series(True, index=games.index)
    if Season is not None and Season is not all:
        in_window &= games['Season'] == Season
    if DayNum is not None and DayNum is not all:
        in_window &= games['DayNum'] < DayNum

    teamwins = games[in_window & (games['WTeamID'] == team_id)]
    teamlosses = games[in_window & (games['LTeamID'] == team_id)]

    # Select columns by prefix. 'WLoc' is excluded: it is not a stat, and the
    # old substring filter picked it up for BOTH sides because it contains 'L'.
    winner_cols = [c for c in games.columns if c.startswith('W') and c != 'WLoc']
    loser_cols = [c for c in games.columns if c.startswith('L')]

    def _side(frame, cols):
        # Drop exactly the one-character W/L prefix so both sides share names.
        return frame[cols].rename(columns=lambda name: name[1:])

    teamoffense = pd.concat([_side(teamwins, winner_cols), _side(teamlosses, loser_cols)])
    teamopponents = pd.concat([_side(teamwins, loser_cols), _side(teamlosses, winner_cols)])

    # Both frames keep the original game index, so they align row by row.
    teamtotal = pd.concat([teamoffense, teamopponents.add_prefix('Opp_')], axis=1)

    if average:
        # NOTE: 'Opp_TeamID' is averaged along with the stats; it is kept only
        # so the column layout stays the same for existing callers.
        teamtotal = teamtotal.groupby('TeamID').mean().reset_index()

    # add_rate_cols appends its columns in place.
    add_rate_cols(teamtotal)

    return teamtotal


In [5]:
# Time one averaged team profile: team 1266, season 2020, games before day 132.
start = time.time()
marquette = team_regular_season(1266, 2020, 132, True)
end = time.time()
print('Execution time = %.6f seconds' % (end-start))

Execution time = 0.045843 seconds


In [6]:
def get_KenPom(team_id, Year):
    """Look up a team's rank(s) in the module-level ``kenpom`` frame.

    Parameters
    ----------
    team_id : int
        Value matched against the ``TeamID`` column.
    Year : int
        Value matched against the ``Season`` column.

    Returns
    -------
    pandas.Series
        The ``OrdinalRank`` values of every matching row, re-indexed from 0.
        The Series is empty when nothing matches and holds several entries
        when ``kenpom`` has more than one row for that team and season.
    """
    is_team = kenpom['TeamID'] == team_id
    is_season = kenpom['Season'] == Year
    return kenpom.loc[is_team & is_season, 'OrdinalRank'].reset_index(drop=True)

# Example lookup: rank(s) for team 1181 in season 2020.
get_KenPom(1181, 2020)

0    5
Name: OrdinalRank, dtype: int64

In [7]:
def single_game(team1_id, team2_id, Year = 2020, DayNum = 132):
    """Build the one-row feature frame for a single matchup.

    Each team's averaged profile comes from ``team_regular_season`` (games of
    ``Year`` played before ``DayNum``). Team 1's columns are prefixed ``W_``
    and team 2's ``L_``; a final ``fav_win`` column records whether team 1 has
    the numerically lower rank from ``get_KenPom``.

    Parameters
    ----------
    team1_id, team2_id : int
        Team identifiers; team 1 takes the ``W_`` columns, team 2 the ``L_``.
    Year : int, default 2020
        Season used for both the box-score profile and the ranking lookup.
    DayNum : int, default 132
        Only games before this day feed the profiles.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0). Profile columns are NaN for a team with no
        games in the window; ``fav_win`` is NaN when neither team has a rank.
    """
    team1 = team_regular_season(team1_id,Year,DayNum)
    team2 = team_regular_season(team2_id,Year,DayNum)

    team1Pom = get_KenPom(team1_id, Year)
    team2Pom = get_KenPom(team2_id, Year)

    # get_KenPom may return several ranks per team. Comparing whole Series
    # produced extra rows whose feature columns were all NaN, so only the
    # first rank of each team is compared and exactly one row comes out.
    # The Series is named up front instead of renaming column position 108
    # afterwards, which silently depended on the exact number of columns.
    fav_win = team1Pom.head(1).lt(team2Pom.head(1)).rename('fav_win')

    game = pd.concat([team1.add_prefix('W_'), team2.add_prefix('L_'), fav_win], axis = 1)

    return game


In [8]:
# Feature row for team 1181 vs team 1314 built from games played before day 10
# (Year is left at single_game's default).
duke_unc = single_game(1181, 1314, DayNum = 10)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTR,L_Opp_3PR,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_pct,L_Opp_Stl_pct,L_Opp_Blk_pct,L_Opp_TO_ratio,fav_win
0,1181,87.333333,34.0,69.0,7.0,21.333333,12.333333,18.333333,10.333333,29.333333,...,0.418107,0.178571,0.388477,39.5,1.647059,0.571429,0.083647,0.0,0.100666,True


In [9]:
# Same matchup, now built from games played before day 100; this rebinds duke_unc.
duke_unc = single_game(1181, 1314, DayNum = 100)
duke_unc

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTR,L_Opp_3PR,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_pct,L_Opp_Stl_pct,L_Opp_Blk_pct,L_Opp_TO_ratio,fav_win
0,1181,82.583333,30.125,62.791667,6.958333,19.833333,15.375,21.958333,12.875,26.958333,...,0.517736,0.287568,0.358779,34.916667,1.122807,0.528053,0.082791,0.063984,0.146231,True


In [10]:
def matchups(df, season):
    """List the games of one season played on day 12 or later.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Season``, ``DayNum``, ``WTeamID`` and ``LTeamID``.
    season : int
        Season to keep.

    Returns
    -------
    list of tuple
        One ``(WTeamID, LTeamID, DayNum, Season)`` tuple per game, in the
        frame's row order.
    """
    wanted = df['Season'].eq(season) & df['DayNum'].ge(12)
    played = df.loc[wanted]

    return list(zip(played['WTeamID'], played['LTeamID'], played['DayNum'], played['Season']))

In [16]:
def create_season_df(df, season, min_day_num=12):
    """Build the matchup feature frame for every game of one season.

    For each game row of ``season`` with ``DayNum >= min_day_num`` this calls
    ``single_game`` with the winner as team 1 and the loser as team 2, using
    the game's own day as the cut-off, and stacks the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Game results with ``Season``, ``DayNum``, ``WTeamID`` and ``LTeamID``.
    season : int
        Season to process.
    min_day_num : int, default 12
        Games played before this day are skipped.

    Returns
    -------
    pandas.DataFrame
        The ``single_game`` frames concatenated row-wise. The index is not
        reset, so rows keep whatever index ``single_game`` gave them. An empty
        frame is returned when no game matches (``pd.concat`` would raise on
        an empty list).

    Notes
    -----
    ``single_game`` runs once per game, which is slow for a full season.
    """
    season_df = df[(df['Season'] == season) & (df['DayNum'] >= min_day_num)]

    season_games = [
        single_game(winner_id, loser_id, Year=year, DayNum=day_num)
        for winner_id, loser_id, year, day_num in zip(
            season_df['WTeamID'], season_df['LTeamID'], season_df['Season'], season_df['DayNum']
        )
    ]

    if not season_games:
        return pd.DataFrame()

    return pd.concat(season_games, axis=0)

In [17]:
### Warning: slow cell -- the run recorded below took about 413 seconds (roughly 7 minutes).
start_time = time.time()

regular_18 = create_season_df(regularseason, 2018)

end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))

Execution time = 412.651098 seconds


In [23]:
# Preview the 2018 game rows that contain no missing values (the result is displayed, not stored).
regular_18.dropna()

Unnamed: 0,W_TeamID,W_Score,W_FGM,W_FGA,W_FGM3,W_FGA3,W_FTM,W_FTA,W_OR,W_DR,...,L_Opp_TSpct,L_Opp_FTR,L_Opp_3PR,L_Opp_TR,L_Opp_ATOr,L_Opp_Ast_pct,L_Opp_Stl_pct,L_Opp_Blk_pct,L_Opp_TO_ratio,fav_win
0,1181.0,97.000000,41.000000,74.000000,12.000000,25.000000,3.000000,11.000000,15.000000,33.000000,...,0.474488,0.348485,0.194995,38.000000,1.214286,0.653846,0.130101,0.142857,0.153973,True
0,1112.0,101.000000,34.000000,57.000000,5.000000,13.000000,28.000000,32.000000,12.000000,31.000000,...,0.612967,0.744681,0.361493,34.000000,1.272727,0.636364,0.053050,0.120000,0.147404,True
0,1116.0,95.000000,36.000000,63.000000,9.000000,22.000000,14.000000,22.000000,7.000000,28.000000,...,0.605364,0.588235,0.183908,38.000000,0.384615,0.200000,0.048736,0.060606,0.166134,True
0,1130.0,85.000000,33.000000,68.000000,4.000000,18.000000,15.000000,19.000000,14.000000,37.000000,...,0.658404,0.321429,0.356313,41.000000,1.384615,0.562500,0.094244,0.053571,0.167634,True
0,1143.0,66.000000,19.000000,62.000000,4.000000,16.000000,24.000000,39.000000,14.000000,32.000000,...,0.610090,0.433962,0.297223,41.000000,0.812500,0.500000,0.079708,0.066667,0.200188,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1153.0,75.030303,26.454545,58.363636,7.393939,20.636364,14.727273,21.393939,13.181818,25.909091,...,0.504907,0.411458,0.317080,31.593750,0.832941,0.518302,0.063188,0.040569,0.170633,True
0,1172.0,77.032258,27.612903,56.870968,10.838710,27.612903,10.967742,13.806452,6.935484,25.903226,...,0.545835,0.409371,0.252916,33.806452,0.717742,0.485014,0.069337,0.049895,0.203830,False
0,1209.0,74.419355,26.451613,57.419355,9.258065,23.806452,12.258065,18.064516,8.677419,24.903226,...,0.524611,0.346550,0.386439,35.843750,0.937634,0.540273,0.082115,0.041365,0.170537,True
0,1246.0,76.727273,27.121212,57.818182,5.393939,15.060606,17.090909,24.666667,12.121212,26.666667,...,0.507752,0.380601,0.308398,34.406250,0.773333,0.484680,0.070653,0.056131,0.177744,False


In [26]:
# Count how many of the complete (NaN-free) 2018 rows are exact duplicates of an earlier row.
regular_18.dropna().duplicated().value_counts()

False    5197
dtype: int64

In [29]:
# Raw 2018 regular-season results from day 12 onward, i.e. the games fed to create_season_df.
regularseason.loc[regularseason['Season'].eq(2018) & regularseason['DayNum'].ge(12)]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
76749,2018,12,1103,67,1156,57,H,0,20,66,...,19,10,26,4,37,7,16,5,4,20
76750,2018,12,1133,68,1237,53,H,0,25,56,...,20,17,20,13,22,8,12,6,1,17
76751,2018,12,1138,80,1145,75,H,0,30,73,...,21,17,19,7,25,13,19,4,6,20
76752,2018,12,1140,91,1290,61,H,0,33,56,...,17,7,10,9,19,11,6,3,0,19
76753,2018,12,1181,99,1430,69,H,0,39,73,...,25,13,16,16,23,13,19,5,0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82036,2018,132,1153,56,1222,55,N,0,20,46,...,23,10,12,9,20,12,12,8,2,19
82037,2018,132,1172,58,1348,57,N,0,19,50,...,23,9,17,11,26,12,14,3,4,16
82038,2018,132,1209,74,1426,61,N,0,25,56,...,25,23,28,19,24,9,13,1,5,16
82039,2018,132,1246,77,1397,72,N,0,25,50,...,27,16,20,18,16,14,10,4,3,21
