In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
def from_csv(path):
    csv = pd.read_csv(path)
    return np.asarray(csv)

In [3]:
teams_csv = from_csv('./DataFiles/Teams.csv')

team_ids = {}

for index in range(0, len(teams_csv)):
    team = teams_csv[index]
    team_id = team[0]
    team_name = team[1]
    team_ids[team_id] = team_name

# allows us to map a team id to its name: team_ids[1101] => Abilene Chr

In [7]:
# Season, Day Num, Winning Team ID, Winning Score, Losing Team ID, Losing Score, Winner Location (Home/Away/Neu.),
# Num OT, WFGM (Field Goals made), WFGA (attempted), WFGM3 (three pointers made), WFGA3, WFTM (free throws made),
# WFTA (free throw attempted), WOR (offensive rebounds), WDR (defensive rebounds), WAst (assists), WTO (turnovers),
# WStl (steals), WBlk (blocks), WPF (personal fouls), LFGM, LFGA, LFGM3, LFGA3, LFTM, LFTA, LOR, LDR, LAst, LTO, LStl, 
# LBlk, LPF
game_history = np.concatenate((
    from_csv('./PrelimData2018/RegularSeasonDetailedResults_Prelim2018.csv'),
    from_csv('./DataFiles/NCAATourneyDetailedResults.csv')
))

In [28]:
ncaa_matchups = from_csv('./SampleSubmissionStage2.csv')[:, 0]

In [20]:
# team name, adj offensive efficiency, adj defensive efficiency, adj tempo, luck, adj margin of efficiency, 
# strength of schedule margin, avg opponent offense, avg opponent defense, non-conference, conf, year
kenpom_csv = from_csv("./kenpom.csv")

# transform conf to category
conference_ids = {}
unique_confs = np.unique(kenpom_csv[:, 10])
for index in range(0, len(unique_confs)):
    conference_ids[unique_confs[index]] = index

# year > team > stats
kenpom_team_stats_by_year = {}

for index in range(0, len(kenpom_csv)):
    team, offense, defense, tempo, luck, em, sos, oppoff, oppdef, nonconf, conf, year = kenpom_csv[index]
    
    if (not isinstance(kenpom_team_stats_by_year.get(year), dict)):
        kenpom_team_stats_by_year[year] = {}
    
    kenpom_team_stats_by_year[year][team] = [offense, defense, tempo, luck, em, sos, oppoff, oppdef, nonconf, conference_ids[conf]]
    
# kenpom_team_stats_by_year[2002]['Kent']

[116.7, 96.4, 64.7, 0.008, 20.39, 3.11, 106.3, 103.2, 2.99, 17]

In [24]:
kenpom_x_training_set = []
kenpom_y_training_set = []

for index in range(0, len(game_history)):
    row = game_history[index]
    year = row[0]
    wteam_id = row[2]
    lteam_id = row[4]
    wteam = team_ids[wteam_id]
    lteam = team_ids[lteam_id]
    wloc_str = row[6]

    if wloc_str == 'H':
        wloc = 1
        lloc = 0
    if wloc_str == 'N':
        wloc = 0.5
        lloc = 0.5
    if wloc_str == 'A':
        wloc = 0
        lloc = 1

    if (not isinstance(kenpom_team_stats_by_year.get(year), dict)):
        continue
    if (not isinstance(kenpom_team_stats_by_year[year].get(wteam), list)):
        continue
    if (not isinstance(kenpom_team_stats_by_year[year].get(lteam), list)):
        continue

    woffense, wdefense, wtempo, wluck, wem, wsos, woppoff, woppdef, wnonconf, wconf = kenpom_team_stats_by_year[year][wteam]
    loffense, ldefense, ltempo, lluck, lem, lsos, loppoff, loppdef, lnonconf, lconf = kenpom_team_stats_by_year[year][lteam]
        
    kenpom_x_training_set.append([
        woffense, wdefense, wtempo, wluck, wem, wsos, woppoff, woppdef, wnonconf, wconf, 
        loffense, ldefense, ltempo, lluck, lem, lsos, loppoff, loppdef, lnonconf, lconf,
        wloc
    ])
    kenpom_y_training_set.append(1)
    kenpom_x_training_set.append([
        loffense, ldefense, ltempo, lluck, lem, lsos, loppoff, loppdef, lnonconf, lconf,
        woffense, wdefense, wtempo, wluck, wem, wsos, woppoff, woppdef, wnonconf, wconf, 
        lloc
    ])
    kenpom_y_training_set.append(0)

X_train, X_test, y_train, y_test = train_test_split(kenpom_x_training_set, kenpom_y_training_set)

In [25]:
kenpom_classifier = LogisticRegression(solver = 'lbfgs', penalty='l2')
kenpom_classifier.fit(X_train, y_train)

y_pred = kenpom_classifier.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print("Test set score: {:.2f}".format(accuracy))

Test set score: 0.78


In [26]:
# from sklearn.ensemble import GradientBoostingClassifier
# kenpom_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=100, random_state=0)
# kenpom_classifier.fit(X_train, y_train)
# y_pred = kenpom_classifier.score(X_test, y_test)   

# print(y_pred)

# from sklearn.svm import SVC
# kenpom_classifier = SVC()

In [29]:
kaggle_predictions = [['ID', 'Pred']]

for index in range(0, len(ncaa_matchups)):
    _year, team_a_id, team_b_id = ncaa_matchups[index].split("_")
    team_a = team_ids[int(team_a_id)]
    team_b = team_ids[int(team_b_id)]
    
    aoffense, adefense, atempo, aluck, aem, asos, aoppoff, aoppdef, anonconf, aconf = kenpom_team_stats_by_year[year][team_a]
    boffense, bdefense, btempo, bluck, bem, bsos, boppoff, boppdef, bnonconf, bconf = kenpom_team_stats_by_year[year][team_b]
        
    pred = kenpom_classifier.predict_proba([[
        aoffense, adefense, atempo, aluck, aem, asos, aoppoff, aoppdef, anonconf, aconf,
        boffense, bdefense, btempo, bluck, bem, bsos, boppoff, boppdef, bnonconf, bconf,
        0.5
    ]])[0][1]
    
    kaggle_predictions.append(['_'.join(['2018', team_a_id, team_b_id]), pred])
    
pd.DataFrame(data=kaggle_predictions).to_csv('./kaggle_predictions_for_kenpom_model.csv')

In [30]:
for index in range(0, len(ncaa_matchups)):
    _year, team_a_id, team_b_id = ncaa_matchups[index].split("_")
    team_a = team_ids[int(team_a_id)]
    team_b = team_ids[int(team_b_id)]
    
    aoffense, adefense, atempo, aluck, aem, asos, aoppoff, aoppdef, anonconf, aconf = kenpom_team_stats_by_year[year][team_a]
    boffense, bdefense, btempo, bluck, bem, bsos, boppoff, boppdef, bnonconf, bconf = kenpom_team_stats_by_year[year][team_b]
        
    pred = kenpom_classifier.predict_proba([[
        aoffense, adefense, atempo, aluck, aem, asos, aoppoff, aoppdef, anonconf, aconf,
        boffense, bdefense, btempo, bluck, bem, bsos, boppoff, boppdef, bnonconf, bconf,
        0.5
    ]])[0][1]
    
    print("{} over {} chance {:.3f}%".format(team_a, team_b, pred * 100))

Alabama over Arizona chance 19.520%
Alabama over Arizona St chance 78.947%
Alabama over Arkansas chance 37.253%
Alabama over Auburn chance 62.843%
Alabama over Bucknell chance 64.058%
Alabama over Buffalo chance 79.939%
Alabama over Butler chance 24.911%
Alabama over Cincinnati chance 22.999%
Alabama over Clemson chance 43.663%
Alabama over Col Charleston chance 70.117%
Alabama over Creighton chance 29.317%
Alabama over CS Fullerton chance 95.469%
Alabama over Davidson chance 67.258%
Alabama over Duke chance 16.579%
Alabama over Florida chance 11.579%
Alabama over Florida St chance 25.800%
Alabama over Georgia St chance 81.909%
Alabama over Gonzaga chance 7.096%
Alabama over Houston chance 48.732%
Alabama over Iona chance 77.590%
Alabama over Kansas chance 11.136%
Alabama over Kansas St chance 33.314%
Alabama over Kentucky chance 10.978%
Alabama over Lipscomb chance 85.591%
Alabama over Long Island chance 94.967%
Alabama over Loyola-Chicago chance 72.286%
Alabama over Marshall chance 8