In [1]:
import seaborn as sns
import pandas as pd
import numpy as np 
import re
pd.set_option('display.max_columns', 500)

In [2]:
# Read the raw game table and immediately discard the columns the analysis never uses.
nba = pd.read_csv("../nba.csv").drop(
    columns=["Unnamed: 6", "Start (ET)", "Notes", "Unnamed: 7"]
)


In [3]:
# Clean the Date strings and convert the column to datetime.
#  1. Replace a leading three-letter prefix (presumably the weekday abbreviation --
#     confirm against the raw CSV) with "-". `regex=True` is passed explicitly because
#     pandas 2.0 changed the default of `Series.str.replace` to literal matching, which
#     would silently leave the pattern unmatched. The character class is `[A-Za-z]`
#     rather than `[A-z]`, because the latter also matches `[ \ ] ^ _` and the backtick.
#  2. Turn every space into "-" and strip the leading "-" introduced by step 1.
#  3. Parse the result with `pd.to_datetime`.
# NOTE: not idempotent -- once the column is datetime the `.str` accessor is unavailable,
# so re-running this cell requires reloading `nba` first.
nba["Date"] = pd.to_datetime(
    nba["Date"]
    .str.replace(r"^[A-Za-z]{3}", "-", regex=True)
    .str.replace(" ", "-", regex=False)
    .str.lstrip("-")
)

In [4]:
# Give the points / team / attendance columns explicit home-vs-away names.
nba = nba.rename(
    columns={
        "PTS": "AwayPTS",
        "PTS.1": "HomePTS",
        "Visitor/Neutral": "Away",
        "Home/Neutral": "Home",
        "Attend.": "Attend",
    }
)

In [5]:
# Target column: HomeWin is 1 when the home side scored more points than the
# visitors and 0 otherwise (ties or missing scores therefore count as 0).
nba["HomeWin"] = np.where(nba["AwayPTS"] < nba["HomePTS"], 1, 0)

In [6]:
# Modelling frame: the two team columns plus the target.
# `.copy()` makes this an independent DataFrame. Later cells add and overwrite columns
# on `nbatest`; doing that on a bare column selection of `nba` triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous whether `nba` is affected.
nbatest = nba[["Away", "Home", "HomeWin"]].copy()

In [None]:
from sklearn import preprocessing
# One-hot encode the team names: one indicator column per (role, team) pair,
# named "Away_<team>" / "Home_<team>" by `pd.get_dummies`.
dummies = pd.get_dummies(nbatest[["Away","Home"]])
# Join on the shared row index and rebind `nbatest` instead of assigning many new
# columns into the existing object; this avoids SettingWithCopyWarning when `nbatest`
# was derived from a slice of `nba`, and does not depend on multi-column setitem
# creating new columns.
nbatest = nbatest.join(dummies)

In [None]:
# The raw team-name strings are no longer needed once their indicator columns exist.
nbatest = nbatest.drop(["Home", "Away"], axis=1)


In [None]:
#defining functions that check the accuracy score of our predictions using different models
#Random Forests
def RFscore(nbatest, train_size=900, random_state=None):
    """Hold-out accuracy of a random forest predicting ``HomeWin``.

    The frame is split by row position (no shuffling): the first ``train_size``
    rows are used for fitting and every remaining row for evaluation.

    Parameters
    ----------
    nbatest : pandas.DataFrame
        Must contain a ``HomeWin`` column; every other column is used as a feature.
    train_size : int, default 900
        Number of leading rows used for training. The default reproduces the
        split that was previously hard-coded.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier``. ``None`` keeps the original
        (non-reproducible) behaviour; pass an integer for repeatable scores.

    Returns
    -------
    float
        Fraction of test rows whose ``HomeWin`` value was predicted correctly.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    features = nbatest.drop(columns="HomeWin")
    target = nbatest["HomeWin"]

    # .iloc makes the positional nature of the split explicit.
    X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
    y_train, y_test = target.iloc[:train_size], target.iloc[train_size:]

    mod = RandomForestClassifier(n_estimators=100, random_state=random_state)
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    return accuracy_score(y_test, y_pred)


#SVC
def SVCscore(nbatest, train_size=900):
    """Hold-out accuracy of a support-vector classifier predicting ``HomeWin``.

    The frame is split by row position (no shuffling): the first ``train_size``
    rows are used for fitting and every remaining row for evaluation.

    Parameters
    ----------
    nbatest : pandas.DataFrame
        Must contain a ``HomeWin`` column; every other column is used as a feature.
    train_size : int, default 900
        Number of leading rows used for training. The default reproduces the
        split that was previously hard-coded.

    Returns
    -------
    float
        Fraction of test rows whose ``HomeWin`` value was predicted correctly.
    """
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    features = nbatest.drop(columns="HomeWin")
    target = nbatest["HomeWin"]

    # .iloc makes the positional nature of the split explicit.
    X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
    y_train, y_test = target.iloc[:train_size], target.iloc[train_size:]

    svc = SVC(gamma='auto')
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    return accuracy_score(y_test, y_pred)


#LogisticRegression
def LRscore(nbatest, train_size=900):
    """Hold-out accuracy of a logistic regression predicting ``HomeWin``.

    The frame is split by row position (no shuffling): the first ``train_size``
    rows are used for fitting and every remaining row for evaluation.

    Parameters
    ----------
    nbatest : pandas.DataFrame
        Must contain a ``HomeWin`` column; every other column is used as a feature.
    train_size : int, default 900
        Number of leading rows used for training. The default reproduces the
        split that was previously hard-coded.

    Returns
    -------
    float
        Fraction of test rows whose ``HomeWin`` value was predicted correctly.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    features = nbatest.drop(columns="HomeWin")
    target = nbatest["HomeWin"]

    # .iloc makes the positional nature of the split explicit.
    X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
    y_train, y_test = target.iloc[:train_size], target.iloc[train_size:]

    mod = LogisticRegression()
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
#Adding "time on the road" feature
# Timeaway = number of consecutive away games the visiting team has played,
# counting the current one. A team's counter resets to 0 whenever it hosts a game.
# Rows are processed in file order, which is assumed to be chronological -- confirm.
#
# The values are collected in a list and written by column NAME. The previous
# `nbatest.iloc[i, -1]` writes assumed "Timeaway" was the last column, so re-running
# this cell after later feature cells had run silently overwrote a different column.
away_counts = {}
timeaway = []
for away_team, home_team in zip(nba["Away"], nba["Home"]):
    away_counts[away_team] = away_counts.get(away_team, 0) + 1
    away_counts[home_team] = 0
    timeaway.append(away_counts[away_team])
nbatest["Timeaway"] = timeaway

print("Logistic Regression Accuracy Score: ",LRscore(nbatest),"\nSVC Accuracy score:",SVCscore(nbatest),"\nRandom Forests Accuracy score:",RFscore(nbatest))

In [None]:
#adding Home win streak and Away win streak features
# HomeStreak / AwayStreak = number of consecutive wins each side brings INTO the game.
# Rows are processed in file order, which is assumed to be chronological -- confirm.
#
# Fixes relative to the earlier version of this cell:
#  * the loser's streak is now actually reset (`= 0`); the old code used `== 0`,
#    a comparison whose result was discarded, so counters only ever grew;
#  * the streak is recorded BEFORE the current result is applied. Recording it
#    afterwards encodes the game's outcome in its own features (label leakage);
#  * counters default to 0 via `.get`, so a team that never appears in the Away
#    column no longer raises KeyError;
#  * values are written by column name rather than by `iloc[row, -2]` / `iloc[row, -1]`,
#    which depended on these being the last two columns.
win_counts = {}
home_streaks = []
away_streaks = []
for home_team, away_team, home_win in zip(nba["Home"], nba["Away"], nbatest["HomeWin"]):
    home_streaks.append(win_counts.get(home_team, 0))
    away_streaks.append(win_counts.get(away_team, 0))
    if home_win == 1:
        win_counts[home_team] = win_counts.get(home_team, 0) + 1
        win_counts[away_team] = 0
    else:
        win_counts[away_team] = win_counts.get(away_team, 0) + 1
        win_counts[home_team] = 0
nbatest["HomeStreak"] = home_streaks
nbatest["AwayStreak"] = away_streaks
    
print("Logistic Regression Accuracy Score: ",LRscore(nbatest),"\nSVC Accuracy score:",SVCscore(nbatest),"\nRandom Forests Accuracy score:",RFscore(nbatest))

In [None]:
#RANKINGS
# Build a {team name -> rank} lookup from the rankings file.
feb_ranks = pd.read_csv("feb_ranks.csv")
# The "Western Conference" column is copied to "Team" and used as the team name
# (presumably the header the export gave the name column -- verify against the CSV).
feb_ranks["Team"] = feb_ranks["Western Conference"]
rankdict = {
    feb_ranks.Team[row]: feb_ranks.Rk[row]
    for row in range(len(feb_ranks.Team))
}


In [None]:
#Creates HomeRank and AwayRank for each matchup in nba dataset
# Each team name is looked up in `rankdict`; a team missing from the rankings still
# raises KeyError rather than silently producing a NaN.
# Columns are assigned by name. The previous `iloc[i, -1]` / `iloc[i, -2]` writes
# assumed these were the last two columns, so re-running the cell once later columns
# existed overwrote the wrong ones.
nbatest["HomeRank"] = [rankdict[team] for team in nba["Home"]]
nbatest["AwayRank"] = [rankdict[team] for team in nba["Away"]]

In [None]:
rankspread = nbatest["HomeRank"]-nbatest["AwayRank"]
# HomeRanksHigher is 1 when HomeRank is numerically greater than AwayRank, else 0
# (equal or missing ranks give 0).
# NOTE(review): the original comments read "positive spread => home is better". If Rk
# follows the usual 1 = best convention, a positive spread instead means the home team
# is ranked WORSE -- confirm the direction of Rk before interpreting this feature.
# Computed in one vectorised step and written by column name, so the cell no longer
# depends on the new column being the last one.
nbatest["HomeRanksHigher"] = (rankspread > 0).astype(int)


print("Logistic Regression Accuracy Score: ",LRscore(nbatest),"\nSVC Accuracy score:",SVCscore(nbatest),"\nRandom Forests Accuracy score:",RFscore(nbatest))

In [None]:
rankspread = nbatest["HomeRank"]-nbatest["AwayRank"]
# HomeisFav is 1 only when HomeRank exceeds AwayRank by more than FAV_RANK_GAP places,
# otherwise 0. This matches the earlier loop exactly: its `> 0 and > 13` test reduces
# to `> 13`, and both its `elif` and `else` branches wrote 0.
# NOTE(review): the old `elif` singled out spreads below -13 but still stored 0; if a
# distinct value (e.g. -1 for "away is favourite") was intended, add it here.
# NOTE(review): as with HomeRanksHigher, confirm whether a larger Rk means a better or
# a worse team before reading this as "home is the favourite".
FAV_RANK_GAP = 13
nbatest["HomeisFav"] = (rankspread > FAV_RANK_GAP).astype(int)
        
print("Logistic Regression Accuracy Score: ",LRscore(nbatest),"\nSVC Accuracy score:",SVCscore(nbatest),"\nRandom Forests Accuracy score:",RFscore(nbatest))