In [None]:
import pandas as pd
import numpy as np 


In [None]:
# Import any data set from the nba_data folder.
path = '../nba_data/sched1819.csv'
# Raw schedule table exactly as stored in the CSV; the driver cell later
# rebinds this name to the filtered copy returned by clean().
nba_original = pd.read_csv(path)

In [None]:
'''
Data Cleaning
'''

In [None]:
def clean(df2, path):
    """Filter the raw schedule table and build the one-hot encoded model frame.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Raw schedule table. Must contain 'Visitor/Neutral', 'PTS',
        'Home/Neutral', 'PTS.1' and the bookkeeping columns dropped below.
    path : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df``  - 'HomeWin' followed by the 'Away_*' / 'Home_*' dummy columns.
        ``df2`` - the filtered raw table with a fresh 0..n-1 index.
    """
    # Rows whose home-team cell is the literal column title are repeated
    # header lines embedded in the data, not games.
    df2 = df2[df2['Home/Neutral'] != 'Home/Neutral']
    df2 = df2.reset_index(drop=True)
    df = df2.copy()

    # Embedded header rows make the score columns load as strings, and string
    # comparison is lexicographic ('98' < '102' is False). Compare as numbers.
    away_pts = pd.to_numeric(df['PTS'])
    home_pts = pd.to_numeric(df['PTS.1'])
    df['HomeWin'] = np.where(away_pts < home_pts, 1, 0)
    df = df.rename(columns={'Visitor/Neutral': 'Away', 'Home/Neutral': 'Home'})

    # One-hot encode both team columns and append them after 'HomeWin'.
    dummies = pd.get_dummies(df[["Away", "Home"]])
    df = pd.concat([df, dummies], axis=1)

    # Keep only the target and the encoded teams.
    df = df.drop(columns=['Date', 'Start (ET)', 'Unnamed: 6', 'Unnamed: 7',
                          'Attend.', 'Notes', 'PTS', 'PTS.1', 'Away', 'Home'])

    return df, df2

In [None]:
'''
Feature Engineering
'''

In [None]:
def win_streak(df, original):
    """Add each side's current winning streak going into every game.

    Parameters
    ----------
    df : pandas.DataFrame
        Model frame with a 'HomeWin' column (1 = home team won). It is
        modified in place and must be row-aligned with ``original``.
    original : pandas.DataFrame
        Schedule table with 'Visitor/Neutral' and 'Home/Neutral' team names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with 'Away_win_streak' and 'Home_win_streak' added.
        Each value is the streak *before* the game on that row is played.
    """
    visitors = original['Visitor/Neutral']
    hosts = original['Home/Neutral']

    # Running win counts per team. Seed from both columns so a team that has
    # only hosted so far does not raise KeyError.
    team_counts = dict.fromkeys([*visitors, *hosts], 0)

    away_streaks = []
    home_streaks = []
    # Read the outcome by column name: a positional lookup silently points at
    # a different column as soon as new feature columns are appended.
    for visitor, host, home_won in zip(visitors, hosts, df['HomeWin']):
        away_streaks.append(team_counts[visitor])
        home_streaks.append(team_counts[host])
        if home_won == 1:
            team_counts[visitor] = 0
            team_counts[host] += 1
        else:
            team_counts[visitor] += 1
            team_counts[host] = 0

    df['Away_win_streak'] = away_streaks
    df['Home_win_streak'] = home_streaks
    return df
            

In [None]:
def lose_streak(df, original):
    """Add each side's current losing streak going into every game.

    Parameters
    ----------
    df : pandas.DataFrame
        Model frame with a 'HomeWin' column (1 = home team won). It is
        modified in place and must be row-aligned with ``original``.
    original : pandas.DataFrame
        Schedule table with 'Visitor/Neutral' and 'Home/Neutral' team names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with 'Away_lose_streak' and 'Home_lose_streak' added.
        Each value is the streak *before* the game on that row is played.
    """
    visitors = original['Visitor/Neutral']
    hosts = original['Home/Neutral']

    # Running loss counts per team, seeded from both columns so a team that
    # has only hosted so far does not raise KeyError.
    lose_counts = dict.fromkeys([*visitors, *hosts], 0)

    away_streaks = []
    home_streaks = []
    for visitor, host, home_won in zip(visitors, hosts, df['HomeWin']):
        away_streaks.append(lose_counts[visitor])
        home_streaks.append(lose_counts[host])
        # The counters must be assigned, not compared: the loser's streak
        # grows by one and the winner's streak resets.
        if home_won == 1:
            lose_counts[visitor] += 1
            lose_counts[host] = 0
        else:
            lose_counts[visitor] = 0
            lose_counts[host] += 1

    df['Away_lose_streak'] = away_streaks
    df['Home_lose_streak'] = home_streaks
    return df

        

In [None]:
def timeaway(df, original):
    """Add 'Timeaway': consecutive road games for the visiting team.

    The count includes the game on the current row and resets to zero for a
    team whenever it plays at home.

    Parameters
    ----------
    df : pandas.DataFrame
        Model frame, modified in place; must have as many rows as ``original``.
    original : pandas.DataFrame
        Schedule table with 'Visitor/Neutral' and 'Home/Neutral' team names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the 'Timeaway' column set.
    """
    visitors = original["Visitor/Neutral"]
    hosts = original["Home/Neutral"]

    away_counts = dict.fromkeys(visitors.unique(), 0)
    road_trip = []
    for visitor, host in zip(visitors, hosts):
        away_counts[visitor] += 1
        away_counts[host] = 0
        road_trip.append(away_counts[visitor])

    # Assign by name rather than "last column": if 'Timeaway' already exists
    # somewhere else in the frame (e.g. the cell is re-run), a positional
    # write would overwrite an unrelated column.
    df["Timeaway"] = road_trip
    return df

In [None]:
def allstars(df, original):
    """Attach the hard-coded All-Star head-count of both teams to every game.

    Teams missing from the tables below count as zero, provided they appear
    at least once in ``original['Visitor/Neutral']``.

    NOTE(review): the roster numbers are fixed in code; which season they
    describe is not stated here - confirm they match the loaded schedule.

    Parameters
    ----------
    df : pandas.DataFrame
        Model frame, modified in place, indexed 0..n-1 like ``original``.
    original : pandas.DataFrame
        Schedule table with 'Visitor/Neutral' and 'Home/Neutral' team names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with 'Away_Allstar_Count' and 'Home_Allstar_Count'.
    """
    west = {
        'Dallas Mavericks': 1,
        'Houston Rockets': 2,
        'Los Angeles Clippers': 2,
        'Los Angeles Lakers': 2,
        'Portland Trail Blazers': 1,
        'Utah Jazz': 1,
        'Minnesota Timberwolves': 1,
        'Denver Nuggets': 1,
        'Phoenix Suns': 1,
    }
    east = {
        'Atlanta Hawks': 1,
        'Boston Celtics': 2,
        'Toronto Raptors': 1,
        'Milwaukee Bucks': 2,
        'Philadelphia 76ers': 2,
        'Miami Heat': 2,
        'Washington Wizards': 1,
        'Indiana Pacers': 1,
    }

    # Start every visiting team at zero, then overlay the listed teams.
    allstar_count = dict.fromkeys(original["Visitor/Neutral"].unique(), 0)
    allstar_count.update(west)
    allstar_count.update(east)

    # (target feature column, schedule column holding the team name)
    sides = (
        ('Away_Allstar_Count', 'Visitor/Neutral'),
        ('Home_Allstar_Count', 'Home/Neutral'),
    )
    for row in range(len(original)):
        for feature, team_col in sides:
            df.loc[row, feature] = allstar_count[original.loc[row, team_col]]

    return df
    
    
    

In [1]:
#Scale features before fitting to model
def scale(features, df):
    """Return a copy of ``df`` with the ``features`` columns standardised.

    Parameters
    ----------
    features : list-like of column labels
        Columns to transform with scikit-learn's ``StandardScaler``.
    df : pandas.DataFrame
        Input frame. It is left untouched so the caller keeps the unscaled
        values.

    Returns
    -------
    pandas.DataFrame
        New frame; ``features`` are replaced by their scaled values and all
        other columns are copied unchanged.
    """
    from sklearn import preprocessing

    # NOTE(review): the scaler is fitted on every row, including the rows that
    # fit() later holds out for testing - consider fitting on the training
    # slice only.
    scaled_df = df.copy()
    scaled_df[features] = preprocessing.StandardScaler().fit_transform(df[features])
    return scaled_df

In [3]:
'''
Model Fitting and Evaluation
'''

'\nModel Fitting and Evaluation\n'

In [4]:
def fit(df, model, train_size=984):
    """Fit ``model`` on the leading rows of ``df`` and score it on the rest.

    The split is positional and unshuffled: rows ``[:train_size]`` train the
    model and rows ``[train_size:]`` test it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the 'HomeWin' target plus the feature columns.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    train_size : int, default 984
        Number of leading rows used for training.

    Returns
    -------
    float
        Accuracy of the predictions on the held-out rows.

    Raises
    ------
    ValueError
        If ``train_size`` leaves the training or the test slice empty.
    """
    from sklearn.metrics import accuracy_score

    if not 0 < train_size < len(df):
        raise ValueError(
            f"train_size must be between 1 and {len(df) - 1}, got {train_size}"
        )

    features = df.drop(columns="HomeWin")
    target = df["HomeWin"]
    X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
    y_train, y_test = target.iloc[:train_size], target.iloc[train_size:]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)


In [None]:
def run_fit(df, random_state=42):
    """Fit three reference classifiers on ``df`` and print their accuracies.

    Logistic regression, a support vector classifier and a random forest are
    each passed to ``fit`` with its default split; the three scores are
    printed on one report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the 'HomeWin' target plus the feature columns.
    random_state : int or None, default 42
        Seed for the random forest so repeated runs print the same score.
        Pass ``None`` for an unseeded forest.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    logistic = fit(df, LogisticRegression())
    support_vector = fit(df, SVC(gamma='auto'))
    # The forest is the only stochastic estimator here; seed it so the
    # notebook reproduces under Restart & Run All.
    rand_forest = fit(
        df, RandomForestClassifier(n_estimators=100, random_state=random_state)
    )
    print(f'Logistic Regression: {logistic} \n Support Vector Classifier: {support_vector} \n Random Forest Classifier: {rand_forest} ') 
    

In [None]:
def fit_and_predict(df, model,):
    """Fit the three reference classifiers on ``df`` and discard the scores.

    NOTE(review): ``model`` is never used and nothing is returned, so this
    currently repeats the work of ``run_fit`` without reporting it - it looks
    unfinished; confirm the intended behaviour.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    # Build each estimator only when its turn comes, in the fixed order
    # logistic regression -> support vector classifier -> random forest.
    builders = (
        LogisticRegression,
        lambda: SVC(gamma='auto'),
        lambda: RandomForestClassifier(n_estimators = 100),
    )
    for build in builders:
        fit(df, build())
    

In [None]:
# Call Functions
cleaned, nba_original = clean(nba_original, path)

# Each feature builder returns the frame it was given, so the steps are
# applied one after another: streaks, road-trip length, then All-Star counts.
df_unscaled = win_streak(cleaned, nba_original)
df_unscaled = lose_streak(df_unscaled, nba_original)
df_unscaled = timeaway(df_unscaled, nba_original)
df_unscaled = allstars(df_unscaled, nba_original)

# The seven engineered columns are the last ones appended to the frame.
feats = df_unscaled.iloc[:, -7:].columns
df_scaled = scale(feats, df_unscaled)
run_fit(df_scaled)
