In [3]:
import pandas as pd
from IPython.display import display, HTML
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import cufflinks as cf
# Switch cufflinks to offline mode through its config file (offline=True).
# NOTE(review): presumably so plotly figures render locally without a Chart Studio
# account — confirm.
cf.set_config_file(offline=True)

In [4]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Teams

In [18]:
# Read the team-level season table and discard the 'Unnamed: 0' column in one step.
# NOTE(review): 'Unnamed: 0' is presumably a row index that was written into the CSV — confirm.
nba_teams = pd.read_csv(
    '/Users/kalebmckenzie/Documents/GitHub/Basketball Project/NBA_Stats_2010-11_2020-21.csv'
).drop(columns=['Unnamed: 0'])

In [6]:
# Columns removed from the team table before any modelling.
# NOTE: running this cell twice without reloading nba_teams raises KeyError, because
# drop() refuses labels that are no longer present.
_unused_team_columns = [
    'Arena', 'Attend.', 'Attend./G', 'SOS', 'SRS', 'Pace',
    'L', 'W/L%', 'GB', 'G', 'MP', 'PL',
]
nba_teams = nba_teams.drop(columns=_unused_team_columns)

In [7]:
# Keep only the numeric/boolean team columns that have no missing values, then drop 'Year'.
# select_dtypes is the public pandas API; the original relied on the private
# DataFrame._get_numeric_data(), which may change without notice.
good_team_columns = (
    nba_teams
    .select_dtypes(include=['number', 'bool'])
    .dropna(axis=1)
    .drop(columns=['Year'])
)

In [19]:
import numpy as np
import statsmodels.api as sm

def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Forward-backward stepwise feature selection based on OLS p-values.

    Each round first tries to add the excluded column with the smallest p-value
    (if it is below ``threshold_in``), then tries to drop the included column with
    the largest p-value (if it is above ``threshold_out``). The loop ends when a
    full round changes nothing.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target passed to ``sm.OLS`` as the dependent variable.
    initial_list : list, optional
        Column labels to start with. It is copied, so neither the caller's list
        nor the shared default is ever mutated.
    threshold_in : float
        A feature is added when its p-value is strictly below this value.
    threshold_out : float
        A feature is removed when its p-value is strictly above this value.
        Keep ``threshold_in < threshold_out``; otherwise the same feature can be
        added and dropped again on every round.
    verbose : bool
        Print each add/drop decision.

    Returns
    -------
    list
        Labels of the selected columns, in the order they were added.
    """
    included = list(initial_list)
    while True:
        changed = False

        # forward step
        # Preserve X's column order instead of using a set difference, so that ties
        # between equal p-values are resolved the same way on every run.
        excluded = [column for column in X.columns if column not in included]
        if excluded:
            # An explicit dtype avoids the object-dtype empty Series (and the
            # associated pandas warning) the original created.
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
                new_pval[new_column] = model.pvalues[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                # idxmin() returns the column label. argmin() returns an integer
                # position in current pandas, which would corrupt `included`.
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        # Nothing to drop (and no reason to fit a model) while no feature is included.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Look the features up by label rather than slicing off row 0, so the
            # intercept is skipped regardless of where add_constant placed it.
            pvalues = model.pvalues[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                # idxmax() for the same reason idxmin() is used above.
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included


# Stepwise teams 

In [9]:
from sklearn.model_selection import train_test_split
# Feature columns for the "stepwise" win model; the target is the 'W' column.
# 80/20 split with a fixed random_state so the split is repeatable.
stepwise_features = ['PTS', '3PA', '2PA', 'NRtg', 'PW', 'FG%', 'ORtg', 'DRtg', 'ORB', 'DRB']
x_train, x_test, y_train, y_test = train_test_split(
    good_team_columns[stepwise_features],
    good_team_columns[['W']],
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LinearRegression
import math
from sklearn.metrics import mean_squared_error
# Fit a linear regression on the stepwise feature split and predict the held-out rows.
lr = LinearRegression()
lr.fit(x_train, y_train)
predictions = lr.predict(x_test)

# score() on a LinearRegression returns R^2 for the given data; the variable keeps its
# original "confidence" name, but the number is a goodness-of-fit measure.
lr_confidence = lr.score(x_test, y_test)
print("lr confidence (R^2): ", lr_confidence)

# The printed value is sqrt(MSE), i.e. the RMSE. The original label said "(MSE)", which
# disagreed both with the value and with the hand-picked model's "(RMSE)" label.
print("Root Mean Squared Error (RMSE): ", math.sqrt(mean_squared_error(y_test, predictions)))
print(predictions)
print(y_test)

lr confidence (R^2):  0.9546295240761689
Root Mean Squared Error (MSE):  2.6188864244663805
[[26.65609993]
 [30.67933377]
 [52.06841414]
 [19.50989444]
 [55.70034464]
 [30.22890377]
 [34.28263484]
 [41.24873596]
 [52.28237886]
 [22.03259476]
 [46.71858469]
 [37.13058053]
 [40.30151366]
 [52.00431228]
 [33.78971498]
 [66.95406584]
 [48.92563145]
 [40.95856456]
 [21.86884164]
 [56.22910518]
 [31.02492904]
 [42.32575903]
 [24.85812811]
 [33.03433711]
 [25.00836374]
 [38.78867955]
 [45.33734618]
 [58.48666733]
 [41.24892526]
 [33.63956144]
 [56.42430397]
 [45.05011651]
 [45.96703759]
 [67.99886519]
 [35.75107685]
 [14.00350102]
 [40.60775753]
 [31.99975151]
 [40.23565718]
 [21.99887687]
 [41.43344316]
 [39.6329079 ]
 [50.21773536]
 [46.07846933]
 [34.4270465 ]
 [46.6705507 ]
 [40.6709295 ]
 [15.58777925]
 [18.18603959]
 [44.2804248 ]
 [41.80486791]
 [52.87579741]
 [46.50673697]
 [42.21737057]
 [38.78223008]
 [26.17829296]
 [31.12227962]
 [31.89495464]
 [37.57349037]
 [59.00295836]
 [13.755

In [11]:
# Put the model's predictions next to the actual wins. assign() builds a new frame, so the
# object returned by train_test_split is not written to in place (in-place column
# assignment on such a frame can trigger pandas' SettingWithCopyWarning).
# NOTE: after this cell y_test has two columns ('W' and 'Preds'); re-run the split before
# re-running the regression cell, which expects y_test to hold only the target.
y_test = y_test.assign(Preds=predictions)

In [12]:
# Predicted vs. actual wins for the stepwise model; columns are referenced by name.
px.scatter(y_test, x='Preds', y='W', height=400)

# Hand-picked stats

In [13]:
# Feature columns for the hand-picked win model; same target ('W'), split size and
# random_state as the stepwise split.
handpicked_features = ['FTA', '3PA', '2PA', 'FGA', 'ORB%', 'DRB%',
                       'ORtg', 'DRtg', 'TOV%', 'STL', 'BLK']
x_trains, x_tests, y_trains, y_tests = train_test_split(
    good_team_columns[handpicked_features],
    good_team_columns[['W']],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Linear regression on the hand-picked feature split (fit() returns the estimator itself).
lrh = LinearRegression().fit(x_trains, y_trains)
predictions_hand = lrh.predict(x_tests)

# R^2 on the held-out rows.
lrh_confidence = lrh.score(x_tests, y_tests)
print("lr confidence (R^2): ", lrh_confidence)

# RMSE = square root of the mean squared error on the held-out rows.
rmse_hand = math.sqrt(mean_squared_error(y_tests, predictions_hand))
print("Root Mean Squared Error (RMSE): ", rmse_hand)
print(predictions_hand)
print(y_tests)

lr confidence (R^2):  0.8742941868609949
Root Mean Squared Error (RMSE):  4.359208861863496
[[26.68077516]
 [28.64894412]
 [51.06320219]
 [15.7524079 ]
 [53.24784263]
 [29.97243318]
 [36.27055083]
 [40.52487272]
 [48.63965907]
 [21.88033837]
 [45.35841031]
 [40.31681626]
 [43.18397416]
 [51.4654565 ]
 [33.80157924]
 [67.28250318]
 [46.6732872 ]
 [41.13760996]
 [20.77729592]
 [50.66679286]
 [29.29640275]
 [39.72370803]
 [31.58227675]
 [34.68266525]
 [24.69765908]
 [42.59937487]
 [44.20863282]
 [56.10190874]
 [37.22173036]
 [38.04179814]
 [54.55472092]
 [44.92708868]
 [45.03974424]
 [68.61679604]
 [36.59633686]
 [15.76677462]
 [48.2349635 ]
 [38.47343999]
 [40.55465953]
 [21.60352522]
 [43.58366142]
 [47.00059227]
 [46.06570764]
 [42.85828352]
 [39.07453811]
 [52.02355872]
 [38.32741836]
 [16.74586316]
 [23.0549083 ]
 [42.56388914]
 [36.69624935]
 [49.33742546]
 [44.52235068]
 [39.07819768]
 [36.74232832]
 [27.93425565]
 [33.23256226]
 [32.94445025]
 [35.26338305]
 [52.72432967]
 [11.780

In [15]:
# Put the hand-picked model's predictions next to the actual wins. assign() builds a new
# frame instead of writing into the object returned by train_test_split (in-place column
# assignment on such a frame can trigger pandas' SettingWithCopyWarning).
# NOTE: after this cell y_tests has two columns ('W' and 'Preds'); re-run the split before
# re-running the regression cell, which expects y_tests to hold only the target.
y_tests = y_tests.assign(Preds=predictions_hand)

In [16]:
# Predicted vs. actual wins for the hand-picked model; columns are referenced by name.
px.scatter(y_tests, x='Preds', y='W', height=400)

# Player clustering with K-means

In [5]:

# Read the player-level season table and drop 'Year.1' and 'Age' straight away.
nba_players = pd.read_csv(
    '/Users/kalebmckenzie/Documents/GitHub/Basketball Project/NBA_player_stats_2010_2021.csv'
).drop(columns=['Year.1', 'Age'])
# Disabled clean-up that would strip parenthesised text from player names:
#nba_players['Player'] = nba_players['Player'].str.replace(r"\(.*\)","")

In [6]:
# Keep rows whose MP value is above 16, index the table by player name, and replace
# missing values with 0.
played_enough = nba_players['MP'].astype(float) > 16
nba_players = nba_players.loc[played_enough].set_index('Player').fillna(0)

In [7]:
# Remove the position, team, games and games-started columns.
# NOTE(review): 'Pos' is removed here, yet a later cell reads player_df["Pos"]. On a fresh
# top-to-bottom run that lookup would raise KeyError — confirm which state is intended.
nba_players = nba_players.drop(columns=['Pos', 'Tm', 'G', 'GS'])

In [8]:
from sklearn.cluster import KMeans

In [14]:
# One sub-frame per season, in the order the seasons first appear in nba_players.
player_list = [
    nba_players[nba_players.Year == season]
    for season in nba_players.Year.unique()
]

# Fit an independent 5-cluster K-means model on each season and keep the per-row labels.
# player_kmeans[i] therefore belongs to player_list[i].
player_kmeans = []
for season_frame in player_list:
    # Public select_dtypes replaces the private DataFrame._get_numeric_data().
    season_features = season_frame.select_dtypes(include=['number', 'bool']).dropna(axis=1)
    season_model = KMeans(n_clusters=5, random_state=1)
    player_kmeans.append(season_model.fit(season_features).labels_)

In [23]:
# player_kmeans was filled in the order of nba_players.Year.unique(), so pairing the two
# gives an explicit season -> labels lookup. The original indexed player_kmeans by list
# position and silently assumed position 0 is 2010, position 1 is 2011, and so on.
cluster_labels_by_year = dict(zip(nba_players['Year'].unique(), player_kmeans))


def _season_with_clusters(year):
    """Return a copy of one season's rows with its K-means labels in a 'cluster' column."""
    # .copy() makes the result an independent frame, so adding a column neither writes
    # into nba_players nor triggers pandas' SettingWithCopyWarning.
    season = nba_players.loc[nba_players['Year'] == year].copy()
    season['cluster'] = cluster_labels_by_year[year]
    return season


players_2010 = _season_with_clusters(2010)
players_2011 = _season_with_clusters(2011)
players_2012 = _season_with_clusters(2012)
players_2013 = _season_with_clusters(2013)
players_2014 = _season_with_clusters(2014)
players_2015 = _season_with_clusters(2015)
players_2016 = _season_with_clusters(2016)
players_2017 = _season_with_clusters(2017)
players_2018 = _season_with_clusters(2018)
players_2019 = _season_with_clusters(2019)
players_2020 = _season_with_clusters(2020)
players_2021 = _season_with_clusters(2021)


In [16]:
def pca_players(data, n_components=15):
    """Fit a PCA on ``data`` and return the transformed rows.

    Parameters
    ----------
    data : array-like
        Passed unchanged to ``PCA.fit_transform``.
    n_components : int, optional
        Number of components to keep. Defaults to 15, the value that was previously
        hard-coded. NOTE(review): the cells that use this helper only read columns
        0 and 1 of the result.

    Returns
    -------
    The array returned by ``PCA.fit_transform``.
    """
    reducer = PCA(n_components)
    return reducer.fit_transform(data)

In [17]:
# Run the PCA helper once per season frame, oldest season first. The results keep their
# p_<year> names because later cells refer to them individually.
(p_2010, p_2011, p_2012, p_2013, p_2014, p_2015,
 p_2016, p_2017, p_2018, p_2019, p_2020, p_2021) = [
    pca_players(season_frame)
    for season_frame in (
        players_2010, players_2011, players_2012, players_2013,
        players_2014, players_2015, players_2016, players_2017,
        players_2018, players_2019, players_2020, players_2021,
    )
]

In [28]:
# Store the first two PCA columns on each season frame as plotting coordinates.
# Every season must use the same column names: the dashboard callbacks plot
# "x_cor"/"y_cor", and the original cell wrote 2010 and 2011 to "x_var"/"y_var", which
# left those two seasons without coordinates once the frames were concatenated.
_season_projections = [
    (players_2010, p_2010),
    (players_2011, p_2011),
    (players_2012, p_2012),
    (players_2013, p_2013),
    (players_2014, p_2014),
    (players_2015, p_2015),
    (players_2016, p_2016),
    (players_2017, p_2017),
    (players_2018, p_2018),
    (players_2019, p_2019),
    (players_2020, p_2020),
    (players_2021, p_2021),
]
for _season_frame, _projection in _season_projections:
    _season_frame['x_cor'] = _projection[:, 0]
    _season_frame['y_cor'] = _projection[:, 1]

In [29]:
# Stack the per-season frames, 2010 through 2021 in order, into one table.
season_frames = (
    players_2010, players_2011, players_2012, players_2013,
    players_2014, players_2015, players_2016, players_2017,
    players_2018, players_2019, players_2020, players_2021,
)
player_df = pd.concat(season_frames)

In [33]:
# Keep only the part of each position string that precedes the first "-".
player_df["Pos"] = player_df["Pos"].str.split("-").str[0]

In [34]:
# Display the distinct position values left after the split above (sanity check).
player_df['Pos'].unique()

array(['SG', 'PF', 'PG', 'SF', 'C', 'F', 'G'], dtype=object)

In [39]:
# Write the combined player table to 'NBA_Player_df.csv' in the current working directory.
# NOTE(review): the dashboard cell reads a file with this name from an absolute project
# path; the two only match if the notebook runs from that directory — confirm.
player_df.to_csv('NBA_Player_df.csv')

In [40]:
# Write both prediction tables (actual wins plus 'Preds') to the current working directory.
for _predictions_frame, _csv_name in ((y_test, 'stepwise.csv'), (y_tests, 'Handpicked.csv')):
    _predictions_frame.to_csv(_csv_name)

# Dash Test

In [5]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
from jupyter_dash import JupyterDash
import pandas as pd
# --- Dashboard inputs: every table is read from an absolute path in the project folder ---
# NBA players
df = pd.read_csv('/Users/kalebmckenzie/Documents/GitHub/Basketball Project/NBA_Player_df.csv')
# NBA teams
tdf = pd.read_csv('/Users/kalebmckenzie/Documents/GitHub/Basketball Project/NBA_Stats_2010-11_2020-21.csv')
# Stepwise predictions
df1 = pd.read_csv('/Users/kalebmckenzie/Documents/GitHub/Basketball Project/stepwise.csv')
# Hand-picked predictions
df2 = pd.read_csv('/Users/kalebmckenzie/Documents/GitHub/Basketball Project/Handpicked.csv')

# --- Static figures: predicted vs. actual wins; the metrics in the titles are typed in by hand ---
fig1 = px.scatter(
    df1, x='Preds', y='W',
    title="Step-Wise RMSE(2.618) R^2(0.954)",
    height=500,
)
fig2 = px.scatter(
    df2, x='Preds', y='W',
    title="Hand Picked Stats RMSE(4.359) R^2(0.874)",
    height=500,
)

# --- Dropdown choices: distinct team names and distinct player names ---
available_indicators = tdf['Team'].unique()
available_indicators2 = df['Player'].unique()

app = JupyterDash(__name__)

# Markdown shown above the step-wise prediction figure: section title plus the list of
# features used by that model.
markdown_text = '''
### NBA Season Long Win Predictions
Step-Wise Stats Chosen:
# 
    PTS -- Points
    3PA -- 3-Point Field Goal Attempts
    2PA -- 2-point Field Goal Attempts
    NRtg -- Net Rating; an estimate of point differential per 100 possessions.
    PW -- Pythagorean wins, i.e., expected wins based on points scored and allowed
    FG% -- Field Goal Percentage
    ORtg -- Offensive Rating
    DRtg -- Defensive Rating
    ORB -- Offensive Rebounds
    DRB -- Defensive Rebounds
'''
# Markdown shown above the hand-picked prediction figure: the list of features used by
# that model.
markdown_texts = '''
Hand Picked Stats Chosen:
#
    FTA -- Free Throw Attempts
    3PA -- 3-Point Field Goal Attempts
    2PA -- 2-point Field Goal Attempts
    FGA -- Field Goal Attempts
    ORB% -- Offensive Rebound Percentage
    DRB% -- Defensive Rebound Percentage
    ORtg -- Offensive Rating
    DRtg -- Defensive Rating
    TOV% -- Opponent Turnover Percentage
    STL -- Steals
    BLK -- Blocks
'''
# Heading for the player scatter that is coloured by position.
markdown_text2 = '''
### Most Similar NBA Players By Year and Position
'''

# Heading for the player scatter that is coloured by cluster.
markdown_text3 = '''
### Most Similar NBA Players By Year Clustered
'''

def _team_dropdown(dropdown_id, width):
    """Build an inline Div holding a team-name dropdown preset to 'Boston Celtics'."""
    return html.Div(
        [
            dcc.Dropdown(
                id=dropdown_id,
                options=[{'label': i, 'value': i} for i in available_indicators],
                value='Boston Celtics'
            )
        ],
        style={'width': width, 'display': 'inline-block'},
    )


def _year_slider(slider_id):
    """Build a slider with one stop per season present in df, starting at the earliest."""
    return dcc.Slider(
        id=slider_id,
        min=df['Year'].min(),
        max=df['Year'].max(),
        value=df['Year'].min(),
        marks={str(year): str(year) for year in df['Year'].unique()},
        step=None
    )


# Page layout, top to bottom: two dropdown/graph pairs, the two player scatters with
# their year sliders, then the two static prediction figures with their descriptions.
app.layout = html.Div([
    _team_dropdown('team_drop', '28%'),
    dcc.Graph(id='indicator-graphic'),

    # NOTE(review): the id says "Player", but this dropdown lists team names
    # (available_indicators) — available_indicators2 holds the player names. Also, a
    # callback below listens on the id 'player_drop', which no component here has.
    _team_dropdown('Player_drop2', '48%'),
    dcc.Graph(id='indicator-graphic2'),

    dcc.Markdown(children=markdown_text3),
    dcc.Graph(id='graph-with-slider'),
    _year_slider('year-slider'),

    dcc.Markdown(children=markdown_text2),
    dcc.Graph(id='graph2-with-slider'),
    _year_slider('year-slider2'),

    dcc.Markdown(children=markdown_text),
    dcc.Graph(id='step-wise', figure=fig1),

    dcc.Markdown(children=markdown_texts),
    dcc.Graph(id='hand-wise', figure=fig2),
])
####
@app.callback(
    Output('indicator-graphic', 'figure'),
    Input('team_drop', 'value'))
def update_graph(team_drop):
    """Return the 'W/L%' by 'Year' scatter for the team selected in 'team_drop'."""
    selected_team_rows = tdf.loc[tdf['Team'] == team_drop]
    return px.scatter(
        selected_team_rows,
        x='Year',
        y='W/L%',
        title="Team Wins By Year",
        height=500,
    )

# The Input id must match a component in app.layout. The second dropdown is declared with
# id 'Player_drop2'; the original listened on 'player_drop', which does not exist, so Dash
# reports a nonexistent-id error and this callback never fires.
@app.callback(
    Output('indicator-graphic2', 'figure'),
    Input('Player_drop2', 'value'))
def update_graph(team_drop):
    """Return the 'W/L%' by 'Year' scatter for the value selected in 'Player_drop2'.

    NOTE(review): this reuses the name ``update_graph`` from the callback above. Dash
    registers callbacks through the decorator, so both still run, but the module-level
    name now points at this function only. The dropdown currently offers team names, so
    this graph mirrors the first one — presumably a player view was intended; confirm.
    """
    dff = tdf[tdf['Team'] == team_drop]

    fig = px.scatter(dff, x='Year', y='W/L%', title="Team Wins By Year", height=500)

    return fig

#####
@app.callback(
    Output('graph-with-slider', 'figure'),
    Input('year-slider', 'value'))
def update_figure(selected_year):
    """Return the PCA scatter, coloured by cluster, for the season chosen on 'year-slider'."""
    filtered_df = df[df.Year == selected_year]

    # hover_name takes a column name; "Player" shows the player's name on hover. The
    # original passed filtered_df.index, which is only the row number assigned by
    # read_csv, so hovering showed integers instead of names.
    fig = px.scatter(filtered_df, x="x_cor", y="y_cor",
                     color="cluster", hover_name="Player"
                     )

    fig.update_layout(transition_duration=500)

    return fig

@app.callback(
    Output('graph2-with-slider', 'figure'),
    Input('year-slider2', 'value'))
def update_figure2(selected_year):
    """Return the PCA scatter, coloured by position, for the season chosen on 'year-slider2'."""
    filtered_df = df[df.Year == selected_year]

    # hover_name takes a column name; "Player" shows the player's name on hover. The
    # original passed filtered_df.index, which is only the row number assigned by
    # read_csv, so hovering showed integers instead of names.
    fig = px.scatter(filtered_df, x="x_cor", y="y_cor",
                     color="Pos", hover_name="Player"
                     )

    fig.update_layout(transition_duration=500)

    return fig


# Start the Dash server with debug enabled when this cell runs as the main module.
# NOTE(review): the output saved with this cell is a ConnectionError (connection refused
# on 127.0.0.1:8050), so the server presumably did not come up on that run — confirm.
if __name__ == '__main__':
    app.run_server(debug=True)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8050): Max retries exceeded with url: /_alive_e6749681-209c-40b5-af38-50962b2cb21d (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fba41e73c90>: Failed to establish a new connection: [Errno 61] Connection refused'))