# What makes an NBA All-Star?
### STAT 206 Final Project

Description

In [None]:
# Set up the notebook's package environment from the current working directory.
import Pkg

project_dir = pwd()
Pkg.activate(project_dir)  # make the environment in this directory the active one
Pkg.instantiate()          # install the environment's recorded dependencies if needed
Pkg.status()               # list the packages of the active environment

In [None]:
using CSV, DataFrames
using Plots, StatsPlots
using Distributions
using MLJ, MLJClusteringInterface
using NearestNeighbors, StableRNGs, Random
#import PlotlyJS as PJS#import because PlotlyJS overwrites all plots and statsplots
#using Colors
using MLJLIBSVMInterface
using FreqTables

In [None]:
# Load the per-game player statistics; rows containing any missing value are dropped.
nbadf = dropmissing(CSV.read("nba_2022_2023.csv", DataFrame, stringtype = String))

# Rename columns whose original names are awkward as Julia identifiers
# (percent signs, leading digits), and call points per game PPG.
rename!(
    nbadf,
    Symbol("FG%") => :FGpct,
    Symbol("3P") => :ThreeP,
    Symbol("3PA") => :ThreePA,
    Symbol("3P%") => :ThreePpct,
    Symbol("2P") => :TwoP,
    Symbol("2PA") => :TwoPA,
    Symbol("2P%") => :TwoPpct,
    Symbol("eFG%") => :eFGpct,
    Symbol("FT%") => :FTpct,
    :PTS => :PPG,
)

# Reduce multi-position labels to the main (first-listed) position.
# Splitting on '-' keeps one-letter positions intact: "C-PF" becomes "C",
# whereas taking the first two characters would produce "C-".
nbadf.Pos = String.(first.(split.(nbadf.Pos, '-')))

# Attach the All-Star flag by player name; players absent from the All-Star
# file get a missing flag from the left join, which is then recoded to "N".
allstars = CSV.read("nba_2022_2023_allstars.csv", DataFrame, stringtype = String)
leftjoin!(nbadf, allstars, on = :Player => :NAME)
replace!(nbadf.ALLSTAR, missing => "N")

# Fraction of games played in which the player started.
nbadf.GSpct = nbadf.GS ./ nbadf.G
describe(nbadf)

Columns Description:

1. Rk: Rank
2. Player: Player's name
3. Pos: Position
4. Age: Player's age
5. Tm: Team
6. G: Games played
7. GSpct: Fraction of games played that the player started (GS / G)
8. MP: Minutes played per game
9. FG: Field goals per game
10. FGA: Field goal attempts per game
11. FGpct: Field goal percentage
12. ThreeP: 3-point field goals per game
13. ThreePA: 3-point field goal attempts per game
14. ThreePpct: 3-point field goal percentage
15. TwoP: 2-point field goals per game
16. TwoPA: 2-point field goal attempts per game
17. TwoPpct: 2-point field goal percentage
18. eFGpct: Effective field goal percentage
19. FT: Free throws per game
20. FTA: Free throw attempts per game
21. FTpct: Free throw percentage
22. ORB: Offensive rebounds per game
23. DRB: Defensive rebounds per game
24. TRB: Total rebounds per game
25. AST: Assists per game
26. STL: Steals per game
27. BLK: Blocks per game
28. TOV: Turnovers per game
29. PF: Personal fouls per game
30. PPG: Points per game
31. ALLSTAR: All-Star status

In [None]:
# Correlation heatmap over all numeric predictors.
# Name, position, team and the All-Star flag are excluded, as is the raw
# games-started count GS.
numeric_df = select(nbadf, Not([:Player, :Pos, :Tm, :ALLSTAR, :GS]))
vars = names(numeric_df)
X = Matrix(numeric_df)
M = cor(X)

# Tick positions follow the number of columns instead of a hard-coded count,
# so the labels stay aligned if the set of predictors changes.
fig = heatmap(
    M,
    title = "Correlation Matrix - All Predictors",
    xticks = (1:length(vars), vars),
    yticks = (1:length(vars), vars),
    clims = (-1, 1),
    xrot = 45,
    color = cgrad(:balance, rev = true),
    aspect_ratio = :equal,  # square cells (`aspect = :ratio` is not a Plots attribute)
    size = (900, 800),
)

# Print each coefficient on its own cell: heatmap draws M[row, col] at
# x = col, y = row, so the annotation coordinates follow the same convention.
for col in axes(M, 2), row in axes(M, 1)
    annotate!(fig, col, row, text("$(round(M[row, col], digits=2))", :white, 8))
end
fig

### Check for multicollinearity
+ Shot attempts: made shots and shot attempts are highly correlated, so it is better to keep shots made and shooting percentage and drop the attempts
+ eFGpct: According to [basketball-reference.com](https://www.basketball-reference.com/about/glossary.html), eFGpct = (FG + 0.5 * 3P) / FGA, so it is fully determined by other predictors
+ Field Goals: FG is just 2P + 3P, so it is collinear with those two predictors
+ Total Rebounds: TRB is offensive + defensive rebounds (ORB + DRB), so it is collinear with those two predictors
+ Recommendation: Remove these predictors when creating linear models

In [None]:
# Correlation heatmap after dropping the predictors flagged as collinear.
# The reduced table is built once and reused for both the labels and the matrix.
reduced_df = select(
    numeric_df,
    Not([:FG, :FGA, :FGpct, :ThreePA, :TwoPA, :FTA, :TRB, :eFGpct]),
)
vars = names(reduced_df)
X = Matrix(reduced_df)
M = cor(X)

# Tick positions follow the number of remaining columns instead of a
# hard-coded count.
fig = heatmap(
    M,
    title = "Correlation Matrix",
    xticks = (1:length(vars), vars),
    yticks = (1:length(vars), vars),
    clims = (-1, 1),
    xrot = 45,
    color = cgrad(:balance, rev = true),
    aspect_ratio = :equal,  # square cells (`aspect = :ratio` is not a Plots attribute)
    size = (900, 800),
)

# Print each coefficient on its own cell: heatmap draws M[row, col] at
# x = col, y = row, so the annotation coordinates follow the same convention.
for col in axes(M, 2), row in axes(M, 1)
    annotate!(fig, col, row, text("$(round(M[row, col], digits=2))", :white, 8))
end
fig

In [None]:
# Aside: a colour gradient for heatmaps can be built with cgrad,
# e.g. cgrad(:roma, scale = :log).
#
# Points per game against minutes per game, one series per All-Star status,
# each with a smoothed trend line.
@df nbadf scatter(
    :MP,
    :PPG,
    group = :ALLSTAR,
    smooth = true,
    linewidth = 4,
    title = "Playtime efficiency",
    xlabel = "Minutes played per game",
    ylabel = "Points per game",
    label = ["Players" "Allstars"]
)

In [None]:
# All-Stars who average fewer than 20 minutes per game.
filter([:ALLSTAR, :MP] => (status, minutes) -> status == "Y" && minutes < 20, nbadf)

In [None]:
# Stacked histogram of player ages, split by All-Star status.
groupedhist(
    nbadf.Age;
    group = nbadf.ALLSTAR,
    bar_position = :stack,
    title = "Player Age Distribution",
    xlabel = "Age",
    ylabel = "Count",
    label = ["Players" "Allstars"],
)

In [None]:
# 2-point makes against 3-point attempts per game, one series per All-Star status.
@df nbadf scatter(
    :ThreePA,
    :TwoP,
    group = :ALLSTAR
)

In [None]:
# Free-throw attempts per game by position, one series per All-Star status.
scatter(
    nbadf.FTA,
    nbadf.Pos;
    group = nbadf.ALLSTAR,
)


In [None]:
# Distribution of games played at each age (one box per age, no legend entry).
boxplot(
    nbadf.Age,
    nbadf.G;
    xlabel = "Age",
    ylabel = "Games Played",
    label = nothing,
    size = (600, 300),
)


## PCA Analysis

In [None]:
# Names of the numeric predictors, in column order.
features = names(numeric_df)

# Load the PCA model type provided by MultivariateStats.
PCA = @load PCA pkg = MultivariateStats

# Pipeline: standardise every column, then run PCA with a 97.5% variance-ratio target.
model = Pipeline(Standardizer(), PCA(variance_ratio = 0.975))

# Bind the pipeline to the numeric data and fit it.
mach = MLJ.fit!(machine(model, numeric_df))

# Project the numeric data onto the principal components and materialise it as a Matrix.
Xproj = Matrix(MLJ.transform(mach, numeric_df))

In [None]:
# Report of the fitted pipeline; its `pca` entry holds the PCA step's report.
pipeline_report = report(mach)
r = pipeline_report.pca

In [None]:
# 3-D scatter of the first three principal components, grouped by All-Star status.
y = Vector(nbadf[:, :ALLSTAR])
scatter(
    Xproj[:, 1],
    Xproj[:, 2],
    Xproj[:, 3];
    group = y,
    title = "NBA Data in PC coordinates",
    xlabel = "PC1",
    ylabel = "PC2",
    zlabel = "PC3",
    label = ["Players" "Allstars"],
)

In [None]:
# 2-D scatter of the first two principal components, grouped by All-Star status.
y = Vector(nbadf[:, :ALLSTAR])
scatter(
    Xproj[:, 1],
    Xproj[:, 2];
    group = y,
    title = "NBA Data in PC coordinates",
    xlabel = "PC1",
    ylabel = "PC2",
    label = ["Players" "Allstars"],
)

In [None]:
# Load the local helper script. NOTE(review): `plotvec!`, used in the biplot
# cell below, is presumably defined in this file — confirm.
include("plotvec.jl")
# Set Plots defaults for subsequent figures: PNG output format at 100 dpi.
default(fmt = :png, dpi = 100)


In [None]:
# Tabulate the PCA loadings next to the feature names.
L = r.loadings
loadings_df = hcat(
    DataFrame(feature = features),
    DataFrame(L, :auto),
)

# Name the component columns PC1, PC2, ... from the number of columns in L.
# The number of components kept is decided by the variance-ratio target, so a
# fixed three-name list would make rename! fail whenever PCA keeps a
# different number of components.
rename!(loadings_df, [:feature; Symbol.("PC", axes(L, 2))])
loadings_df

In [None]:
# Biplot: observations in the PC1-PC2 plane with the feature loadings drawn on top.
fig = scatter(
    Xproj[:, 1],
    Xproj[:, 2];
    group = y,
    title = "NBA Data in PC coordinates",
    xlabel = "PC1",
    ylabel = "PC2",
    markerstrokewidth = 0,
)

# First two loading coordinates of every feature, one 2-element vector per row of L.
ls = [L[row, 1:2] for row in axes(L, 1)]
plotvec!(fig, ls, color = :red)

# Label each loading with its feature name, shifted by 0.1 along y in the
# direction of the loading's sign.
for (idx, vec2) in enumerate(ls)
    xpos = vec2[1]
    ypos = vec2[2] + sign(vec2[2]) * 0.1
    annotate!(fig, xpos, ypos, text(features[idx], :black, 10))
end
fig

#### Interactive scatterplot from PlotlyJS

In [None]:
# PCA_df = DataFrame(:ALLSTAR=>nbadf[:,:ALLSTAR],:X1=>Xproj[:, 1],:X2=>Xproj[:, 2],:X3=>Xproj[:,3])

# colors = [RGB(18/255, 133/255, 248/255), RGB(217/255, 89/255, 56/255)]
# data = PJS.GenericTrace[]
# df = filter(:ALLSTAR=>==("N"),PCA_df)
# PC1=df[:,:X1]
# PC2=df[:,:X2]
# PC3=df[:,:X3]
# trace = PJS.scatter3d(name="Players", mode="markers",
#                     marker_size=3, marker_color=colors[1], marker_line_width=0,
#                     x=PC1, y=PC2, z=PC3)
# push!(data, trace)
# cluster = PJS.mesh3d(color=colors[1], opacity=0.3, x=PC1, y=PC2, z=PC3)
# push!(data, cluster)

# df = filter(:ALLSTAR=>==("Y"),PCA_df)
# PC1=df[:,:X1]
# PC2=df[:,:X2]
# PC3=df[:,:X3]
# trace = PJS.scatter3d(name="Allstars", mode="markers",
#                     marker_size=3, marker_color=colors[2], marker_line_width=0,
#                     x=PC1, y=PC2, z=PC3)
# push!(data, trace)
# cluster = PJS.mesh3d(color=colors[2], opacity=0.3, x=PC1, y=PC2, z=PC3)
# push!(data, cluster)
# # notice the nested attrs to create complex JSON objects
# layout = PJS.Layout(width=800, height=550, autosize=false, title="Interactive NBA Data in PC coordinates",
#                 scene=PJS.attr(xaxis=PJS.attr(gridcolor="rgb(255, 255, 255)",
#                                         zerolinecolor="rgb(255, 255, 255)",
#                                         showbackground=true,
#                                         backgroundcolor="rgb(230, 230,230)",
#                                         title = "PC1"),
#                             yaxis=PJS.attr(gridcolor="rgb(255, 255, 255)",
#                                         zerolinecolor="rgb(255, 255, 255)",
#                                         showbackground=true,
#                                         backgroundcolor="rgb(230, 230,230)",
#                                         title = "PC2"),
#                             zaxis=PJS.attr(gridcolor="rgb(255, 255, 255)",
#                                         zerolinecolor="rgb(255, 255, 255)",
#                                         showbackground=true,
#                                         backgroundcolor="rgb(230, 230,230)",
#                                         title = "PC3"),
#                             aspectratio=PJS.attr(x=1, y=1, z=0.7),
#                             aspectmode = "manual"),
#                             scene_camera = PJS.attr(eye=PJS.attr(x=1.25, y=-2, z=.75) # Try to match Plots output
#                             ))
# # p = PJS.plot(data, layout)
# # open("./PCA3D.html", "w") do io
# #     PJS.PlotlyBase.to_html(io, p.plot)
# # end

<iframe width=900 height=600 src="./PCA3D.html"></iframe>

# Classification Models

In [None]:
# Unsupervised baseline: standardise the numeric predictors and split the
# players into two clusters with k-means.
KMeans = @load KMeans pkg = Clustering verbosity=0
rng = StableRNG(206)

# The KMeans model below is built without any RNG argument, so `rng` alone does
# not make the clustering repeatable. Seed the default RNG as well (same seed
# as the classifier loop uses) so re-running the cell gives the same clusters.
# NOTE(review): assumes k-means seeding draws from the default RNG — confirm
# for the installed Clustering.jl version.
Random.seed!(206)

standardizer = Standardizer()
kmeans = KMeans(k=2)
pipeline_model = Pipeline(standardizer, kmeans)
machine1 = machine(pipeline_model, numeric_df)
fit!(machine1)

# NOTE: this rebinds `r`, which earlier held the PCA report; re-run the PCA
# report cell before reusing its loadings.
r = report(machine1)


In [None]:
# Compare the two k-means clusters with the actual All-Star labels.
assignments = r.k_means.assignments
is_allstar = nbadf.ALLSTAR .== "Y"

# K-means cluster numbers are arbitrary, so cluster 1 is not guaranteed to be
# the All-Star group. Treat the cluster with the larger share of actual
# All-Stars as the predicted All-Star cluster (an empty cluster counts as 0).
allstar_share = map(1:2) do c
    members = is_allstar[assignments .== c]
    isempty(members) ? 0.0 : mean(members)
end
allstar_cluster = argmax(allstar_share)
in_allstar_cluster = assignments .== allstar_cluster

pred_allstars = nbadf[in_allstar_cluster, :]
pred_players = nbadf[.!in_allstar_cluster, :]

# Share of actual All-Stars inside the All-Star cluster, followed by the share
# of non-All-Stars inside the other cluster.
@show mean(pred_allstars.ALLSTAR .== "Y")
mean(pred_players.ALLSTAR .== "N")

In [None]:
# Binary target with an explicit level order: "N" first, "Y" second.
y = categorical(String.(nbadf.ALLSTAR); levels = ["N", "Y"])

# Predictors: the numeric columns minus those dropped for collinearity.
X = select(
    numeric_df,
    Not([:FG, :FGA, :FGpct, :ThreePA, :TwoPA, :FTA, :TRB, :eFGpct]),
)

# Shuffled 80/20 split of the row indices with a fixed seed.
train, test = partition(eachindex(y), 0.8; shuffle = true, rng = 1234)

# Per-model metric accumulators, filled by the evaluation loop.
acc = Float64[]  # accuracy()
pre = Float64[]  # multiclass_precision()
rec = Float64[]  # multiclass_recall()
f1s = Float64[]  # f1score()
mat = Any[]      # confusion_matrix()

In [None]:
# Load every classifier type used in the comparison, without loader messages.
KNNClassifier = @load KNNClassifier verbosity=0
LDA = @load LDA verbosity=0
NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux verbosity=0
MultinomialClassifier = @load MultinomialClassifier verbosity=0
DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree verbosity=0
RandomForestClassifier = @load RandomForestClassifier pkg=DecisionTree verbosity=0
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
XGBoostClassifier = @load XGBoostClassifier pkg=XGBoost verbosity=0

# Candidate models. The order matters: the confusion matrices collected later
# are looked up by position in this list.
model_list = [
    KNNClassifier(K = 5),  # predictions from the 5 nearest neighbours
    LDA(),
    NeuralNetworkClassifier(epochs = 50),
    MultinomialClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticClassifier(),
    XGBoostClassifier(),
]

In [None]:
# Train every candidate classifier on the training rows and score it on the
# held-out test rows.
#
# Start from empty accumulators so that re-running this cell does not append a
# second set of metrics (which would break the results table, whose Model
# column has exactly one entry per model).
foreach(empty!, (acc, pre, rec, f1s, mat))

for clf in model_list
    # Same seed for every model so stochastic learners are repeatable.
    Random.seed!(206)

    # Standardise the predictors, then fit the classifier. Loop-specific names
    # are used so the notebook-level `model` and `mach` are not rebound here.
    pipe = Pipeline(Standardizer(), clf)
    fitted = machine(pipe, X, y)
    fit!(fitted, rows = train, verbosity = 0)

    # MLJ.predict() may give probabilistic predictions; mode() collapses each
    # one to a concrete class label. Do this once and reuse it for all metrics.
    yhat = MLJ.predict(fitted, rows = test)
    labels = mode.(yhat)
    truth = y[test]

    # An evaluation metric F() always accepts inputs as F(fitted, observed).
    push!(acc, accuracy(labels, truth))
    push!(pre, multiclass_precision(labels, truth))
    push!(rec, multiclass_recall(labels, truth))
    push!(f1s, f1score(labels, truth))
    push!(mat, ConfusionMatrix(levels = levels(y))(labels, truth))
end

In [None]:
# One row per classifier, in the same order as `model_list`.
results = DataFrame(
    :Model => map(typeof, model_list),
    :Accuracy => acc,
    :Precision => pre,
    :Recall => rec,
    :F1 => f1s,
)

In [None]:
mat[1]  # confusion matrix for model_list[1]: KNNClassifier

In [None]:
mat[2]  # confusion matrix for model_list[2]: LDA

In [None]:
mat[3]  # confusion matrix for model_list[3]: NeuralNetworkClassifier

In [None]:
mat[4]  # confusion matrix for model_list[4]: MultinomialClassifier

In [None]:
mat[5] # confusion matrix for model_list[5]: DecisionTreeClassifier

In [None]:
mat[6] # confusion matrix for model_list[6]: RandomForestClassifier (not Ridge)

In [None]:
mat[7] # confusion matrix for model_list[7]: LogisticClassifier

In [None]:
mat[8] # confusion matrix for model_list[8]: XGBoostClassifier

In [None]:
# Understand the most important predictors by fitting XGBoost directly,
# outside the MLJ pipeline used above.
import XGBoost
# Recode the target labels: "Y" => 1, "N" => 0.
# NOTE(review): `y` is a categorical vector here; confirm the element type
# that `replace` returns is what XGBoost (and the logistic cell below) expect.
y_encode = replace(y,"Y"=>1,"N"=>0)
# Train a booster on the predictor table X and the recoded target,
# using the package's default settings.
b = XGBoost.xgboost((X,y_encode))
# Threshold the booster's predictions on X at 0.5 to get Boolean class calls.
pred = [row>0.5 for row in XGBoost.predict(b,X)]
# 0/1 ground truth taken straight from the data-frame column.
y2= replace(nbadf.ALLSTAR,"Y"=>1,"N"=>0)#test
# In-sample accuracy: the predictions are made on the same rows the booster
# was trained on, so this is not a held-out estimate.
1-mean(abs.(pred.-y2))#Accuracy

# Feature importances reported by XGBoost for the fitted booster.
d = (XGBoost.importance(b))

In [None]:
# Fit a standardise-then-logistic-regression pipeline on all rows and tabulate
# the fitted coefficients.
model = Pipeline(Standardizer(), LogisticClassifier())
mach = machine(model, X, y_encode)
fit!(mach, verbosity = 0)
DataFrame(fitted_params(mach).logistic_classifier.coefs)