# 1. Data processing
- Collect data
- Impute missing values
- Remove duplicates
- Check value coherency

In [1]:
# Standard library
import os

# Data handling
import numpy as np
import pandas as pd

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Modelling
from sklearn.preprocessing import RobustScaler

# Utils
from tools.preprocessing import handle_missing_vals, cap_outliers, handle_duplicates

# Load the raw CSV, drop the player name, then hand the frame to the
# project helper that deals with missing values (it logs "Handling missing values ...").
path_data = os.path.join("data", "nba_dataset.csv")
df = (
    pd.read_csv(path_data)
    .drop(columns="Name")  # irrelevant column
    .pipe(handle_missing_vals)
)

Handling missing values ...


In [2]:
# Duplicates: `target` names the label column and is forwarded to the project helper,
# which reports how many duplicates / quasi-duplicates it removes.
target = "TARGET_5Yrs"
df = df.pipe(handle_duplicates, target)

Removing 12 duplicates
Removing 40 quasi-duplicates


We observe some duplicates and quasi-duplicates (rows with exactly the same features but a different target). Assuming we have no access to the original data, I decide to drop them. <br>
We lose around 4% of the dataset by doing so, but the data is now much cleaner!

# 2. EDA
- Feature correlation
- Outliers
- Histogram

In [3]:
# Feature correlation: pairwise correlation of the int/float columns, rendered as a heatmap.
numeric_cols = df.select_dtypes(include=[int, float]).columns
corr_matrix = df.loc[:, numeric_cols].corr().round(4)

# Build the trace on its own so the figure construction below stays short.
# Colours use the 4-decimal values; the cell annotations are rounded to 2 decimals.
heatmap = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale="RdBu",
    zmin=-1,  # pin the colour range to the full correlation range [-1, 1]
    zmax=1,
    colorbar={"title": "Correlation"},
    text=np.round(corr_matrix.values, 2),
    texttemplate="%{text}",
)

fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Feature Correlation Matrix (from scratch)",
    width=1000,
    height=1000,
)
fig.show()

Many features are highly correlated; looking at their definitions, we can identify some linear dependencies / redundancies that we want to get rid of.

In [4]:
# Handling high correlation: for each suspected linear relation, compute the residual
# between the reported stat and the value rebuilt from its components.
# The "/ 100" treats the percentage columns as values on a 0-100 scale.
made_pct_attempts = [
    ("3P Made", "3P%", "3PA"),
    ("FGM", "FG%", "FGA"),
    ("FTM", "FT%", "FTA"),
]

diffs = {"REB - (OREB + DREB)": df["REB"] - (df["OREB"] + df["DREB"])}
for made, pct, attempts in made_pct_attempts:
    rebuilt = round(df[pct] * df[attempts] / 100, 1)  # rounded to 1 decimal before comparing
    diffs[f"{made} - ({pct} * {attempts} / 100)"] = df[made] - rebuilt

# One histogram per residual on a 2x2 grid.
fig_diff = make_subplots(rows=2, cols=2, subplot_titles=list(diffs))

for position, (label, residual) in enumerate(diffs.items()):
    grid_row, grid_col = divmod(position, 2)  # 0-based grid position; plotly rows/cols are 1-based
    fig_diff.add_trace(
        go.Histogram(x=residual, nbinsx=30, name=label, showlegend=False),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig_diff.update_layout(height=600, width=800, title_text="Distribution of Feature Differences")
fig_diff.show()

Some redundancy is present in this dataset: <br>
1) $OREB + DREB \approx REB$ (approximate due to rounding errors)
2) $3P\% \approx 100 \times \frac{\text{3P Made}}{3PA}$
3) $FG\% \approx 100 \times \frac{FGM}{FGA}$
4) $FT\% \approx 100 \times \frac{FTM}{FTA}$

The plots and relations above help us check that values are coherent (OREB <= REB, 3P Made <= 3PA, ...). <br>
Looking at the distributions, we decide to remove FGM, 3P Made, FTM and DREB.

In [5]:
# Filtering redundancy: drop one column from each linear relation examined above.
# NOTE: this cell is not re-runnable on its own. Once the columns are gone,
# `drop` raises KeyError, so re-run from the data-loading cell instead.
col_to_drop = ["FGM", "3P Made", "FTM", "DREB"]
df = df.drop(columns=col_to_drop)

# Feature correlation (V2), recomputed on the reduced feature set.
corr_matrix = df.corr(numeric_only=True).round(4)

fig_corr = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale="RdBu",
    # Pin the diverging colour scale to [-1, 1] so that 0 sits at its centre and
    # this figure is directly comparable with the first correlation heatmap.
    zmin=-1,
    zmax=1,
    title="Feature Correlation Matrix",
)
fig_corr.update_layout(width=1000, height=1000)
fig_corr.show()

Other correlated features are less easy to interpret: for example, the longer a player stays on the court, the more points he is likely to score. <br>
However, it is possible to conceive of a player who spends a lot of time on the court without scoring (he may contribute assists, blocks, ...): I decide to keep these features as they are.

In [6]:
# Saving results for models non sensitive to outliers:
# this snapshot is written before any outlier capping is applied.
filtered_path = "data/nba_filtered.csv"
df.to_csv(filtered_path, index=False)

In [7]:
# Box plot & Outliers: one box plot per int/float column on a 2-column grid.
# The column selection is computed once and reused for the count, the titles and the loop.
box_cols = list(df.select_dtypes(include=[int, float]).columns)  # numerical columns only
num_cols = len(box_cols)
cols_per_row = 2
rows = (num_cols + cols_per_row - 1) // cols_per_row  # ceiling division

# subplot_titles is documented as a list of str, so pass a plain list rather than a pandas Index.
fig = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=box_cols)

for i, col in enumerate(box_cols):
    row = i // cols_per_row + 1      # plotly grid positions are 1-based
    col_pos = i % cols_per_row + 1
    fig.add_trace(go.Box(y=df[col], name=col, showlegend=False), row=row, col=col_pos)

fig.update_layout(height=300*rows, width=400*cols_per_row, title_text="Box Plots of Numeric Features")
fig.show()

The box plots first help identify incoherent values (e.g. MIN < 0, FG% > 100, ...).<br>
Some outliers are present: I decide to cap them and keep track of the capped values with a label. Given the skewed distributions (plots a few cells below), **interquartile range (IQR)** capping seems a good approach.

In [8]:
# IQR capping, delegated to the project helper `cap_outliers`.
# NOTE(review): `df` is overwritten, so re-running this cell feeds an already-capped
# frame back into the helper. Re-run from the top to reproduce the results.
target = "TARGET_5Yrs"
df = cap_outliers(df, target)

# Box plot of the capped frame: one box plot per int/float column on a 2-column grid.
# The column selection is computed once and reused for the count, the titles and the loop.
box_cols = list(df.select_dtypes(include=[int, float]).columns)  # numerical columns only
num_cols = len(box_cols)
cols_per_row = 2
rows = (num_cols + cols_per_row - 1) // cols_per_row  # ceiling division

# subplot_titles is documented as a list of str, so pass a plain list rather than a pandas Index.
fig = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=box_cols)

for i, col in enumerate(box_cols):
    row = i // cols_per_row + 1      # plotly grid positions are 1-based
    col_pos = i % cols_per_row + 1
    fig.add_trace(go.Box(y=df[col], name=col, showlegend=False), row=row, col=col_pos)

fig.update_layout(height=300*rows, width=400*cols_per_row, title_text="Box Plots of Numeric Features")
fig.show()

After manually validating feature coherence for this dataset, note that a model in production should always check the coherence of its input features.

In [9]:
# Saving second version of the dataset for models sensitive to outliers:
# this snapshot is written after the outlier-capping step.
capped_path = "data/nba_filtered_capped.csv"
df.to_csv(capped_path, index=False)

In [10]:
# Plot feature distributions using subplots: one histogram per float64/int64 column
# on a 2-column grid.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
cols_per_row = 2
rows = (len(numeric_cols) + cols_per_row - 1) // cols_per_row  # ceiling division

# subplot_titles is documented as a list of str, so pass a plain list rather than a pandas Index.
fig_dist = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=list(numeric_cols))

for i, col in enumerate(numeric_cols):
    row = i // cols_per_row + 1      # plotly grid positions are 1-based
    col_pos = i % cols_per_row + 1
    fig_dist.add_trace(
        go.Histogram(x=df[col], name=col, showlegend=False),
        row=row, col=col_pos
    )

fig_dist.update_layout(height=300*rows, width=400*cols_per_row, title_text="Feature Distributions")
fig_dist.show()

We observe skewed distributions: a robust scaler is a good approach to scale the data.<br>
Classes are imbalanced: a dummy classifier can be a good baseline to compare against, and the models' class weights need to be balanced according to the target class proportions.

In [None]:
# Scale every column of `df` with RobustScaler (imported in the notebook's import cell).
# NOTE(review): nothing in the visible cells removes the target column from `df`, so it
# appears to be scaled along with the features. That is fine for the distribution plots
# that index X_scaled by df.columns, but confirm and exclude it before training a model.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(df.values)

# Show the scaled array's shape as the cell output (rows, columns).
X_scaled.shape

(1288, 31)

In [20]:
# Histograms of the scaled data: X_scaled[:, i] is the scaled version of df.columns[i],
# so titles, traces and the grid size are all derived from the same column list.
scaled_cols = list(df.columns)
n_cols = len(scaled_cols)
# Guard against hidden state: X_scaled must have been computed from the current df.
assert X_scaled.shape[1] == n_cols, "X_scaled is out of sync with df; re-run the scaling cell"
cols_per_row = 2
rows = (n_cols + cols_per_row - 1) // cols_per_row  # ceiling division

# Titles come from df.columns, not from `numeric_cols`: that variable is a dtype-filtered
# selection made in another cell and need not match the columns plotted here.
fig_dist = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=scaled_cols)

for i, col in enumerate(scaled_cols):
    row = i // cols_per_row + 1      # plotly grid positions are 1-based
    col_pos = i % cols_per_row + 1
    fig_dist.add_trace(
        go.Histogram(x=X_scaled[:,i], name=col, showlegend=False),
        row=row, col=col_pos
    )

fig_dist.update_layout(height=300*rows, width=400*cols_per_row, title_text="Scaled Feature Distributions")
fig_dist.show()