# 1. Data processing
- Collect data
- Impute missing values

In [6]:
import pandas as pd
import os
import plotly.graph_objects as go

# Load the raw dataset from data/nba_logreg.csv (path relative to the working directory).
path_data = os.path.join("data", "nba_logreg.csv")
df = pd.read_csv(path_data)

# Rows whose 3P% has to be (re)derived: the percentage is missing, or the
# player never attempted a 3-point shot (3PA == 0).
mask = df["3P%"].isnull() | (df["3PA"] == 0)

# Vectorised recomputation for every row: made / attempted * 100 where there
# was at least one attempt, 0 otherwise. `.where` keeps the computed value
# where the condition holds and substitutes 0 elsewhere, which also discards
# the inf/NaN produced by dividing by zero attempts.
three_pt_pct = (df["3P Made"] / df["3PA"] * 100).where(df["3PA"] > 0, 0)

# Overwrite only the masked rows. No extra "was imputed" flag column is added:
# 3PA already tells whether the player attempted a 3-point shot.
df.loc[mask, "3P%"] = three_pt_pct[mask]

# 2. EDA
- Check value coherency
- Box Plot
- Feature correlation
- Histogram

In [7]:
# Box plot & Outliers

from plotly.subplots import make_subplots

# Select the numeric columns once and reuse the list for the grid size, the
# subplot titles and the traces (a plain list matches the documented type of
# `subplot_titles`).
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_cols = len(numeric_cols)
cols_per_row = 2
rows = (num_cols + cols_per_row - 1) // cols_per_row  # ceiling division

fig = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=numeric_cols)

# One box trace per numeric column, filling the grid left-to-right, top-to-bottom.
# Plotly subplot coordinates are 1-based, hence the +1.
for i, col in enumerate(numeric_cols):
    row = i // cols_per_row + 1
    col_pos = i % cols_per_row + 1
    fig.add_trace(go.Box(y=df[col], name=col, showlegend=False), row=row, col=col_pos)

# Scale the figure with the grid: 300 px per row, 400 px per column.
fig.update_layout(height=300*rows, width=400*cols_per_row, title_text="Box Plots of Numeric Features")
fig.show()

There are some outliers, but the values are coherent when compared with typical NBA rookie statistics.

In [10]:
# Feature correlation

import plotly.express as px

# Pairwise correlation between the numeric columns (pandas' default method),
# rounded to 4 decimals so the heatmap annotations stay readable.
corr_matrix = df.corr(numeric_only=True).round(4)
corr_matrix  # NOTE(review): only rendered if this is the last expression of its cell

# Heatmap of the correlation matrix. zmin/zmax pin the diverging RdBu scale to
# the full correlation range [-1, 1], so the neutral colour sits at 0 and
# positive/negative correlations are visually symmetric.
fig_corr = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
    title="Feature Correlation Matrix"
)
fig_corr.update_layout(width=1000, height=1000)
fig_corr.show()