# NBA AI Exploratory Data Analysis

### Imports and Global Settings

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Render DataFrames without truncating columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Seaborn look-and-feel shared by every plot in the notebook.
sns.set_style("whitegrid")
sns.set_context("notebook")

### Loading Data

In [2]:
# Read the NBA game-level dataset from CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../data/nba_ai/nba_data.csv")

In [3]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,bigdataball_dataset,game_id,date,home_team,home_1q,home_2q,home_3q,home_4q,home_ot1,home_ot2,home_ot3,home_ot4,home_ot5,home_f,home_min,home_fg,home_fga,home_3p,home_3pa,home_ft,home_fta,home_or,home_dr,home_tot,home_a,home_pf,home_st,home_to,home_to_to,home_bl,home_pts,home_poss,home_pace,home_oeff,home_deff,home_team_rest_days,home_opening_spread,opening_total,home_closing_spread,closing_total,home_moneyline,home_starting_lineup,road_team,road_1q,road_2q,road_3q,road_4q,road_ot1,road_ot2,road_ot3,road_ot4,road_ot5,road_f,road_min,road_fg,road_fga,road_3p,road_3pa,road_ft,road_fta,road_or,road_dr,road_tot,road_a,road_pf,road_st,road_to,road_to_to,road_bl,road_pts,road_poss,road_pace,road_oeff,road_deff,road_team_rest_days,road_moneyline,road_starting_lineup,main_ref,crew,day_of_season,home_team_game_num,road_team_game_num,home_eFG%,home_TOV%,home_ORB%,home_FT%,road_eFG%,road_TOV%,road_ORB%,road_FT%,winner,loser,home_wins,home_losses,home_win_pct,road_wins,road_losses,road_win_pct,home_wins_l2w,home_losses_l2w,home_win_pct_l2w,road_wins_l2w,road_losses_l2w,road_win_pct_l2w,home_avg_1q,road_avg_1q,home_avg_1q_l2w,road_avg_1q_l2w,home_avg_2q,road_avg_2q,home_avg_2q_l2w,road_avg_2q_l2w,home_avg_3q,road_avg_3q,home_avg_3q_l2w,road_avg_3q_l2w,home_avg_4q,road_avg_4q,home_avg_4q_l2w,road_avg_4q_l2w,home_avg_ot1,road_avg_ot1,home_avg_ot1_l2w,road_avg_ot1_l2w,home_avg_ot2,road_avg_ot2,home_avg_ot2_l2w,road_avg_ot2_l2w,home_avg_ot3,road_avg_ot3,home_avg_ot3_l2w,road_avg_ot3_l2w,home_avg_ot4,road_avg_ot4,home_avg_ot4_l2w,road_avg_ot4_l2w,home_avg_ot5,road_avg_ot5,home_avg_ot5_l2w,road_avg_ot5_l2w,home_avg_f,road_avg_f,home_avg_f_l2w,road_avg_f_l2w,home_avg_min,road_avg_min,home_avg_min_l2w,road_avg_min_l2w,home_avg_fg,road_avg_fg,home_avg_fg_l2w,road_avg_fg_l2w,home_avg_fga,road_avg_fga,home_avg_fga_l2w,road_avg_fga_l2w,home_avg_3p,road_avg_3p,home_avg_3p_l2w,road_avg_3p_l2w,home_avg_3pa,road_avg_3pa,home_avg_3pa_l2w,road_avg_3pa_l2w,home_avg_ft,road_avg_ft,home_av
g_ft_l2w,road_avg_ft_l2w,home_avg_fta,road_avg_fta,home_avg_fta_l2w,road_avg_fta_l2w,home_avg_or,road_avg_or,home_avg_or_l2w,road_avg_or_l2w,home_avg_dr,road_avg_dr,home_avg_dr_l2w,road_avg_dr_l2w,home_avg_tot,road_avg_tot,home_avg_tot_l2w,road_avg_tot_l2w,home_avg_a,road_avg_a,home_avg_a_l2w,road_avg_a_l2w,home_avg_pf,road_avg_pf,home_avg_pf_l2w,road_avg_pf_l2w,home_avg_st,road_avg_st,home_avg_st_l2w,road_avg_st_l2w,home_avg_to,road_avg_to,home_avg_to_l2w,road_avg_to_l2w,home_avg_to_to,road_avg_to_to,home_avg_to_to_l2w,road_avg_to_to_l2w,home_avg_bl,road_avg_bl,home_avg_bl_l2w,road_avg_bl_l2w,home_avg_pts,road_avg_pts,home_avg_pts_l2w,road_avg_pts_l2w,home_avg_poss,road_avg_poss,home_avg_poss_l2w,road_avg_poss_l2w,home_avg_pace,road_avg_pace,home_avg_pace_l2w,road_avg_pace_l2w,home_avg_oeff,road_avg_oeff,home_avg_oeff_l2w,road_avg_oeff_l2w,home_avg_deff,road_avg_deff,home_avg_deff_l2w,road_avg_deff_l2w,home_avg_eFG%,road_avg_eFG%,home_avg_eFG%_l2w,road_avg_eFG%_l2w,home_avg_TOV%,road_avg_TOV%,home_avg_TOV%_l2w,road_avg_TOV%_l2w,home_avg_ORB%,road_avg_ORB%,home_avg_ORB%_l2w,road_avg_ORB%_l2w,home_avg_FT%,road_avg_FT%,home_avg_FT%_l2w,road_avg_FT%_l2w,home_avg_pts_allowed,road_avg_pts_allowed,home_avg_pts_allowed_l2w,road_avg_pts_allowed_l2w,REG_TARGET,CLS_TARGET,CLS_TARGET_closing_spread,REG_TARGET_OU,CLS_TARGET_OU_OPEN,CLS_TARGET_OU_CLOSE
0,NBA 2021-2022 Regular Season,22100001,2021-10-19,Milwaukee,37,29,31,30,,,,,,127,240,48,105,17,45,14,18,13,41,54,25,19,8,7,8,9,127,102.843098,102.843098,123.489085,101.12492,3+,-1.5,240.5,-2.0,234.0,-126,"Khris Middleton,Giannis Antetokounmpo,Brook Lo...",Brooklyn,25,34,26,19,,,,,,104,240,37,84,17,32,13,23,5,39,44,19,17,3,12,13,9,104,102.843098,102.843098,101.12492,123.489085,3+,105,"Kevin Durant,Blake Griffin,Nic Claxton,Joe Har...",Josh Tiven,"Jacyn Goble,Natalie Sago",1,1,1,0.538095,0.066159,0.245283,0.133333,0.541667,0.121359,0.108696,0.154762,Milwaukee,Brooklyn,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,True,True,231,False,False
1,NBA 2021-2022 Regular Season,22100002,2021-10-19,LA Lakers,34,25,26,29,,,,,,114,240,45,95,15,42,9,19,5,40,45,21,25,7,17,18,4,114,113.282595,113.282595,100.633288,106.812525,3+,-5.5,230.5,-3.0,226.5,-154,"LeBron James,Anthony Davis,DeAndre Jordan,Kent...",Golden State,32,21,30,38,,,,,,121,240,41,93,14,39,25,30,9,41,50,30,18,9,17,17,2,121,113.282595,113.282595,106.812525,100.633288,3+,130,"Andrew Wiggins,Draymond Green,Kevon Looney,Jor...",Sean Wright,"Mark Lindsay,Ray Acosta",1,1,1,0.552632,0.148319,0.111111,0.094737,0.516129,0.137987,0.2,0.268817,Golden State,LA Lakers,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7,False,False,235,True,True
2,NBA 2021-2022 Regular Season,22100013,2021-10-20,Portland,23,26,36,36,,,,,,121,240,45,93,12,35,19,22,9,40,49,25,22,5,12,13,5,121,105.079957,105.079957,115.150408,118.005377,3+,-5.5,231.5,-6.5,234.0,-255,"Norman Powell,Robert Covington,Jusuf Nurkic,CJ...",Sacramento,24,38,38,24,,,,,,124,240,42,92,17,41,23,29,7,36,43,24,22,6,10,10,4,124,105.079957,105.079957,118.005377,115.150408,3+,208,"Harrison Barnes,Maurice Harkless,Richaun Holme...",Sean Wright,"Nick Buchert,Phenizee Ransom",2,1,1,0.548387,0.112379,0.209302,0.204301,0.548913,0.087138,0.148936,0.25,Sacramento,Portland,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3,False,False,245,True,True
3,NBA 2021-2022 Regular Season,22100012,2021-10-20,Phoenix,20,38,24,16,,,,,,98,240,36,87,14,37,12,17,11,34,45,23,18,9,18,18,3,98,99.68525,99.68525,98.309429,110.347318,3+,-6.5,224.5,-6.0,224.0,-240,"Mikal Bridges,Jae Crowder,Deandre Ayton,Devin ...",Denver,26,25,34,25,,,,,,110,240,44,83,17,39,5,9,6,40,46,25,20,9,17,19,1,110,99.68525,99.68525,110.347318,98.309429,3+,190,"Michael Porter Jr.,Aaron Gordon,Nikola Jokic,W...",Leon Wood,"Kevin Cutler,Marc Davis",2,1,1,0.494253,0.160028,0.211538,0.137931,0.63253,0.179313,0.146341,0.060241,Denver,Phoenix,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12,False,False,208,False,False
4,NBA 2021-2022 Regular Season,22100011,2021-10-20,Utah,27,27,29,24,,,,,,107,240,40,91,14,47,13,15,12,41,53,18,19,6,10,10,5,107,94.965313,94.965313,112.672718,90.559381,3+,-11.5,222.0,-13.5,221.5,-1428,"Bojan Bogdanovic,Royce O'Neale,Rudy Gobert,Don...",Oklahoma City,18,24,21,23,,,,,,86,240,34,91,7,35,11,18,15,35,50,19,15,4,14,15,2,86,94.965313,94.965313,90.559381,112.672718,3+,790,"Luguentz Dort,Darius Bazley,Derrick Favors,Jos...",Zach Zarba,"Mark Lindsay,Ray Acosta",2,1,1,0.516484,0.092937,0.26087,0.142857,0.412088,0.131671,0.267857,0.120879,Utah,Oklahoma City,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,True,True,193,False,False


## Feature Selection

In [4]:
def feature_selection_analysis(df, cls_target, reg_target, feature_cols):
    """
    Perform feature selection analysis on a given dataframe.

    Parameters:
    df (DataFrame): The dataframe containing the features and target variables.
    cls_target (str): The name of the classification target column.
    reg_target (str): The name of the regression target column.
    feature_cols (list): A list of column names representing the features to be evaluated.

    Returns:
    DataFrame: A dataframe with features ranked based on different metrics.

    The function calculates the following metrics for feature selection:
    - Point-biserial Correlation: Measures the correlation between a binary classification target and continuous features.
    - Pearson Correlation: Measures the linear correlation between a continuous regression target and continuous features.
    - Mutual Information (Classification/Regression): Quantifies the amount of information obtained about one random variable through observing the other random variable.
    - Random Forest Feature Importance: The importance of features as determined by a Random Forest Classifier/Regressor.
    """

    # Extract the features for analysis
    features = [col for col in feature_cols]

    # Calculate point-biserial correlation for classification
    cls_correlations = df[features].corrwith(df[cls_target]).abs()

    # Calculate Pearson correlation for regression
    reg_correlations = df[features].corrwith(df[reg_target]).abs()

    # Compute mutual information for classification and regression
    mi_classif = mutual_info_classif(df[features], df[cls_target])
    mi_reg = mutual_info_regression(df[features], df[reg_target])

    # Feature importance using RandomForest for classification and regression
    rf_classif = RandomForestClassifier(n_estimators=50, random_state=1)
    rf_classif.fit(df[features], df[cls_target])
    classif_importance = rf_classif.feature_importances_

    rf_reg = RandomForestRegressor(n_estimators=50, random_state=1)
    rf_reg.fit(df[features], df[reg_target])
    reg_importance = rf_reg.feature_importances_

    # Combine results into a DataFrame
    feature_selection_df = pd.DataFrame(
        {
            "Feature": features,
            "Point-Biserial Correlation (Classification)": cls_correlations.values,
            "Pearson Correlation (Regression)": reg_correlations.values,
            "Mutual Information (Classification)": mi_classif,
            "Mutual Information (Regression)": mi_reg,
            "Random Forest Feature Importance (Classification)": classif_importance,
            "Random Forest Feature Importance (Regression)": reg_importance,
        }
    )

    # Round values to 2 decimals
    for col in feature_selection_df.columns[1:]:  # Exclude 'Feature' column
        feature_selection_df[col] = feature_selection_df[col].round(2)

    # Rank features for each metric, using 'min' method for tie-breaking
    for metric in feature_selection_df.columns[1:]:
        rank_col = f"Rank - {metric}"
        feature_selection_df[rank_col] = feature_selection_df[metric].rank(
            ascending=False, method="min"
        )

    # Calculate the average rank
    rank_columns = [
        col for col in feature_selection_df.columns if col.startswith("Rank")
    ]
    feature_selection_df["Average Rank"] = (
        feature_selection_df[rank_columns].mean(axis=1).round(2)
    )

    # Sort by average rank
    feature_selection_df.sort_values("Average Rank", inplace=True)

    return feature_selection_df

In [5]:
# Betting-market features (spreads, totals, moneylines).
# NOTE(review): "road_opening_spread" and "road_closing_spread" are not among the
# columns shown in the df.head() preview; they need to be derived or dropped
# before this list is used to index `df` — confirm.
betting_feature_set = [
    "home_opening_spread",
    "road_opening_spread",
    "opening_total",
    "home_closing_spread",
    "road_closing_spread",
    "closing_total",
    "home_moneyline",
    "road_moneyline",
]

# Numeric pre-game features: running averages for each team, plus the same
# columns with an "_l2w" suffix.
base_feature_set = [
    "day_of_season",
    "home_win_pct",
    "road_win_pct",
    "home_win_pct_l2w",
    "road_win_pct_l2w",
    "home_avg_pts",
    "road_avg_pts",
    "home_avg_pts_l2w",
    "road_avg_pts_l2w",
    "home_avg_oeff",
    "road_avg_oeff",
    "home_avg_oeff_l2w",
    "road_avg_oeff_l2w",
    "home_avg_deff",
    "road_avg_deff",
    "home_avg_deff_l2w",
    "road_avg_deff_l2w",
    "home_avg_eFG%",
    "road_avg_eFG%",
    "home_avg_eFG%_l2w",
    "road_avg_eFG%_l2w",
    "home_avg_TOV%",
    "road_avg_TOV%",
    "home_avg_TOV%_l2w",
    "road_avg_TOV%_l2w",
    "home_avg_ORB%",
    "road_avg_ORB%",
    "home_avg_ORB%_l2w",
    "road_avg_ORB%_l2w",
    "home_avg_FT%",
    "road_avg_FT%",
    "home_avg_FT%_l2w",
    "road_avg_FT%_l2w",
    "home_avg_pts_allowed",
    "road_avg_pts_allowed",
    "home_avg_pts_allowed_l2w",
    "road_avg_pts_allowed_l2w",
]

# Columns that hold text in the preview (team names, "3+" rest days, player
# lists) and so need preparation before they can be evaluated numerically.
# The lineup columns are named "home_starting_lineup" / "road_starting_lineup"
# in the loaded data (there is no "_team" infix).
features_to_prepare = [
    "home_team",
    "road_team",
    "home_team_rest_days",
    "road_team_rest_days",
    "home_starting_lineup",
    "road_starting_lineup",
]

In [6]:
# Score the baseline feature list against both targets.
features_to_evaluate = base_feature_set

feature_selection_analysis_df = feature_selection_analysis(
    df=df,
    cls_target="CLS_TARGET",
    reg_target="REG_TARGET",
    feature_cols=features_to_evaluate,
)

In [7]:
# Show the ranking table; rows are ordered by "Average Rank", lowest (best) first.
feature_selection_analysis_df

Unnamed: 0,Feature,Point-Biserial Correlation (Classification),Pearson Correlation (Regression),Mutual Information (Classification),Mutual Information (Regression),Random Forest Feature Importance (Classification),Random Forest Feature Importance (Regression),Rank - Point-Biserial Correlation (Classification),Rank - Pearson Correlation (Regression),Rank - Mutual Information (Classification),Rank - Mutual Information (Regression),Rank - Random Forest Feature Importance (Classification),Rank - Random Forest Feature Importance (Regression),Average Rank
33,home_avg_pts_allowed,0.03,0.1,0.02,0.02,0.03,0.03,6.0,5.0,1.0,13.0,1.0,5.0,5.17
13,home_avg_deff,0.02,0.09,0.01,0.03,0.03,0.04,11.0,7.0,4.0,7.0,1.0,3.0,5.5
35,home_avg_pts_allowed_l2w,0.02,0.1,0.01,0.04,0.03,0.02,11.0,5.0,4.0,3.0,1.0,20.0,7.33
26,road_avg_ORB%,0.03,0.02,0.01,0.05,0.03,0.03,6.0,27.0,4.0,2.0,1.0,5.0,7.5
21,home_avg_TOV%,0.02,0.06,0.02,0.0,0.03,0.03,11.0,9.0,1.0,29.0,1.0,5.0,9.33
15,home_avg_deff_l2w,0.02,0.08,0.01,0.02,0.03,0.02,11.0,8.0,4.0,13.0,1.0,20.0,9.5
31,home_avg_FT%_l2w,0.04,0.02,0.0,0.03,0.03,0.03,4.0,27.0,15.0,7.0,1.0,5.0,9.83
28,road_avg_ORB%_l2w,0.05,0.03,0.0,0.02,0.03,0.03,2.0,23.0,15.0,13.0,1.0,5.0,9.83
8,road_avg_pts_l2w,0.01,0.05,0.02,0.03,0.03,0.02,23.0,10.0,1.0,7.0,1.0,20.0,10.33
25,home_avg_ORB%,0.05,0.01,0.0,0.03,0.03,0.03,2.0,34.0,15.0,7.0,1.0,5.0,10.67
