In [12]:
# Everything needed in order to run this notebook
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

pd.set_option('display.max_columns', 25)
%matplotlib inline
sns.set_theme(color_codes=True)


# Exploratory Data Analysis (EDA)

First, we take a look at the data provided in the dataset to get an idea of what it contains. After that, we will compute statistical information on the numerical features and create visualizations.

### Reading the CSV file

In [13]:
# Load the box-score CSV into a DataFrame and report its dimensions.
# NOTE(review): this is an absolute path into one user's home directory, so the
# notebook will not run elsewhere as-is — consider a path relative to the repo.
csv_path = '/home/kylep/cs3120/project/NBA-EDA-and-Model-Tuning/data/database_24_25.csv'
nba_stats = pd.read_csv(csv_path)

row_count, column_count = nba_stats.shape
print(f"Our dataset has {row_count} rows and {column_count} columns.")

# Preview the first five rows.
nba_stats.head()

Our dataset has 16512 rows and 25 columns.


Unnamed: 0,Player,Tm,Opp,Res,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,Data
0,Jayson Tatum,BOS,NYK,W,30.3,14,18,0.778,8,11,0.727,1,2,0.5,0,4,4,10,1,1,1,1,37,38.1,2024-10-22
1,Anthony Davis,LAL,MIN,W,37.58,11,23,0.478,1,3,0.333,13,15,0.867,3,13,16,4,1,3,1,1,36,34.0,2024-10-22
2,Derrick White,BOS,NYK,W,26.63,8,13,0.615,6,10,0.6,2,2,1.0,0,3,3,4,1,0,0,1,24,22.4,2024-10-22
3,Jrue Holiday,BOS,NYK,W,30.52,7,9,0.778,4,6,0.667,0,0,0.0,2,2,4,4,1,0,0,2,18,19.5,2024-10-22
4,Miles McBride,NYK,BOS,L,25.85,8,10,0.8,4,5,0.8,2,3,0.667,0,0,0,2,0,0,1,1,22,17.8,2024-10-22


Next, let's take a look at the data type of each column.

In [14]:
# Print a per-column summary of the frame: column name, non-null count and dtype.
nba_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  16512 non-null  object 
 1   Tm      16512 non-null  object 
 2   Opp     16512 non-null  object 
 3   Res     16512 non-null  object 
 4   MP      16512 non-null  float64
 5   FG      16512 non-null  int64  
 6   FGA     16512 non-null  int64  
 7   FG%     16512 non-null  float64
 8   3P      16512 non-null  int64  
 9   3PA     16512 non-null  int64  
 10  3P%     16512 non-null  float64
 11  FT      16512 non-null  int64  
 12  FTA     16512 non-null  int64  
 13  FT%     16512 non-null  float64
 14  ORB     16512 non-null  int64  
 15  DRB     16512 non-null  int64  
 16  TRB     16512 non-null  int64  
 17  AST     16512 non-null  int64  
 18  STL     16512 non-null  int64  
 19  BLK     16512 non-null  int64  
 20  TOV     16512 non-null  int64  
 21  PF      16512 non-null  int64  
 22

Next, we are going to check if the dataset has any null entries.

In [15]:
# Count the missing values in every column; a column with 0 has no gaps.
null_count_per_column = nba_stats.isna().sum()
print(null_count_per_column)

Player    0
Tm        0
Opp       0
Res       0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%       0
FT        0
FTA       0
FT%       0
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
GmSc      0
Data      0
dtype: int64


As we can see, we have no missing entries. Next, let's select all the columns that hold integer or float values. We will then compute statistical information for those columns.

In [16]:
# dtype names treated as numeric: 16/32/64-bit integers followed by floats.
numerics = [f"{kind}{bits}" for kind in ("int", "float") for bits in (16, 32, 64)]

# Keep only the numeric columns, as an independent copy of the original frame.
numeric_columns_only = nba_stats.select_dtypes(include=numerics)
numeric_df = numeric_columns_only.copy()

# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
numeric_df.describe()

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,22.405259,3.848958,8.271742,0.427527,1.250061,3.477955,0.263719,1.572917,2.016897,0.417737,1.027253,3.071827,4.099079,2.453852,0.769985,0.468084,1.273861,1.749152,10.520894,8.575109
std,10.896645,3.255864,6.071434,0.254938,1.523651,3.151726,0.286271,2.233374,2.688863,0.435771,1.371552,2.717525,3.478652,2.613505,0.991039,0.833377,1.428819,1.436416,8.829493,7.821577
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.6
25%,14.4275,1.0,4.0,0.286,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,3.0,2.4
50%,23.43,3.0,7.0,0.444,1.0,3.0,0.25,1.0,1.0,0.333,1.0,2.0,3.0,2.0,0.0,0.0,1.0,2.0,9.0,7.1
75%,31.3225,6.0,12.0,0.571,2.0,5.0,0.5,2.0,3.0,1.0,2.0,4.0,6.0,4.0,1.0,1.0,2.0,3.0,16.0,13.3
max,50.48,22.0,38.0,1.0,10.0,20.0,1.0,18.0,26.0,1.0,12.0,23.0,28.0,22.0,8.0,10.0,11.0,6.0,60.0,54.2
