In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from numpy.linalg import LinAlgError
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [1]:
import os

import pandas as pd
import statsmodels.api as sm

# Leave-one-factor-out logistic regressions, one set per season.
# For each season and each factor, `won` is regressed on the remaining
# factors and the model's pseudo R-squared (statsmodels `prsquared`) is
# stored. The resulting table has factors as rows and seasons as columns and
# is written to 'pseudo_r_squared_cleansed.csv'.

# Location of the cleansed game log. The default is the original hard-coded
# path; set the NBA_GAMES_CSV environment variable to run this cell on
# another machine without editing it.
DATA_PATH = os.environ.get(
    'NBA_GAMES_CSV',
    '/Users/moneysniper/Documents/NBA_analysis_project/gamelogs/nba_games_cleansed.csv',
)

# The factors we want to test
factors = ['diff_orb', 'diff_tov', 'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa']

# Read in the cleansed data
df = pd.read_csv(DATA_PATH)

# Filter down to home games and drop rows missing the columns of interest
cols_to_select = [
    'home', 'season', 'won', 'diff_pts', 'diff_orb', 'diff_tov',
    'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa'
]
df = (
    df[df["home"] == 1]                # Keep only home games
    .reset_index(drop=True)[cols_to_select]
    .dropna(subset=cols_to_select)     # Drop rows with NaNs in the relevant columns
)

# Group the cleansed data by season
grouped = df.groupby('season')

# season -> {excluded factor -> pseudo R-squared of the model fitted without it}
pseudo_r_squared = {}

for season_name, group in grouped:
    # No per-group dropna is needed: every factor and 'won' are part of
    # cols_to_select, so rows with NaNs in them were already removed above.
    pseudo_r_squared[season_name] = {}

    # The response is identical for every reduced model of this season, so it
    # is extracted once instead of once per excluded factor.
    y = group['won']

    # Exclude each factor one at a time
    for excluded_factor in factors:
        # Create a list of factors that excludes the current factor
        factors_subset = [f for f in factors if f != excluded_factor]

        # Remaining factors plus a constant (intercept) column
        X = sm.add_constant(group[factors_subset])

        # Fit the logistic regression; disp=0 suppresses the optimizer's
        # iteration printout
        model = sm.Logit(y, X)
        result = model.fit(disp=0)

        # Print the model summary (optional)
        print(result.summary(title=f'Season {season_name} - Excluding {excluded_factor}'))

        # Store the pseudo R-squared for this model
        pseudo_r_squared[season_name][excluded_factor] = result.prsquared

# Convert the dictionary of results to a DataFrame (rows: factors, columns: seasons)
pseudo_r_squared_df = pd.DataFrame(pseudo_r_squared)
pseudo_r_squared_df.to_csv('pseudo_r_squared_cleansed.csv', index=True)

# Also display or inspect the DataFrame in Python
print(pseudo_r_squared_df)

                       Season 2016 - Excluding diff_orb                       
Dep. Variable:                    won   No. Observations:                 1316
Model:                          Logit   Df Residuals:                     1309
Method:                           MLE   Df Model:                            6
Date:                Thu, 06 Mar 2025   Pseudo R-squ.:                  0.6160
Time:                        15:11:32   Log-Likelihood:                -341.22
converged:                       True   LL-Null:                       -888.67
Covariance Type:            nonrobust   LLR p-value:                2.625e-233
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2335      0.099      2.355      0.018       0.039       0.428
diff_tov      -0.2630      0.023    -11.196      0.000      -0.309      -0.217
diff_3p%       0.1855      0.012     15.861      0.0

In [3]:
import os

import pandas as pd
import statsmodels.api as sm

# Per-season contribution of each factor, measured as the drop in pseudo
# R-squared (statsmodels `prsquared`) when that factor is removed from the
# full logistic model of `won`. The resulting table has factors as rows and
# seasons as columns and is written to 'pseudo_r_squared_diff.csv'.

# Location of the cleansed game log. The default is the original hard-coded
# path; set the NBA_GAMES_CSV environment variable to run this cell on
# another machine without editing it.
DATA_PATH = os.environ.get(
    'NBA_GAMES_CSV',
    '/Users/moneysniper/Documents/NBA_analysis_project/gamelogs/nba_games_cleansed.csv',
)

# The factors we want to test
factors = ['diff_orb', 'diff_tov', 'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa']

# Read in the cleansed data
df = pd.read_csv(DATA_PATH)

# Filter down to home games and drop rows missing the columns of interest
cols_to_select = [
    'home', 'season', 'won', 'diff_pts', 'diff_orb', 'diff_tov',
    'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa'
]
df = (
    df[df["home"] == 1]
    .reset_index(drop=True)[cols_to_select]
    .dropna(subset=cols_to_select)
)

# Group the cleansed data by season
grouped = df.groupby('season')

# season -> {excluded factor -> full-model pseudo-R^2 minus reduced-model pseudo-R^2}
pseudo_r_squared_diff = {}

for season_name, group in grouped:
    # No per-group dropna is needed: every factor and 'won' are part of
    # cols_to_select, so rows with NaNs in them were already removed above.

    # 1) Fit the "full" model with all factors (plus an intercept column)
    X_full = sm.add_constant(group[factors])
    y = group['won']
    full_model = sm.Logit(y, X_full).fit(disp=0)
    full_model_pr2 = full_model.prsquared

    # Prepare to store differences for this season
    pseudo_r_squared_diff[season_name] = {}

    # 2) For each factor, remove it and fit the "reduced" model
    for excluded_factor in factors:
        # Subset of factors that excludes the current factor
        reduced_factors = [f for f in factors if f != excluded_factor]

        X_reduced = sm.add_constant(group[reduced_factors])
        reduced_model = sm.Logit(y, X_reduced).fit(disp=0)
        reduced_model_pr2 = reduced_model.prsquared

        # 3) Difference in pseudo-R^2: full minus reduced
        pr2_diff = full_model_pr2 - reduced_model_pr2
        pseudo_r_squared_diff[season_name][excluded_factor] = pr2_diff

# Convert the dictionary of differences to a DataFrame (rows: factors, columns: seasons)
pseudo_r_squared_diff_df = pd.DataFrame(pseudo_r_squared_diff)

# Optionally, save to CSV
pseudo_r_squared_diff_df.to_csv('pseudo_r_squared_diff.csv', index=True)

# Print or inspect the results
print(pseudo_r_squared_diff_df)

              2016      2017      2018      2019      2020      2021  \
diff_orb  0.175430  0.146205  0.167032  0.158220  0.160648  0.128738   
diff_tov  0.160119  0.169243  0.181612  0.148361  0.173978  0.152386   
diff_3p%  0.478730  0.492192  0.513918  0.511672  0.540083  0.548000   
diff_2p%  0.507536  0.453106  0.523178  0.425223  0.464275  0.428764   
diff_ft%  0.085123  0.046407  0.061818  0.073270  0.063835  0.053987   
diff_fta  0.110276  0.091083  0.093997  0.085737  0.102303  0.071925   
diff_3pa  0.001327  0.000064  0.001087  0.000006  0.000057  0.000022   

              2022      2023      2024  
diff_orb  0.171716  0.189833  0.154112  
diff_tov  0.165838  0.198012  0.151326  
diff_3p%  0.567491  0.568607  0.519635  
diff_2p%  0.450816  0.470622  0.478417  
diff_ft%  0.066116  0.082106  0.052802  
diff_fta  0.080166  0.087963  0.074570  
diff_3pa  0.000050  0.000167  0.001372  


In [4]:
import os

import pandas as pd
import statsmodels.api as sm

# Leave-one-factor-out logistic regressions on all seasons pooled together.
# For each factor, `won` is regressed on the remaining factors and the
# model's pseudo R-squared (statsmodels `prsquared`) is stored. The result is
# a one-column table indexed by the excluded factor, written to
# 'pseudo_r_squared_cleansed_total.csv'.

# Location of the cleansed game log. The default is the original hard-coded
# path; set the NBA_GAMES_CSV environment variable to run this cell on
# another machine without editing it.
DATA_PATH = os.environ.get(
    'NBA_GAMES_CSV',
    '/Users/moneysniper/Documents/NBA_analysis_project/gamelogs/nba_games_cleansed.csv',
)

# The factors we want to test
factors = ['diff_orb', 'diff_tov', 'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa']

# Read in the cleansed data
df = pd.read_csv(DATA_PATH)

# Filter down to home games and drop rows missing the columns of interest.
# cols_to_select already contains every factor and 'won', so this single
# dropna leaves no missing values in the model inputs; no second pass is needed.
cols_to_select = [
    'home', 'season', 'won', 'diff_pts', 'diff_orb', 'diff_tov',
    'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa'
]
df = (
    df[df["home"] == 1]                  # Keep only home games
    .reset_index(drop=True)[cols_to_select]
    .dropna(subset=cols_to_select)       # Drop rows with NaNs in the relevant columns
)

# excluded factor -> pseudo-R^2 of the model fitted without it
pseudo_r_squared = {}

# The response is the same for every reduced model, so it is extracted once
# instead of once per excluded factor.
y = df['won']

# Fit a logistic regression model for each factor excluded
for excluded_factor in factors:
    # Create a list of factors that excludes the current factor
    factors_subset = [f for f in factors if f != excluded_factor]

    # Remaining factors plus a constant (intercept) column
    X = sm.add_constant(df[factors_subset])

    # Fit the logistic regression; disp=0 suppresses the optimizer's
    # iteration printout
    model = sm.Logit(y, X)
    result = model.fit(disp=0)

    # Print summary (optional)
    print(result.summary(title=f'Excluding {excluded_factor}'))

    # Store the pseudo R-squared for this model
    pseudo_r_squared[excluded_factor] = result.prsquared

# Convert the dictionary of results to a DataFrame (one row per excluded factor)
pseudo_r_squared_df = pd.DataFrame.from_dict(pseudo_r_squared, orient='index', columns=['Pseudo_R2_Excluded'])
pseudo_r_squared_df.to_csv('pseudo_r_squared_cleansed_total.csv', index=True)

print(pseudo_r_squared_df)

                              Excluding diff_orb                              
Dep. Variable:                    won   No. Observations:                11508
Model:                          Logit   Df Residuals:                    11501
Method:                           MLE   Df Model:                            6
Date:                Thu, 06 Mar 2025   Pseudo R-squ.:                  0.6163
Time:                        15:42:48   Log-Likelihood:                -3017.0
converged:                       True   LL-Null:                       -7863.5
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1842      0.033      5.596      0.000       0.120       0.249
diff_tov      -0.2815      0.008    -34.301      0.000      -0.298      -0.265
diff_3p%       0.2388      0.005     47.597      0.0

In [5]:
import os

import pandas as pd
import statsmodels.api as sm

# Contribution of each factor on all seasons pooled together, measured as the
# drop in pseudo R-squared (statsmodels `prsquared`) when that factor is
# removed from the full logistic model of `won`. The result has one row per
# excluded factor and is written to 'pseudo_r_squared_diff_total.csv'.

# Location of the cleansed game log. The default is the original hard-coded
# path; set the NBA_GAMES_CSV environment variable to run this cell on
# another machine without editing it.
DATA_PATH = os.environ.get(
    'NBA_GAMES_CSV',
    '/Users/moneysniper/Documents/NBA_analysis_project/gamelogs/nba_games_cleansed.csv',
)

factors = ['diff_orb', 'diff_tov', 'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa']
df = pd.read_csv(DATA_PATH)

# Keep home games only and drop rows with a NaN in any selected column.
# cols_to_select already contains every factor and 'won', so this single
# dropna leaves no missing values in the model inputs; no second pass is needed.
cols_to_select = [
    'home', 'season', 'won', 'diff_pts', 'diff_orb', 'diff_tov',
    'diff_3p%', 'diff_2p%', 'diff_ft%', 'diff_fta', 'diff_3pa'
]
df = (
    df[df["home"] == 1]
    .reset_index(drop=True)[cols_to_select]
    .dropna(subset=cols_to_select)
)

# --- 1) Fit the FULL model with all factors (plus an intercept column)
X_full = df[factors]
y = df['won']
X_full = sm.add_constant(X_full)

full_model = sm.Logit(y, X_full).fit(disp=0)
full_pr2 = full_model.prsquared

# --- 2) Compare each "excluded factor" model
# excluded factor -> full, reduced and delta pseudo-R^2
pseudo_r2_dict = {}

for excluded_factor in factors:
    factors_subset = [f for f in factors if f != excluded_factor]

    X_reduced = sm.add_constant(df[factors_subset])
    reduced_model = sm.Logit(y, X_reduced).fit(disp=0)
    reduced_pr2 = reduced_model.prsquared

    pr2_diff = full_pr2 - reduced_pr2  # how much pseudo-R^2 drops

    pseudo_r2_dict[excluded_factor] = {
        'Full_Pseudo_R2': full_pr2,
        'Reduced_Pseudo_R2': reduced_pr2,
        'Delta_Pseudo_R2': pr2_diff
    }

# One row per excluded factor, one column per metric
pseudo_r2_df = pd.DataFrame.from_dict(pseudo_r2_dict, orient='index')
pseudo_r2_df.to_csv('pseudo_r_squared_diff_total.csv')
print(pseudo_r2_df)

          Full_Pseudo_R2  Reduced_Pseudo_R2  Delta_Pseudo_R2
diff_orb        0.770066           0.616322         0.153744
diff_tov        0.770066           0.611071         0.158995
diff_3p%        0.770066           0.256785         0.513281
diff_2p%        0.770066           0.312994         0.457072
diff_ft%        0.770066           0.706376         0.063690
diff_fta        0.770066           0.681898         0.088168
diff_3pa        0.770066           0.770042         0.000024
