In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from numpy.linalg import LinAlgError
pd.set_option('display.max_columns', None)

In [7]:
import pandas as pd

# Load the data
df = pd.read_csv('/Users/moneysniper/Documents/NBA_analysis_project/gamelogs/nba_games_cleansed.csv')

# Define the columns to select
cols_to_select = [
    'home', 'season', 'won', 'diff_pts', 'orb', 'orb_opp', 'diff_orb', 'tov', 'tov_opp', 'diff_tov', '3p%', '3p%_opp', 'diff_3p%', '2p%', '2p%_opp', 'diff_2p%', 'ft%', 'ft%_opp', 'diff_ft%', 'fta', 'fta_opp', 'diff_fta', '3pa', '3pa_opp', 'diff_3pa'
]

# Filter the DataFrame for home games and drop rows with null values in the selected columns
df = df[df["home"] == 1].reset_index(drop=True)[cols_to_select].dropna(subset=cols_to_select)

# Create a DataFrame for winning team stats
winning_stats_1 = df[df['won'] == 1][[
    'orb', 'tov', '3p%', '2p%', 'ft%', 'fta', '3pa'
]].rename(columns={
    'orb': 'winning_orb', 'tov': 'winning_tov', '3p%': 'winning_3p%', '2p%': 'winning_2p%', 'ft%': 'winning_ft%', 'fta': 'winning_fta', '3pa': 'winning_3pa'
})

# Create a DataFrame for losing team stats
losing_stats_1 = df[df['won'] == 0][[
    'orb_opp', 'tov_opp', '3p%_opp', '2p%_opp', 'ft%_opp', 'fta_opp', '3pa_opp'
]].rename(columns={
    'orb_opp': 'winning_orb', 'tov_opp': 'winning_tov', '3p%_opp': 'winning_3p%', '2p%_opp': 'winning_2p%', 'ft%_opp': 'winning_ft%', 'fta_opp': 'winning_fta', '3pa_opp': 'winning_3pa'
})

# Combine winning and losing stats
winning_stats = pd.concat([winning_stats_1, losing_stats_1])

# Create a DataFrame for losing team stats
losing_stats_2 = df[df['won'] == 1][[
    'orb_opp', 'tov_opp', '3p%_opp', '2p%_opp', 'ft%_opp', 'fta_opp', '3pa_opp'
]].rename(columns={
    'orb_opp': 'losing_orb', 'tov_opp': 'losing_tov', '3p%_opp': 'losing_3p%', '2p%_opp': 'losing_2p%', 'ft%_opp': 'losing_ft%', 'fta_opp': 'losing_fta', '3pa_opp': 'losing_3pa'
})

# Create a DataFrame for winning team stats
winning_stats_2 = df[df['won'] == 0][[
    'orb', 'tov', '3p%', '2p%', 'ft%', 'fta', '3pa'
]].rename(columns={
    'orb': 'losing_orb', 'tov': 'losing_tov', '3p%': 'losing_3p%', '2p%': 'losing_2p%', 'ft%': 'losing_ft%', 'fta': 'losing_fta', '3pa': 'losing_3pa'
})

# Combine losing and winning stats
losing_stats = pd.concat([losing_stats_2, winning_stats_2])

winning_stats.columns = [col.replace('winning_', '') for col in winning_stats.columns]
losing_stats.columns = [col.replace('losing_', '') for col in losing_stats.columns]

# Calculate the average statistics for winning and losing teams
winning_avg = winning_stats.mean()
losing_avg = losing_stats.mean()

# Calculate the difference between winning and losing team statistics
difference_avg = winning_avg - losing_avg

# Combine the results into a single DataFrame
result_df = pd.DataFrame({
    'Winning Team Average': winning_avg,
    'Losing Team Average': losing_avg,
    'Difference': difference_avg
})

# Display the result
result_df

Unnamed: 0,Winning Team Average,Losing Team Average,Difference
orb,10.119395,10.254345,-0.13495
tov,12.94569,13.781022,-0.835332
3p%,38.868917,32.717918,6.150999
2p%,54.605965,50.2175,4.388465
ft%,78.458611,76.213643,2.244969
fta,23.322819,21.89503,1.427789
3pa,31.69708,31.63521,0.06187


In [28]:
years = range(2016, 2025)

results = []

for year in years:
    file_path = f'/Users/moneysniper/Documents/NBA_analysis_project/gamelogs/regular_season_{year}.csv'
    df = pd.read_csv(file_path)

    # Define the columns to select
    cols_to_select = [
        'home', 'season', 'won', 'diff_pts', 'orb', 'orb_opp', 'diff_orb', 'tov', 'tov_opp', 'diff_tov', '3p%', '3p%_opp', 'diff_3p%', '2p%', '2p%_opp', 'diff_2p%', 'ft%', 'ft%_opp', 'diff_ft%', 'fta', 'fta_opp', 'diff_fta', '3pa', '3pa_opp', 'diff_3pa'
    ]

    # Filter the DataFrame for home games and drop rows with null values in the selected columns
    df = df[df["home"] == 1].reset_index(drop=True)[cols_to_select].dropna(subset=cols_to_select)

    # Create a DataFrame for winning team stats
    winning_stats_1 = df[df['won'] == 1][[
        'orb', 'tov', '3p%', '2p%', 'ft%', 'fta', '3pa'
    ]].rename(columns={
        'orb': 'winning_orb', 'tov': 'winning_tov', '3p%': 'winning_3p%', '2p%': 'winning_2p%', 'ft%': 'winning_ft%', 'fta': 'winning_fta', '3pa': 'winning_3pa'
    })

    # Create a DataFrame for losing team stats
    losing_stats_1 = df[df['won'] == 0][[
        'orb_opp', 'tov_opp', '3p%_opp', '2p%_opp', 'ft%_opp', 'fta_opp', '3pa_opp'
    ]].rename(columns={
        'orb_opp': 'winning_orb', 'tov_opp': 'winning_tov', '3p%_opp': 'winning_3p%', '2p%_opp': 'winning_2p%', 'ft%_opp': 'winning_ft%', 'fta_opp': 'winning_fta', '3pa_opp': 'winning_3pa'
    })

    # Combine winning and losing stats
    winning_stats = pd.concat([winning_stats_1, losing_stats_1])

    # Create a DataFrame for losing team stats
    losing_stats_2 = df[df['won'] == 1][[
        'orb_opp', 'tov_opp', '3p%_opp', '2p%_opp', 'ft%_opp', 'fta_opp', '3pa_opp'
    ]].rename(columns={
        'orb_opp': 'losing_orb', 'tov_opp': 'losing_tov', '3p%_opp': 'losing_3p%', '2p%_opp': 'losing_2p%', 'ft%_opp': 'losing_ft%', 'fta_opp': 'losing_fta', '3pa_opp': 'losing_3pa'
    })

    # Create a DataFrame for winning team stats
    winning_stats_2 = df[df['won'] == 0][[
        'orb', 'tov', '3p%', '2p%', 'ft%', 'fta', '3pa'
    ]].rename(columns={
        'orb': 'losing_orb', 'tov': 'losing_tov', '3p%': 'losing_3p%', '2p%': 'losing_2p%', 'ft%': 'losing_ft%', 'fta': 'losing_fta', '3pa': 'losing_3pa'
    })

    # Combine losing and winning stats
    losing_stats = pd.concat([losing_stats_2, winning_stats_2])

    # Rename columns by removing 'winning_' and 'losing_' prefixes
    winning_stats.columns = [col.replace('winning_', '') for col in winning_stats.columns]
    losing_stats.columns = [col.replace('losing_', '') for col in losing_stats.columns]

    # Calculate the average statistics for winning and losing teams
    winning_avg = winning_stats.mean()
    losing_avg = losing_stats.mean()

    # Calculate the difference between winning and losing team statistics
    difference_avg = winning_avg - losing_avg
    
    # Append the results for the current year to the results list
    results_year = pd.DataFrame({
        'Year': year,
        'Winning Team Average': winning_avg,
        'Losing Team Average': losing_avg,
        'Difference': difference_avg
    })
    
    results = pd.concat(results_year)
    
results


Unnamed: 0,Year,Winning Team Average,Losing Team Average,Difference
0,2016,orb 10.270732 tov 13.300813 3p% 38.34...,orb 10.561789 tov 14.289431 3p% 32.04...,orb -0.291057 tov -0.988618 3p% 6.30187...
1,2017,orb 9.978862 tov 12.935772 3p% 39.06...,orb 10.295122 tov 13.809756 3p% 32.24...,orb -0.316260 tov -0.873984 3p% 6.82756...
2,2018,orb 9.671545 tov 13.268293 3p% 38.89...,orb 9.752033 tov 14.082927 3p% 33.30...,orb -0.080488 tov -0.814634 3p% 5.58422...
3,2019,orb 10.380797 tov 13.115541 3p% 38.53...,orb 10.315704 tov 13.946298 3p% 32.49...,orb 0.065094 tov -0.830757 3p% 6.04100...
4,2020,orb 10.093484 tov 13.663834 3p% 38.64...,orb 10.057602 tov 14.164306 3p% 32.87...,orb 0.035883 tov -0.500472 3p% 5.77648...
5,2021,orb 9.796296 tov 12.982407 3p% 39.85...,orb 9.862963 tov 13.469444 3p% 33.22...,orb -0.066667 tov -0.487037 3p% 6.63768...
6,2022,orb 10.232950 tov 12.590038 3p% 38.44...,orb 10.337165 tov 13.501916 3p% 32.16...,orb -0.104215 tov -0.911877 3p% 6.27540...
7,2023,orb 10.408742 tov 12.928681 3p% 38.80...,orb 10.473160 tov 13.818252 3p% 32.91...,orb -0.064417 tov -0.889571 3p% 5.88964...
8,2024,orb 10.382239 tov 12.380695 3p% 39.35...,orb 10.688803 tov 13.277992 3p% 33.45...,orb -0.306564 tov -0.897297 3p% 5.89930...
