In [171]:
import pandas as pd

# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from datetime import datetime

# Silence every warning for the rest of the session.
# NOTE(review): a blanket filter may also hide deprecation and chained-assignment
# warnings raised by later cells — consider narrowing it.
warnings.filterwarnings("ignore")
# Already covered by the blanket filter above (both actions are "ignore").
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.*")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [172]:
# Read the 2022-23 stats, name the unnamed leading column "ID", and strip
# spaces out of every column label.
nba_df = (
    pd.read_csv('nba_stats_22-23.csv')
    .rename(columns={"Unnamed: 0": "ID"})
    .rename(columns=lambda label: label.replace(" ", ""))
)
nba_df



In [173]:
# Read the 2023-24 stats and keep only the columns used by the combined model.
nba_df2 = pd.read_csv('nba_stats_23-24.csv')
nba_df2 = nba_df2[["Age", "GP", "TRB", "AST", "PTS", "BLK", "TS%", "Salary"]]

# Keep players with at least 20 games and a known, strictly positive salary.
# The final .copy() makes the result an independent frame so later column
# assignments do not write to a slice of the raw data.
nba_df2 = nba_df2[nba_df2['GP'] >= 20]
nba_df2 = nba_df2.dropna(subset=['Salary'])
nba_df2 = nba_df2[nba_df2['Salary'] > 0.0].copy()

# Missing-value summary for nba_df2. The denominator must come from nba_df2
# itself: dividing by the row counts of a different frame aligns on column
# labels and yields NaN / wrong ratios.
total_missing = nba_df2.isnull().sum().sort_values(ascending=False)
percent_missing = (nba_df2.isnull().sum() / nba_df2.isnull().count()).sort_values(ascending=False)

missing_data_df = pd.concat([total_missing, percent_missing], axis=1, keys=["Total Missing", "Percent Missing"])
# A bare expression in the middle of a cell is not rendered, so display it explicitly.
display(missing_data_df.head(8))
nba_df2



In [174]:
# Display the column labels of the 2022-23 frame.
nba_df.columns



In [175]:
# Per-column missing-value summary for nba_df: absolute count of nulls and the
# share of rows they represent, each sorted from most to least missing.
null_mask = nba_df.isnull()
total_missing = null_mask.sum().sort_values(ascending=False)
percent_missing = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

missing_data_df = pd.concat(
    [total_missing, percent_missing],
    axis=1,
    keys=["Total Missing", "Percent Missing"],
)
missing_data_df.head(8)



In [176]:
# Rate / percentage columns whose missing values are replaced with 0.
cols_to_fill_zero = ["FT%", "3P%", "2P%", "TS%", "3PAr", "FTr", "eFG%", "FG%"]

# Fill all listed columns in one vectorised assignment.
nba_df[cols_to_fill_zero] = nba_df[cols_to_fill_zero].fillna(0)

In [177]:
# Scatter of salary against games played, labelled through the Axes object.
fig, ax = plt.subplots()
ax.scatter(x=nba_df["GP"], y=nba_df["Salary"])
ax.set_ylabel("Salary", fontsize=13)
ax.set_xlabel("GP (Games Played)", fontsize=13)
plt.show()



In [178]:
# Keep only players with at least 20 games played. The explicit .copy() makes
# the result an independent frame, so later column assignments on nba_df do
# not write to a slice of the unfiltered data (SettingWithCopyWarning).
nba_df = nba_df[nba_df['GP'] >= 20].copy()


In [179]:
# Pairwise scatterplots of salary and the candidate predictors.
cols = [
    "Salary",
    "Age",
    "MP",
    "3P",
    "TRB",
    "AST",
    "PTS",
    "PER",
    "TS%",
    "DWS",
    "VORP"
]
# `height` is the current name of the per-facet size argument; the old `size`
# keyword is deprecated in seaborn.
sns.pairplot(nba_df[cols], height=2.5)
plt.show();



In [180]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [181]:
# Predictors and target for the first salary model.
model1_features = ["Age", "MP", "3P", "TRB", "AST", "PTS", "PER", "TS%", "DWS", "VORP"]
X = nba_df[model1_features]
y = nba_df["Salary"]
    

In [182]:
# 75% / 25% train/test split with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=512)
X_train, X_test, y_train, y_test = split_parts

In [183]:
# Fit a linear regression on the training split; the trailing expression shows
# the fitted estimator as the cell output.
lm = LinearRegression().fit(X_train, y_train)
lm



In [184]:
# R^2 of the first model on the same data it was fitted to.
# NOTE(review): X_test / y_test are never scored in the visible cells —
# consider reporting the held-out R^2 as well.
lm.score(X_train, y_train)



In [185]:
import statsmodels.api as sm

In [186]:
# statsmodels' OLS does not add an intercept on its own, so append a constant
# column. .assign returns a new frame rather than writing a column into the
# frame handed back by train_test_split (which triggers SettingWithCopyWarning).
X_train = X_train.assign(const=1)
X_train.head()



In [187]:
# Ordinary least squares via statsmodels: target as endog, design matrix
# (predictors plus the constant column) as exog.
model = sm.OLS(endog=y_train, exog=X_train)
model_results = model.fit()

In [188]:
# Display the regression summary tables for the OLS results held in
# `model_results` (statsmodels' `summary2` layout).
model_results.summary2()



In [189]:
def adjust_salary(salary, years, threshold=15_000_000,
                  superstar_rate=0.05, role_player_rate=0.15):
    """
    Project a salary forward by compounding a tier-dependent annual growth rate.

    The tier is decided once, from the starting salary:
    - Superstars (> threshold) grow at `superstar_rate` per year (default 5%).
    - Role players (<= threshold) grow at `role_player_rate` per year (default 15%).

    Parameters:
    - salary (float): Starting salary.
    - years (int or float): Number of years to compound over.
    - threshold (float): Salary above which a player counts as a superstar.
    - superstar_rate (float): Annual growth rate for salaries above the threshold.
    - role_player_rate (float): Annual growth rate for all other salaries.

    Returns:
    - float: Adjusted salary, rounded to 2 decimals.
    """
    # A NaN salary compares False here, takes the role-player rate, and stays NaN.
    if salary > threshold:
        annual_growth_rate = superstar_rate
    else:
        annual_growth_rate = role_player_rate

    adjusted_salary = salary * ((1 + annual_growth_rate) ** years)
    return round(adjusted_salary, 2)

# Growth-adjust each season's salaries before pooling them: the 2022-23 frame
# is projected two years forward and the 2023-24 frame one year forward
# (presumably so both are expressed in the same season — confirm).
YEARS_FORWARD_22_23 = 2
YEARS_FORWARD_23_24 = 1

combined_features = ["Age", "GP", "TRB", "AST", "PTS", "BLK", "TS%"]

# Build adjusted copies instead of overwriting the Salary column of the source
# frames: an in-place overwrite compounds the growth again on every re-run of
# this cell and silently corrupts the regression target.
season_22_23_adj = nba_df[combined_features + ["Salary"]].assign(
    Salary=lambda frame: frame["Salary"].apply(adjust_salary, args=(YEARS_FORWARD_22_23,))
)
season_23_24_adj = nba_df2.assign(
    Salary=lambda frame: frame["Salary"].apply(adjust_salary, args=(YEARS_FORWARD_23_24,))
)

# ignore_index gives the pooled frame a unique 0..n-1 index; without it the
# row labels of the two seasons collide.
combined_data = pd.concat([season_22_23_adj, season_23_24_adj], axis=0, ignore_index=True)

X2 = combined_data[combined_features]

y = combined_data["Salary"]

combined_data



In [190]:
# 75% / 25% split of the pooled data, same seed as the first model.
# Note that this rebinds y_train / y_test to the pooled target.
pooled_split = train_test_split(X2, y, test_size=0.25, random_state=512)
X2_train, X2_test, y_train, y_test = pooled_split

In [191]:
# Fit the second linear regression on the pooled training split; the trailing
# expression shows the fitted estimator as the cell output.
lm2 = LinearRegression().fit(X2_train, y_train)
lm2



In [192]:
# R^2 of the second model on the same data it was fitted to.
# NOTE(review): X2_test / y_test are never scored in the visible cells —
# consider reporting the held-out R^2 as well.
lm2.score(X2_train, y_train)



In [193]:
# Append the intercept column statsmodels' OLS needs. .assign returns a new
# frame rather than writing a column into the frame handed back by
# train_test_split (which triggers SettingWithCopyWarning). The constant is
# appended as the last column.
X2_train = X2_train.assign(const=1)
X2_train.head()



In [194]:
# OLS on the pooled training data (predictors plus constant column), followed
# by its summary tables. This rebinds `model` and `model_results`.
model = sm.OLS(endog=y_train, exog=X2_train)
model_results = model.fit()
model_results.summary2()



In [195]:
# lm2 was fitted on the predictor columns only, so the intercept column added
# for statsmodels must be removed before predicting. Dropping it by name
# (instead of slicing the first 7 positions) keeps this correct if the number
# or order of predictors changes, and is a no-op when no "const" column exists.
y_model_pred = lm2.predict(X2_train.drop(columns=["const"], errors="ignore"))
y_model_pred



In [196]:
# Actual vs. predicted training salaries, plus the residual (actual - predicted).
results = pd.DataFrame({'Actual': y_train, 'Predicted': y_model_pred}).assign(
    residuals=lambda frame: frame['Actual'] - frame['Predicted']
)
results.head()



In [197]:
# Residuals against predicted values. The zero line is the reference the
# points should scatter around; axis labels and a title let the figure stand
# on its own, and plt.show() keeps the artist repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(results['Predicted'], results['residuals'], color='blue')
ax.axhline(0, color='black', linewidth=1)
ax.set_xlabel("Predicted salary")
ax.set_ylabel("Residual (actual - predicted)")
ax.set_title("Residuals vs. predicted values")
plt.show()



In [198]:
# Histogram of the residuals (errors). Labels and a title let the figure stand
# on its own, and plt.show() keeps the raw (counts, bin edges, patches) tuple
# out of the cell output.
fig, ax = plt.subplots()
ax.hist(results['residuals'], bins=20)
ax.set_xlabel("Residual (actual - predicted)")
ax.set_ylabel("Count")
ax.set_title("Distribution of residuals")
plt.show()



In [199]:
# Test the residuals for normality with a Kolmogorov-Smirnov-type test.

from scipy import stats
from scipy.stats import norm, kstest
from statsmodels.stats.diagnostic import lilliefors

# Nothing in this cell draws random numbers; the seed is kept only because it
# resets NumPy's global RNG for any code that runs afterwards.
np.random.seed(500)

# Null hypothesis: the residuals follow a normal distribution
# Alternative hypothesis: the residuals do NOT follow a normal distribution

# Fitted location (mean) and scale (std dev) of the residuals, kept for inspection.
loc, scale = norm.fit(results['residuals'])

# The reference normal's parameters are estimated from the residuals
# themselves. The plain K-S p-value assumes a fully specified distribution and
# is too large in that situation, so use the Lilliefors test: the same K-S
# statistic with a p-value that accounts for the estimated parameters.
stat, p = lilliefors(results['residuals'], dist='norm')

# print p-value: if p-value is <0.05 (5% significance), then reject null hypothesis
print('Residuals: Statistics=%.3f, p=%.9f' % (stat, p))



In [200]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [201]:
# Variance inflation factor for the first seven columns of X2_train. The whole
# design matrix (every column of X2_train) is passed to vif each time.
design_matrix = X2_train.values
for position in range(7):
    inflation = vif(design_matrix, position)
    label = X2_train.columns[position]
    print("VIF for {}:{}".format(label, round(inflation, 4)))



In [202]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the predictors. The intercept column has zero
# variance, so its correlations are NaN and it would show up as an empty row
# and column; drop it first (no-op when no "const" column exists).
predictor_corr = X2_train.drop(columns=["const"], errors="ignore").corr()

plt.figure(figsize=(8,6))
sns.heatmap(predictor_corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.show()






In [203]:
# Using joblib (recommended)
# `load` is imported alongside `dump` but is not used in this cell.
from joblib import dump, load

# Save the model: writes the fitted lm2 estimator to
# 'linear_regression_model.joblib', a path relative to the working directory.
dump(lm2, 'linear_regression_model.joblib')

