In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

nba_df = pd.read_csv('player_totals.csv', encoding = 'unicode_escape', engine ='python')

# Convert the "season" column to numeric if it's not already
nba_df['Season'] = pd.to_numeric(nba_df['Season'], errors='coerce')

# Drop rows where the "season" is lower than 2004
nba_df = nba_df[nba_df['Season'] >= 2004]

missing_data = nba_df.isnull().sum()

print("Columns with missing data:")
print(missing_data[missing_data > 0])

columns_to_fill = ['FG%', '3P%', '2P%', 'eFG%', 'FT%' ]

nba_df[columns_to_fill] = nba_df[columns_to_fill].fillna(0)

missing_data_updated = nba_df[columns_to_fill].isnull().sum()

print('Columns with missing data after filling:')
print(missing_data_updated)
print('-----------------------------------------------------')

X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%", "TRB", "AST", "PF"]]

# Target variables
y_pts = nba_df["PTS"]
y_ast = nba_df["AST"]
y_trb = nba_df["TRB"]


# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test, y_ast_train, y_ast_test, y_trb_train, y_trb_test = train_test_split(
    X, y_pts, y_ast, y_trb, test_size=0.4, random_state=42
)

# Create and train linear regression models
model_pts = LinearRegression().fit(X_train, y_pts_train)
model_ast = LinearRegression().fit(X_train, y_ast_train)
model_trb = LinearRegression().fit(X_train, y_trb_train)

# Predictions
y_pts_pred = model_pts.predict(X_test)
y_ast_pred = model_ast.predict(X_test)
y_trb_pred = model_trb.predict(X_test)



# Predictions
y_pts_pred = model_pts.predict(X_test)
y_ast_pred = model_ast.predict(X_test)
y_trb_pred = model_trb.predict(X_test)

# Evaluation for Points (PTS)
mse_pts = mean_squared_error(y_pts_test, y_pts_pred)
mae_pts = mean_absolute_error(y_pts_test, y_pts_pred)
r2_pts = r2_score(y_pts_test, y_pts_pred)

print("Metrics for Points (PTS):")
print(f"MSE: {mse_pts}")
print(f"MAE: {mae_pts}")
print(f"R-squared: {r2_pts}")
print()

# Evaluation for Assists (AST)
mse_ast = mean_squared_error(y_ast_test, y_ast_pred)
mae_ast = mean_absolute_error(y_ast_test, y_ast_pred)
r2_ast = r2_score(y_ast_test, y_ast_pred)

print("Metrics for Assists (AST):")
print(f"MSE: {mse_ast}")
print(f"MAE: {mae_ast}")
print(f"R-squared: {r2_ast}")
print()

# Evaluation for Total Rebounds (TRB)
mse_trb = mean_squared_error(y_trb_test, y_trb_pred)
mae_trb = mean_absolute_error(y_trb_test, y_trb_pred)
r2_trb = r2_score(y_trb_test, y_trb_pred)

print("Metrics for Total Rebounds (TRB):")
print(f"MSE: {mse_trb}")
print(f"MAE: {mae_trb}")
print(f"R-squared: {r2_trb}")




Columns with missing data:
FG%      100
3P%     1714
2P%      190
eFG%     100
FT%      789
dtype: int64
Columns with missing data after filling:
FG%     0
3P%     0
2P%     0
eFG%    0
FT%     0
dtype: int64
-----------------------------------------------------
Metrics for Points (PTS):
MSE: 22098.258178841847
MAE: 94.46443038342278
R-squared: 0.888748911768836

Metrics for Assists (AST):
MSE: 1.399215222820522e-26
MAE: 9.681701623802313e-14
R-squared: 1.0

Metrics for Total Rebounds (TRB):
MSE: 2.2977588423136005e-26
MAE: 1.1079743299278299e-13
R-squared: 1.0
