In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset
file_path = r"C:\Users\Hamza\Downloads\all_seasons.csv" 
nba_data = pd.read_csv(file_path)

# Step 1: Exploratory Data Analysis (EDA)
print("Dataset Info:")
print(nba_data.info())
print("\nSummary Statistics:")
print(nba_data.describe())

# Visualize key statistics
plt.figure(figsize=(12, 6))
sns.histplot(nba_data['points'], kde=True, bins=30, color='blue')
plt.title("Distribution of Player Points")
plt.xlabel("Points")
plt.ylabel("Frequency")
plt.show()

# Check correlations between features
plt.figure(figsize=(10, 8))
sns.heatmap(nba_data.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

# Step 2: Feature Engineering
# Generate new features if applicable (e.g., performance trends)
nba_data['points_per_game'] = nba_data['points'] / nba_data['games']
nba_data['assists_per_game'] = nba_data['assists'] / nba_data['games']
nba_data['rebounds_per_game'] = nba_data['rebounds'] / nba_data['games']

# Drop rows with missing or infinite values caused by division
nba_data.replace([np.inf, -np.inf], np.nan, inplace=True)
nba_data.dropna(inplace=True)

# Step 3: Feature Selection
# Choose relevant features for predicting player performance
selected_features = ['points_per_game', 'assists_per_game', 'rebounds_per_game', 'minutes_played']
X = nba_data[selected_features]
y = nba_data['game_score']  # Replace with the column for player performance

# Step 4: Data Standardization
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 5: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train Models
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_reg_preds = lin_reg.predict(X_test)

# Ridge Regression
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)
ridge_reg_preds = ridge_reg.predict(X_test)

# Step 7: Evaluate Models
def evaluate_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{name} Performance:")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"R-Squared: {r2:.2f}")
    print("-" * 30)

evaluate_model("Linear Regression", y_test, lin_reg_preds)
evaluate_model("Ridge Regression", y_test, ridge_reg_preds)

plt.figure(figsize=(10, 6))
plt.scatter(y_test, lin_reg_preds, alpha=0.7, color='green')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.title("Linear Regression: Actual vs. Predicted")
plt.xlabel("Actual Game Score")
plt.ylabel("Predicted Game Score")
plt.show()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12844 entries, 0 to 12843
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         12844 non-null  int64  
 1   player_name        12844 non-null  object 
 2   team_abbreviation  12844 non-null  object 
 3   age                12844 non-null  float64
 4   player_height      12844 non-null  float64
 5   player_weight      12844 non-null  float64
 6   college            10990 non-null  object 
 7   country            12844 non-null  object 
 8   draft_year         12844 non-null  object 
 9   draft_round        12844 non-null  object 
 10  draft_number       12844 non-null  object 
 11  gp                 12844 non-null  int64  
 12  pts                12844 non-null  float64
 13  reb                12844 non-null  float64
 14  ast                12844 non-null  float64
 15  net_rating         12844 non-null  float64
 16  oreb_pct

KeyError: 'points'

<Figure size 1200x600 with 0 Axes>