# In [None]:  (Jupyter notebook cell marker)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load data
df = pd.read_csv('Seasons_Stats.csv')

# Print some information about the dataset
print("Original dataset shape:", df.shape)
print("\nYear column info:")
print(df['Year'].describe())
print("\nUnique years in the dataset:")
# NaN does not order against numbers, so missing years are left out of the
# sorted listing instead of scrambling it.
print(sorted(df['Year'].dropna().unique()))

# Clean up the data a bit.
# Columns that hold no values at all are discarded first: a single fully
# empty column would otherwise make dropna() remove every row.
df = df.dropna(axis=1, how='all').dropna()
print("\nDataset shape after dropping NA values:", df.shape)

# Convert 'Year' to datetime.
# If the raw column contains blanks, read_csv stores it as float, and 1980.0
# would be rendered as "1980.0", which does not match '%Y'. Going through int
# yields "1980". This is safe here because rows with a missing year were
# already dropped above.
df['Year'] = pd.to_datetime(df['Year'].astype(int).astype(str), format='%Y')

# Print year range
print("\nYear range in the dataset:")
print(f"From {df['Year'].min().year} to {df['Year'].max().year}")

# Filter the data. The copy makes df_filtered an independent frame, so the
# PER column that may be added below is not written onto a slice of df.
df_filtered = df[df['Year'].dt.year >= 1980].copy()

print("\nDataset shape after filtering (year >= 1980):", df_filtered.shape)

# If df_filtered is empty, use the original df (as a copy, so df itself is
# not modified by the steps below).
if df_filtered.empty:
    print("Warning: Filtered dataset is empty. Using the entire dataset.")
    df_filtered = df.copy()

# Calculate PER if it's not there.
# This is a simplified per-minute efficiency figure built from box-score
# columns: positive contributions minus fouls, missed shots and turnovers,
# divided by minutes played.
if 'PER' not in df_filtered.columns:
    missed_field_goals = df_filtered['FGA'] - df_filtered['FG']
    missed_free_throws = df_filtered['FTA'] - df_filtered['FT']
    df_filtered['PER'] = (df_filtered['PTS'] + df_filtered['TRB'] + df_filtered['AST'] +
                          df_filtered['STL'] + df_filtered['BLK'] - df_filtered['PF'] -
                          missed_field_goals - missed_free_throws -
                          df_filtered['TOV']) / df_filtered['MP']

print("\nFirst few rows of the processed dataset:")
print(df_filtered.head())

# Check for infinite or NaN values in PER (e.g. from dividing by MP == 0).
# The mask is built once and reused for both the check and the filter.
per_is_invalid = (df_filtered['PER'].isna() |
                  df_filtered['PER'].isin([float('inf'), float('-inf')]))
if per_is_invalid.any():
    print("Warning: PER contains infinite or NaN values. Removing these rows.")
    df_filtered = df_filtered[~per_is_invalid].copy()

# Correlation heatmap: pairwise correlations between PER and the box-score
# columns listed below, annotated with the coefficient in each cell.
heatmap_columns = ['PER', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'MP']
correlation_matrix = df_filtered[heatmap_columns].corr()
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# PER vs Points scatter: one marker per row of df_filtered.
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(data=df_filtered, x='PTS', y='PER')
scatter_ax.set_title('PER vs Points per Game')
scatter_ax.set_xlabel('Points per Game')
scatter_ax.set_ylabel('PER')
plt.show()

# Box plot for PER by position: one box per distinct value of 'Pos'.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df_filtered, x='Pos', y='PER')
box_ax.set_title('PER by Position')
box_ax.set_xlabel('Position')
box_ax.set_ylabel('PER')
plt.show()

# Predict PER: fit a linear regression of PER on six box-score columns and
# evaluate it on a held-out 20% split.
X = df_filtered[['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV']]
y = df_filtered['PER']

# Bail out early when there is nothing to fit; otherwise train and evaluate.
if X.empty or y.empty:
    print("Error: Not enough data for modeling after filtering and cleaning.")
else:
    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LinearRegression().fit(X_train, y_train)

    # Score the model on the rows it has not seen.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse}")
    print(f"R2 Score: {r2}")

    # Actual vs predicted; the dashed red diagonal marks perfect predictions.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal, 'r--', lw=2)
    plt.xlabel('Actual PER')
    plt.ylabel('Predicted PER')
    plt.title('Actual vs Predicted PER')
    plt.show()

    # Feature importance: the fitted coefficients, listed from the most
    # positive to the most negative (the sort is on the signed value).
    feature_importance = (
        pd.DataFrame({'feature': X.columns, 'importance': model.coef_})
        .sort_values('importance', ascending=False)
    )
    print("Feature Importance:")
    print(feature_importance)

# PER trend over years: mean PER of all rows sharing the same 'Year' value,
# drawn as a line on a fresh figure.
yearly_mean_per = df_filtered.groupby('Year')['PER'].mean()
plt.figure(figsize=(12, 6))
trend_ax = yearly_mean_per.plot()
trend_ax.set_title('Average PER Over Years')
trend_ax.set_xlabel('Year')
trend_ax.set_ylabel('Average PER')
plt.show()

# Top 10 players by PER: order rows by PER, highest first, and keep the
# first ten; only the identifying columns are printed.
ranked_by_per = df_filtered.sort_values(by='PER', ascending=False)
top_players = ranked_by_per.head(10)
summary_columns = ['Player', 'Year', 'Tm', 'PER']
print("\nTop 10 Players by PER:")
print(top_players[summary_columns])

# TODO: Consider trying other models or focusing on specific time periods/player subsets for better understanding