# Modeling Player Data to Predict Rating
## Regression (OLS, Ridge, Lasso) 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Seed NumPy's global RNG so any NumPy-based randomness is repeatable.
np.random.seed(42)

# Use seaborn's white-grid look for every figure drawn below.
sns.set(style='whitegrid')

In [None]:
# Read the cleaned player-attributes table (path is relative to the notebook).
cleaned_csv_path = '../data/cleaned/player_attributes_cleaned.csv'
player_cleaned_df = pd.read_csv(cleaned_csv_path)
# The default integer index is kept; the multi-column index below is disabled.
#player_cleaned_df.set_index(['player_api_id', 'player_name', 'birthday', 'height', 'weight', 'date'], inplace=True)
player_cleaned_df

In [None]:
# Age of the player, in years, on the date each rating was recorded.
rating_dates = pd.to_datetime(player_cleaned_df['date'])
birth_dates = pd.to_datetime(player_cleaned_df['birthday'])
player_cleaned_df['age'] = (rating_dates - birth_dates).dt.days / 365.25

# Place 'age' directly to the right of 'date'. The insertion slot is computed
# before 'age' is taken out of the list, then the column order is applied.
column_order = player_cleaned_df.columns.tolist()
age_slot = column_order.index('date') + 1
column_order.remove('age')
column_order.insert(age_slot, 'age')
player_cleaned_df = player_cleaned_df[column_order]

# Name and birthday are not needed once the age has been derived.
player_cleaned_df = player_cleaned_df.drop(columns=['player_name', 'birthday'])

player_cleaned_df



In [None]:
# Target: the rating to be predicted.
y = player_cleaned_df['overall_rating']

# Features: every int64/float64 column except the target itself and the
# player identifier.
numeric_columns_df = player_cleaned_df.select_dtypes(include=['int64', 'float64'])
X = numeric_columns_df.drop(columns=['overall_rating', 'player_api_id'])
X

In [None]:

# Ordinary least squares, fitted and scored on the same data (X, y).
ols = LinearRegression()
ols.fit(X, y)
y_ols = ols.predict(X)

ols_r2 = r2_score(y, y_ols)
print("OLS Coefs:", ols.coef_)
print("OLS R^2:", ols_r2)

def mse_loss(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Both arguments may be any array-like (list, tuple, NumPy array, pandas
    Series). They are converted to float NumPy arrays before subtracting so
    that:

    * plain Python sequences are supported,
    * two pandas objects are compared element by element in order, rather
      than being aligned on their index labels,
    * unsigned-integer inputs cannot wrap around on subtraction.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, in the same order as ``y_true``.

    Returns:
        The mean of the squared element-wise differences.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    return np.mean((true_values - predicted_values) ** 2)

# Mean squared error of the OLS predictions against y (the same data the
# model was fitted on), computed with the mse_loss helper.
total_loss = mse_loss(y, y_ols)
print("Total MSE Loss:", total_loss)


In [None]:
# Bare expression: displays the feature matrix X as the cell output.
X

In [None]:
# True vs. predicted ratings for the OLS model.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x=y, y=y_ols, ax=ax)

# Dashed red reference line where prediction equals truth.
y_low, y_high = y.min(), y.max()
ax.plot([y_low, y_high], [y_low, y_high], 'r--')  # ideal diagonal

ax.set_xlabel("True Y")
ax.set_ylabel("Predicted Y")
ax.set_title("True vs. Predicted Values (OLS Regression)")
fig.tight_layout()
plt.show()

In [None]:
# Fit Ridge (L2-penalized regression) on the same X, y and score it in-sample.
# NOTE(review): the features are passed in unscaled here although
# StandardScaler is imported — confirm that penalizing raw-scale coefficients
# is intended.
ridge = Ridge(alpha=1.0, solver='sparse_cg')
ridge.fit(X, y)
y_ridge = ridge.predict(X)

ridge_coefs = ridge.coef_
print("Ridge Coefs:", ridge_coefs)
print("Ridge R^2:", r2_score(y, y_ridge))
# Ratio of the first fitted coefficient to the second (printed unlabeled).
print(ridge_coefs[0] / ridge_coefs[1])

In [None]:
# True vs. predicted ratings for the Ridge model.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x=y, y=y_ridge, ax=ax)

# Dashed red reference line where prediction equals truth.
y_low, y_high = y.min(), y.max()
ax.plot([y_low, y_high], [y_low, y_high], 'r--')  # ideal diagonal

ax.set_xlabel("True Y")
ax.set_ylabel("Predicted Y")
ax.set_title("True vs. Predicted Values (Ridge Regression)")
fig.tight_layout()
plt.show()

In [None]:
# Fit Lasso (L1-penalized regression) on the same X, y and score it in-sample.
lasso = Lasso(alpha=0.1).fit(X, y)
y_pred = lasso.predict(X)

lasso_r2 = r2_score(y, y_pred)
print("Lasso R^2:", lasso_r2)
# Count how many coefficients the L1 penalty left different from zero.
print("Nonzero Coefficients:", np.sum(lasso.coef_ != 0))

In [None]:
# True vs. predicted ratings for the Lasso model (y_pred holds its predictions).
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x=y, y=y_pred, ax=ax)

# Dashed red reference line where prediction equals truth.
y_low, y_high = y.min(), y.max()
ax.plot([y_low, y_high], [y_low, y_high], 'r--')  # ideal diagonal

ax.set_xlabel("True Y")
ax.set_ylabel("Predicted Y")
ax.set_title("True vs. Predicted Values (Lasso Regression)")
fig.tight_layout()
plt.show()