In [2]:
import pandas as pd
from pybaseball import batting_stats, pitching_stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

def fetch_war_data(year):
  data = batting_stats(year, qual=100)
  return data [['Name', 'Age', 'Season', 'WAR', 'OBP', 'SLG', 'HR', 'RBI']]

def prepare_dataset(start_year, end_year):
  frames = []
  for year in range(start_year, end_year + 1):
    data = fetch_war_data(year)
    frames.append(data)
  return pd.concat(frames, ignore_index=True)

def create_features(data):
  data = data.sort_values(by=['Name', 'Season'])
  data['WAR_last_year'] = data.groupby('Name')['WAR'].shift(1)
  data['WAR_2_years_ago'] = data.groupby('Name')['WAR'].shift(2)
  data['WAR_3_years_ago'] = data.groupby('Name')['WAR'].shift(3)
  data = data.fillna(0)
  return data

def train_model(data):
    features = ['Age', 'WAR_last_year', 'WAR_2_years_ago', 'WAR_3_years_ago', 'OBP', 'SLG', 'HR', 'RBI']
    target = 'WAR'
    X = data[features]
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Model Evaluation:")
    print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
    print("R^2 Score:", r2_score(y_test, y_pred))

    return model

def predict_next_year(model, current_year_data):
  return model.predict(current_year_data)

if __name__ == "__main__":
  start_year = 2015
  end_year = 2023
  data = prepare_dataset(start_year, end_year)

  data = create_features(data)
  model = train_model(data)

  current_year = 2024
  current_data = fetch_war_data(current_year)
  current_data = create_features(current_data)

  predictions = predict_next_year(model, current_data[['Age', 'WAR_last_year', 'WAR_2_years_ago', 'WAR_3_years_ago', 'OBP', 'SLG', 'HR', 'RBI']])
  current_data['Predicted_WAR'] = predictions

  print(current_data[['Name', 'Predicted_WAR']].sort_values(by='Predicted_WAR', ascending = False).head(10))

Model Evaluation:
RMSE: 0.9097256826109951
R^2 Score: 0.6966919862020291




                     Name  Predicted_WAR
0             Aaron Judge          8.697
2               Juan Soto          7.978
1           Shohei Ohtani          7.008
7   Vladimir Guerrero Jr.          6.126
5          Bobby Witt Jr.          5.908
16       Gunnar Henderson          5.273
9            Brent Rooker          4.991
12            Ketel Marte          4.788
6          Yordan Alvarez          4.773
37      William Contreras          4.654
