In [1]:
# Figuring out which Machine Learning Model is best 
# Random Forest, XGBoost, Neural networks
# importing processes to get the data from the fpl api
import requests
# converting json data to tables 
import pandas as pd
# Converting API data to tables
import numpy as np
# importing the machine learning models 
from sklearn.model_selection import train_test_split
# checking the predictions 
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

# Title
print("="*60)
print("FPL ML MODEL - CHECKING WHICH MODEL IS BEST")
print("="*60)

# connect to the fpl api
print("\nGetting data from FPL API")
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
# passing the url into the response & making the fpl data into json
response = requests.get(url)
fpl_data = response.json()

# put player data into table
players_df = pd.DataFrame(fpl_data['elements'])
print(f"Total players = {len(players_df)} ")

# fpl features that are useful for application
features = [
    'minutes', 'goals_scored', 'assists', 'clean_sheets',
    'influence', 'creativity', 'threat', 'ict_index',
    'points_per_game', 'form'
]

# x = player stats
# y = total points (prediction)
# missing values are entered as 0
X = players_df[features].fillna(0)
# making all columns numeric
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
y = players_df['total_points']

# train & test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"using the most important stats from the fpl api {len(features)}")


# training & compare the 3 ml models 
print("\nTraining and comparing Random Forest, XGBoost, Neural Network")

# MODEL 1: Random Forest
print("Training Random Forest")
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
# training the model
rf_model.fit(X_train, y_train)
# making predictions 
rf_pred = rf_model.predict(X_test)
# calculating performance metrics 
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_corr = np.corrcoef(y_test, rf_pred)[0,1]

# MODEL 2: XGBoost
print("Training XGBoost")
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1)
# training the model
xgb_model.fit(X_train, y_train)
# making predictions 
xgb_pred = xgb_model.predict(X_test)
# calculating performance metrics 
xgb_r2 = r2_score(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_corr = np.corrcoef(y_test, xgb_pred)[0,1]

# MODEL 3: Neural Network
print("Training Neural Network")
nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
# training the model
nn_model.fit(X_train, y_train)
# making predictions 
nn_pred = nn_model.predict(X_test)
# calculating performance metrics 
nn_r2 = r2_score(y_test, nn_pred)
nn_rmse = np.sqrt(mean_squared_error(y_test, nn_pred))
nn_corr = np.corrcoef(y_test, nn_pred)[0,1]

print("\n" + "="*60)
print("Comparing Models")
print("="*60)

# printing out each model & its correlation
# RMSE root mean squared error in points , lowest is the best
print(f"\n{'Model':<20} {'R² Score':<12} {'RMSE':<12} {'Correlation':<12}")
print("-"*60)
print(f"{'Random Forest':<20} {rf_r2:<12.3f} {rf_rmse:<12.2f} {rf_corr:<12.3f}")
print(f"{'XGBoost':<20} {xgb_r2:<12.3f} {xgb_rmse:<12.2f} {xgb_corr:<12.3f}")
print(f"{'Neural Network':<20} {nn_r2:<12.3f} {nn_rmse:<12.2f} {nn_corr:<12.3f}")

# find which model is the best out of the 3 
models = {
    'Random Forest': (rf_r2, rf_model),
    'XGBoost': (xgb_r2, xgb_model),
    'Neural Network': (nn_r2, nn_model)
}

# getting the model with the highest correlation
best_model_name = max(models, key=lambda x: models[x][0])
best_model = models[best_model_name][1]
#printing that model
print(f"\nBest model: {best_model_name}")

# showing the top 10 predicitons using the best model
print("\n" + "="*60)
print(f"TOP 10 PREDICTIONS (using {best_model_name})")
print("="*60)

# generating the predictions
players_df['predicted_points'] = best_model.predict(X)
# making positions readable
position_map = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
players_df['position'] = players_df['element_type'].map(position_map)
# conerting price from the api
players_df['price'] = players_df['now_cost'] / 10

# showing the top 10 players by predicted points
# shows player name, position, points, predicted points & price of player
top_10 = players_df.nlargest(10, 'predicted_points')[
    ['web_name', 'position', 'total_points', 'predicted_points', 'price']
]

# displaying the table without index numbers 
print(top_10.to_string(index=False))


FPL ML MODEL - CHECKING WHICH MODEL IS BEST

Getting data from FPL API
Total players = 748 
using the most important stats from the fpl api 10

Training and comparing Random Forest, XGBoost, Neural Network
Training Random Forest
Training XGBoost
Training Neural Network

Comparing Models

Model                R² Score     RMSE         Correlation 
------------------------------------------------------------
Random Forest        0.981        2.07         0.991       
XGBoost              0.984        1.89         0.992       
Neural Network       0.962        2.91         0.981       

Best model: XGBoost

TOP 10 PREDICTIONS (using XGBoost)
 web_name position  total_points  predicted_points  price
  Haaland      FWD            98         97.570366   14.8
  Gabriel      DEF            80         79.838646    6.6
  Semenyo      MID            75         74.877831    8.1
 J.Timber      DEF            66         64.194855    6.1
    Guéhi      DEF            64         64.128593    5.0
     