<a href="https://colab.research.google.com/github/Hotchapu13/Fantasy-Premier-League/blob/main/xG_model_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Model Predicting Player's Expected Goals

This is the first basic machine learning model I am working on to predict expected goals of players in the premier league

In [9]:
import pandas as pd

# DataFrames.
Here I acquired the datasets from the 2021-22 to 2023-24 season and merged them to acquire one singular dataframe to use as historical data

In [10]:
raw_urls = [
    "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2021-22/gws/merged_gw.csv",
    "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2022-23/gws/merged_gw.csv",
    "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2023-24/gws/merged_gw.csv"
]

dfs = []

# Loading and concatenating the three datasets
for url in raw_urls:
  df = pd.read_csv(url)
  season = url.split("/")[9]
  df["Season"] = season
  dfs.append(df)

historical_df = pd.concat(dfs, ignore_index=True)

historical_df.describe()

Unnamed: 0,xP,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,...,transfers_in,transfers_out,value,yellow_cards,GW,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,starts
count,81677.0,81677.0,81677.0,81677.0,81677.0,81677.0,81677.0,81677.0,81677.0,81677.0,...,81677.0,81677.0,81677.0,81677.0,81677.0,56230.0,56230.0,56230.0,56230.0,56230.0
mean,1.214997,0.035836,0.089487,5.058682,0.081137,4.084973,362.857671,199.088887,0.458036,0.040048,...,15031.96,13356.78,49.373594,0.052144,20.905408,0.021186,0.054125,0.034324,0.375216,0.24414
std,2.097071,0.202527,0.448659,9.129115,0.273047,10.121414,213.80739,108.870468,0.952129,0.218741,...,63314.26,55089.25,11.324427,0.22232,10.988853,0.076491,0.17274,0.136918,0.743216,0.42958
min,-3.0,0.0,0.0,-21.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,36.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,180.0,106.0,0.0,0.0,...,22.0,79.0,44.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0
50%,0.2,0.0,0.0,0.0,0.0,0.0,359.0,204.0,0.0,0.0,...,253.0,672.0,45.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0
75%,1.9,0.0,0.0,7.0,0.0,1.4,538.0,294.0,1.0,0.0,...,3573.0,5916.0,52.0,0.0,31.0,0.0,0.000487,0.0,0.40995,0.0
max,23.6,4.0,3.0,128.0,1.0,136.2,866.0,380.0,9.0,4.0,...,2104464.0,2747279.0,145.0,1.0,38.0,1.47,3.88,2.77,9.84,1.0


In [11]:
# Load current season dataset
current_raw_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2024-25/gws/merged_gw.csv"

current_df = pd.read_csv(current_raw_url, on_bad_lines='skip')

current_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14178 entries, 0 to 14177
Data columns (total 42 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        14178 non-null  object 
 1   position                    14178 non-null  object 
 2   team                        14178 non-null  object 
 3   xP                          14178 non-null  float64
 4   assists                     14178 non-null  int64  
 5   bonus                       14178 non-null  int64  
 6   bps                         14178 non-null  int64  
 7   clean_sheets                14178 non-null  int64  
 8   creativity                  14178 non-null  float64
 9   element                     14178 non-null  int64  
 10  expected_assists            14178 non-null  float64
 11  expected_goal_involvements  14178 non-null  float64
 12  expected_goals              14178 non-null  float64
 13  expected_goals_conceded     141

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# Select features and target
features = ['assists','creativity','expected_goals','expected_goal_involvements','ict_index','goals_scored','minutes','starts', 'threat']
target = "expected_goals"

# Clean for missing values
historical = historical_df[features + [target]].dropna()
current = current_df[features + [target]].dropna()

# Splitting for training and testing
X_historical = historical[features]
y_historical = historical[target]

X_train, X_val, y_train, y_val = train_test_split(X_historical, y_historical, test_size=0.25, random_state=42)

# flattened y_train since the model fitting expects a numpy array and not a dataframe
y_train = y_train.values.ravel()

# Normalize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Base model training
base_model = XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
base_model.fit(X_train, y_train)

# Evaluate on training data
y_val_pred = base_model.predict(X_val)
print(f"Base Model MAE: {mean_absolute_error(y_val, y_val_pred):.4f}")

Base Model MAE: 0.0003


In [13]:
# Fine tuning with current season data,
X_current = scaler.transform(current[features])
y_current = current[target]

# fine tuning with current season data
base_model.fit(X_current, y_current.values, xgb_model=base_model.get_booster())  # Continue training

# 📌 Step 6: Predict xG for the Upcoming Gameweek
# new_gameweek_data = np.array([[1, 50.2, 0.75, 1.1, 80.5, 1, 85, 1, 70.0]])  # Example player stats
new_gameweek_df = pd.DataFrame([{
    "assists": 1,
    "creativity": 50.2,
    "expected_goals": 0.75,
    "expected_goal_involvements": 1.1,
    "ict_index": 80.5,
    "goals_scored": 1,
    "minutes": 85,
    "starts": 1,
    "threat": 70.0
}])

# new_gameweek_scaled = scaler.transform(new_gameweek_df)
# predicted_xG = base_model.predict(new_gameweek_scaled)[0]

# print(predicted_xG)
# Ensure the new data is scaled and remains a DataFrame with the same columns
new_gameweek_scaled = pd.DataFrame(scaler.transform(new_gameweek_df), columns=features)
predicted_xG = base_model.predict(new_gameweek_scaled)[0]
print(predicted_xG)

# xP = predicted_xG * 3
# print("Expected points from goals: " + predicted_xG)
# print(xP)

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
