# XGBoost Notebook

This Notebook trains an XGBoost model, predicts the player's points for the upcoming gameweeks, and saves these predictions.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from tabulate import tabulate
import requests
import pickle

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import pulp

### Load data

In [7]:
# Load the saved JSON data from the file
with open("data/overall_data.json", "r") as f:
    data = json.load(f)
players = data['elements']

# Create player_overall_stats
player_overall_stats = {}
for player in players:
    player_id = str(player['id'])
    player_overall_stats[player_id] = player

In [None]:
# Load training data
X = np.load('data/xgboost/X_train.npy')
y = np.load('data/xgboost/y_train.npy')
played = np.load('data/xgboost/played_train.npy') # Optional: Used to remove data where a player did not play

print("Training data (X):", X.shape)
print("Target values (y):", y.shape)
print("Played values (y):", played.shape)

In [None]:
# Load test data
with open('data/xgboost/X_test.pkl', 'rb') as f:
    X_test_games = pickle.load(f)

with open('data/xgboost/xP_test.pkl', 'rb') as f:
    eP_games = pickle.load(f)

with open('data/xgboost/names_test.pkl', 'rb') as f:
    player_names_games = pickle.load(f)
    
with open('data/xgboost/id_dict.pkl', 'rb') as f:
    id_dict = pickle.load(f)
    
with open('data/xgboost/name_to_id.pkl', 'rb') as f:
    name_to_id = pickle.load(f)

### Train model

In [None]:
# Split the data into training and testing sets
y_clipped = np.clip(y, 0, 12)
X_train, X_val, y_train, y_val, played_train, played_val = train_test_split(X,y_clipped,played,test_size=0.2,random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Put more weight on high-scoring players
weights = np.where(y_train > 2, 1.5, 1.0)

# Train model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50)
model.fit(X_train, y_train, sample_weight=weights)

# Predict and evaluate
predictions = model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
print(f'Mean Squared Error: {mse:.2f}')

### Show results - Training set

In [None]:
# Predict and evaluate
predictions = model.predict(X_train)
mse = mean_squared_error(y_train, predictions)
print(f'Mean Squared Error: {mse:.2f}')

# Sort values for a better line plot
sorted_indices = np.argsort(y_train)
sorted_y_train = y_train[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_y_train, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Expected vs True Values for Training Set')
plt.legend()
plt.grid(True)
plt.show()

### Show results - Validation set

In [None]:
# Predict and evaluate
predictions = model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
print(f'Mean Squared Error: {mse:.2f}')

# Sort values for a better line plot
sorted_indices = np.argsort(y_val)
sorted_y_val = y_val[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_y_val, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Expected vs True Values for Validation Set')
plt.legend()
plt.grid(True)
plt.show()

### Predictions - next gameweek

In [None]:
# Get test data
X_test = X_test_games[0]
X_test = scaler.transform(X_test)
y_test = eP_games[0]
player_ids_test = player_names_games[0]

# Make predictions on the test data
predictions = model.predict(X_test)
mse = mean_squared_error(eP_games[0], predictions)
print(f'Mean Squared Error: {mse:.2f}')

# Sort values for a better line plot
sorted_indices = np.argsort(eP_games[0])
sorted_ep = eP_games[0][sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_ep, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Predictions vs FPL xP')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Sum predicted scores for next 5 gameweeks
predictions_5 = []
for i in range(5):
    X_test = X_test_games[i]
    X_test = scaler.transform(X_test)
    predictions_5.append(model.predict(X_test))
predictions_sum = np.sum(np.array(predictions_5), axis=0)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions)[-10:][::-1]

# Prepare data for top 10 predictions table
top_10_data = []
for num, i in enumerate(top_10_indices):
    name = player_overall_stats[name_to_id[player_ids_test[i]]]['web_name']
    play_chance = player_overall_stats[name_to_id[player_ids_test[i]]]['chance_of_playing_next_round']
    top_10_data.append([num + 1, name, f'{predictions[i]:.2f}', f'{y_test[i]:.2f}', 
                        f'{predictions_sum[i]:.2f}', f'{play_chance}'])

# Print the top 10 predictions as a table
print("Top 10 Highest Predicted Players:")
print(tabulate(top_10_data, headers=["Rank", "Player", "Predicted Score", "FPL xP", 
                                     "Sum next 5 games", "Chance of playing"], tablefmt="pretty"))

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = name_to_id[player_ids_test[i]]
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    play_chance = player_overall_stats[name_to_id[player_ids_test[i]]]['chance_of_playing_next_round']
    
    # Store the player data as a tuple and append to corresponding position
    player_data = (predictions[i], y_test[i], pp['web_name'], predictions_sum[i], play_chance)
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    position_data = []
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name, sum5, play_chance = player
        position_data.append([num, web_name, f'{predicted_score:.2f}', f'{actual_score:.2f}', f'{sum5:.2f}',
                             f'{play_chance}'])
    
    # Print position-specific table
    print(tabulate(position_data, headers=["Rank", "Player", "Predicted Score", "FPL xP",
                                           "Sum next 5 games", "Chance of playing"], tablefmt="pretty"))

In [None]:
# Highest predictions (top 10)
top_10_indices = np.argsort(predictions_sum.flatten())[-10:][::-1]

# Prepare data for top 10 predictions table
top_10_data = []
for num, i in enumerate(top_10_indices):
    name = player_overall_stats[name_to_id[player_ids_test[i]]]['web_name']
    play_chance = player_overall_stats[name_to_id[player_ids_test[i]]]['chance_of_playing_next_round']
    top_10_data.append([num + 1, name, f'{predictions[i]:.2f}', f'{y_test[i]:.2f}', 
                        f'{predictions_sum[i]:.2f}', f'{play_chance}'])

# Print the top 10 predictions as a table
print("Top 10 Highest Predicted Players, sorted by sum of next 5 games:")
print(tabulate(top_10_data, headers=["Rank", "Player", "Predicted Score", "FPL xP", 
                                     "Sum next 5 games", "Chance of playing"], tablefmt="pretty"))

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = name_to_id[player_ids_test[i]]
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    play_chance = player_overall_stats[name_to_id[player_ids_test[i]]]['chance_of_playing_next_round']
    
    # Store the player data as a tuple and append to corresponding position
    player_data = (predictions[i], y_test[i], pp['web_name'], predictions_sum[i], play_chance)
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[3], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    position_data = []
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name, sum5, play_chance = player
        position_data.append([num, web_name, f'{predicted_score:.2f}', f'{actual_score:.2f}', f'{sum5:.2f}',
                             f'{play_chance}'])
    
    # Print position-specific table
    print(tabulate(position_data, headers=["Rank", "Player", "Predicted Score", "FPL xP",
                                           "Sum next 5 games", "Chance of playing"], tablefmt="pretty"))

### Load my team data

In [None]:
user_id = 10581845
url = f'https://fantasy.premierleague.com/api/entry/{user_id}/event/{11}/picks/'

response = requests.get(url)
data = response.json()

In [None]:
# Prepare a mapping of positions
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
grouped_data = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Populate grouped data based on position
for player in data['picks']:
    player_name = id_dict[str(player["element"])]
    name = player_overall_stats[str(player['element'])]['web_name']
    prediction = predictions[player_ids_test.index(player_name)]
    sum5 = predictions_sum[player_ids_test.index(player_name)]
    fpl_xp = y_test[player_ids_test.index(player_name)]
    player_position = position_mapping[player_overall_stats[str(player['element'])]['element_type']]
    grouped_data[player_position].append([name, f"{prediction:.2f}", f'{fpl_xp:.2f}', f"{sum5:.2f}"])

# Combine to one table
combined_data = []
for position, players in grouped_data.items():
    combined_data.append([f"--- {position} ---", "", "", ""])
    for player in players:
        combined_data.append(player)

# Print the combined table
print('Current squad')
print(tabulate(combined_data, headers=['Player Name', 'Predicted Score', "FPL xP", 'Sum next 5 games'], tablefmt="pretty"))

### Optimization problem

**Constraints**

Squad constraints:
- 2 GK, 5 DEF, 5 MID, 3 FWD in squad (15 players total)
- Max 3 players from each team in squad

Start 11 constraints:
- Exactly 1 GK
- At least 3 DEF
- At least 2 MID
- At least 1 FWD
- Exactly 11 players total

Transfer constraints:
- Must not exceed total bank balance
- Cost of extra transfers

**Optimization**

- Optimize points scored by start 11
- Include points for following gameweeks
- Prioritise not to make transfers if little/no points gain (to save transfers for next rounds)

In [None]:
squad = []
for player in data['picks']:
    squad.append(str(player['element']))

In [None]:
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
players = []

for key, value in player_overall_stats.items():
    
    # Skip if no prediction for player
    if not id_dict[key] in player_ids_test:
        continue
        
    # Add player stats
    p = {"id": key,
         "name": value['web_name'],
         "position": position_mapping[value['element_type']],
         "team": value['team'], 
         "cost": value['now_cost'],
         "play_chance": value['chance_of_playing_next_round']
        }
    
    # Add predicted points
    predicted_index = player_ids_test.index(id_dict[key])
    p["pred_points"] = predictions[predicted_index]
    
    # Add in squad or not
    if key in squad:
        p["in_current_squad"] = 1
    else:
        p["in_current_squad"] = 0
        
    players.append(p)

In [None]:
# Parameters
bank_balance = data['entry_history']['bank']  # Money left in bank
free_transfers = 2  # Number of free transfers available - find out where to use API for this

# Create problem
prob = pulp.LpProblem("FPL_Team_Optimization", pulp.LpMaximize)

# Decision variables
x = {p['id']: pulp.LpVariable(f"x_{p['id']}", cat="Binary") for p in players}  # Squad selection
y = {p['id']: pulp.LpVariable(f"y_{p['id']}", cat="Binary") for p in players}  # Starting 11 selection
extra_transfers = pulp.LpVariable("extra_transfers_var", lowBound=0, cat="Continuous")

# Objective function: Maximize expected points of the starting 11 minus cost of extra transfers
prob += (pulp.lpSum(y[p['id']] * p['pred_points'] for p in players) - 4*extra_transfers)

# Constraints
# Squad size and composition
prob += pulp.lpSum(x[p['id']] for p in players) == 15
prob += pulp.lpSum(x[p['id']] for p in players if p['position'] == "GK") == 2
prob += pulp.lpSum(x[p['id']] for p in players if p['position'] == "DEF") == 5
prob += pulp.lpSum(x[p['id']] for p in players if p['position'] == "MID") == 5
prob += pulp.lpSum(x[p['id']] for p in players if p['position'] == "FWD") == 3

# Max 3 players per team
teams = set(p['team'] for p in players)
for team in teams:
    prob += pulp.lpSum(x[p['id']] for p in players if p['team'] == team) <= 3

# Starting 11 composition
prob += pulp.lpSum(y[p['id']] for p in players) == 11
prob += pulp.lpSum(y[p['id']] for p in players if p['position'] == "GK") == 1
prob += pulp.lpSum(y[p['id']] for p in players if p['position'] == "DEF") >= 3
prob += pulp.lpSum(y[p['id']] for p in players if p['position'] == "MID") >= 2
prob += pulp.lpSum(y[p['id']] for p in players if p['position'] == "FWD") >= 1

# Starting 11 must be in squad
for p in players:
    prob += y[p['id']] <= x[p['id']]

# Transfer cost and budget
total_cost = pulp.lpSum(p['cost'] * (x[p['id']] - p['in_current_squad']) for p in players)
prob += total_cost <= bank_balance

# Add
total_transfers = pulp.lpSum(x[p['id']] for p in players if p['in_current_squad'] == 0)
prob += extra_transfers >= total_transfers - free_transfers
prob += extra_transfers >= 0

# Solve
prob.solve()

# Extract results
selected_squad = [p for p in players if pulp.value(x[p['id']]) > 0.5]
starting_11 = [p for p in players if pulp.value(y[p['id']]) > 0.5]
transfers_in = [p for p in players if pulp.value(x[p['id']]) > 0.5 and p['in_current_squad'] == 0]
transfers_out = [p for p in players if pulp.value(x[p['id']]) < 0.5 and p['in_current_squad'] == 1]

In [None]:
print(f'Transfers in: {list(t["name"] for t in transfers_in)}')
print(f'Transfers out: {list(t["name"] for t in transfers_out)}')

In [None]:
# Prepare a mapping of positions
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
grouped_data = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Populate grouped data based on position
for player in selected_squad:
    player_name = player['name']
    prediction = player['pred_points']
    sum5 = predictions_sum[player_ids_test.index(id_dict[player['id']])]
    fpl_xp = y_test[player_ids_test.index(id_dict[player['id']])]
    player_position = player['position']
    play_chance = player['play_chance']
    grouped_data[player_position].append([player_name, f"{prediction:.2f}", f'{fpl_xp:.2f}', f"{sum5:.2f}", f"{play_chance}"])

# Combine to one table
combined_data = []
for position, players in grouped_data.items():
    combined_data.append([f"--- {position} ---", "", "", ""])
    for player in players:
        combined_data.append(player)

# Print the combined table
print('New squad')
print(tabulate(combined_data, headers=['Player Name', 'Predicted Score', "FPL xP", 'Sum next 5 games',
                                       'Chance of playing'], tablefmt="pretty"))

In [None]:
# Prepare a mapping of positions
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
grouped_data = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Populate grouped data based on position
for player in starting_11:
    player_name = player['name']
    prediction = player['pred_points']
    sum5 = predictions_sum[player_ids_test.index(id_dict[player['id']])]
    fpl_xp = y_test[player_ids_test.index(id_dict[player['id']])]
    player_position = player['position']
    play_chance = player['play_chance']
    grouped_data[player_position].append([player_name, f"{prediction:.2f}", f'{fpl_xp:.2f}', f"{sum5:.2f}", f"{play_chance}"])

# Combine to one table
combined_data = []
for position, players in grouped_data.items():
    combined_data.append([f"--- {position} ---", "", "", ""])
    for player in players:
        combined_data.append(player)

# Print the combined table
print('Starting 11')
print(tabulate(combined_data, headers=['Player Name', 'Predicted Score', "FPL xP", 'Sum next 5 games',
                                       'Chance of playing'], tablefmt="pretty"))

### Test potential points scored if started from beginning

In [44]:
# Load data
df_all = pd.read_csv('data/all_seasons_merged.csv')
df_all = df_all.sort_values(by='kickoff_time').reset_index(drop=True)

# Split into current and previous seasons
df_prev = df_all[df_all.kickoff_date <= '2024-07-01'].drop_duplicates()
df_current = df_all[df_all.kickoff_date >= '2024-07-01'].drop_duplicates()

In [45]:
def one_hot_encode_team(team_id, num_teams):
    # Create a zero array of length num_teams
    one_hot = np.zeros(num_teams)
    
    # Set the correct index to 1 (team_id - 1 because team_id starts from 1)
    one_hot[team_id - 1] = 1

    return one_hot

# Select only numeric columns
numeric_df = df_all.select_dtypes(include='number')

# List of columns to drop
columns_to_drop = [
    'element', 'fixture', 'round', 'GW', 'id',
    'season', 'roster_id', 'player_id', 'team_id', 'opp_team_id',
    'pos_id', 'time',
    'h_goals', 'a_goals', 'team_a_score', 'team_h_score',
    ]
numeric_df = numeric_df.drop(columns=columns_to_drop)

# Choose which features to use for data
keys_to_select = list(numeric_df.columns)

In [54]:
# Load training data
X = np.load('data/xgboost/X_train_old_seasons.npy')
y = np.load('data/xgboost/y_train_old_seasons.npy')
print("Training data (X):", X.shape)
print("Target values (y):", y.shape)

# Split and scale data
y_clipped = np.clip(y, 0, 12)
X_train, X_val, y_train, y_val = train_test_split(X, y_clipped, test_size=0.2)#, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Put more weight on high-scoring players
weights = np.where(y_train > 2, 1, 1.0)

# Train model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=5)
model.fit(X_train, y_train, sample_weight=weights)

# Predict and evaluate
predictions = model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
print(f'Mean Squared Error: {mse:.2f}')

squad = []
bank_balance = 1000
free_transfers = 15
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
total_points = 0


for i in range(1,12):
    print(f'--- Gameweek {i} ---')
    # Get current gameweek
    df_gw = df_current[df_current.GW == i]
    
    # Initialize X,y,played,players
    X_gw = []
    y_gw = []
    played_gw = []
    player_ids = []
    player_names = {}
    player_positions = {}
    player_teams = {}
    player_minutes = {}
    
    # Iterate through all players in that gameweek
    for row in df_gw.iloc:
        df_temp = df_prev[df_prev.name == row['name']][-3:]
        
        # If the player has less than 3 previous appearances, skip
        if len(df_temp) < 3:
            continue
#         if row['minutes'] == 0:
#             continue
            
        player_ids.append(str(row['element']))
        player_names[str(row['element'])] = row['name']
        player_positions[str(row['element'])] = position_mapping[row['pos_id']]
        player_teams[str(row['element'])] = row['team_id']
        player_minutes[str(row['element'])] = row['minutes']
            
        # If player played no minutes in all games
        if row['minutes']+df_temp.iloc[0]['minutes']+df_temp.iloc[1]['minutes']+df_temp.iloc[2]['minutes']==0:
            played_gw.append(0)
        else:
            played_gw.append(1)
            
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(row['team_id'], 27)
        opponent_team = one_hot_encode_team(row['opp_team_id'], 27)
        home_away = one_hot_encode_team(row['was_home'], 2)
        position = one_hot_encode_team(row['pos_id'], 4)
        
        # Combine into one vector
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get points scored (y value)
        points = row['total_points']
        
        # Get player stats from previous games
        for j in range(3):
            row_temp = df_temp.iloc[2-j]

            # Get home/away encoding
            home_away = one_hot_encode_team(row_temp['was_home'], 2)

            # Add player team goals and opponent team goals
            if row_temp.was_home:
                score = np.array(row_temp[['team_h_score', 'team_a_score']].astype(float).values)
            else:
                score = np.array(row_temp[['team_a_score','team_h_score']].astype(float).values)

            # Select the wanted stats and convert to float numpy array
            selected_stats = np.array(row_temp[keys_to_select].astype(float).values) 

            # Combine into one vector
            combined_stats = np.concatenate([combined_stats, home_away, score, selected_stats])

        # Append combined stats to X and points (y value) to y
        X_gw.append(combined_stats)
        y_gw.append(points)
        
    # Convert lists to numpy arrays for training/testing
    X_gw = np.array(X_gw)
    y_gw = np.array(y_gw)
    played_gw = np.array(played_gw)
    print(X_gw.shape)
    
    
    # Make predictions
    X_test = scaler.transform(X_gw)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_gw, predictions)
    print(f'Mean Squared Error: {mse:.2f}')

    # Populate player info
    players = []
    for key in player_ids:
        p = {"id": key,
             "name": player_names[key],
             "position": player_positions[key],
             "team": player_teams[key],
             "cost": df_gw[df_gw.element == int(key)].iloc[0]['value'],
             "play_chance": player_minutes[key],
             "pred_points": predictions[player_ids.index(key)],
             "actual_points": df_gw[df_gw.element == int(key)].iloc[0]['total_points'],
            }
        
        # Add in squad or not
        if key in squad:
            p["in_current_squad"] = 1
        else:
            p["in_current_squad"] = 0
            
        # Update prediction if the player played 0 minutes
        if p['play_chance'] == 0:
            p['pred_points'] = 0
        players.append(p)
    
    # Create problem
    prob = pulp.LpProblem("FPL_Team_Optimization", pulp.LpMaximize)

    # Decision variables
    a = {p['id']: pulp.LpVariable(f"a_{p['id']}", cat="Binary") for p in players}  # Squad selection
    b = {p['id']: pulp.LpVariable(f"b_{p['id']}", cat="Binary") for p in players}  # Starting 11 selection
    extra_transfers = pulp.LpVariable("extra_transfers_var", lowBound=0, cat="Continuous")

    # Objective function: Maximize expected points of the starting 11 minus cost of extra transfers
    prob += (pulp.lpSum(b[p['id']] * p['pred_points'] for p in players) - 4 * extra_transfers)

    # Constraints
    # Squad size and composition
    prob += pulp.lpSum(a[p['id']] for p in players) == 15
    prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "GK") == 2
    prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "DEF") == 5
    prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "MID") == 5
    prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "FWD") == 3

    # Max 3 players per team
    teams = set(p['team'] for p in players)
    for team in teams:
        prob += pulp.lpSum(a[p['id']] for p in players if p['team'] == team) <= 3

    # Starting 11 composition
    prob += pulp.lpSum(b[p['id']] for p in players) == 11
    prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "GK") == 1
    prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "DEF") >= 3
    prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "MID") >= 2
    prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "FWD") >= 1

    # Starting 11 must be in squad
    for p in players:
        prob += b[p['id']] <= a[p['id']]

    # Transfer cost and budget
    total_cost = pulp.lpSum(p['cost'] * (a[p['id']] - p['in_current_squad']) for p in players)
    prob += total_cost <= bank_balance

    # Add
    total_transfers = pulp.lpSum(a[p['id']] for p in players if p['in_current_squad'] == 0)
    prob += extra_transfers >= total_transfers - free_transfers
    prob += extra_transfers >= 0

    # Solve
    prob.solve()

    
    if prob.status != pulp.LpStatusOptimal:
        raise ValueError("Optimization did not find a feasible solution.")

    # Extract results
    selected_squad = [p for p in players if pulp.value(a[p['id']]) > 0.5]
    starting_11 = [p for p in players if pulp.value(b[p['id']]) > 0.5]
    transfers_in = [p for p in players if pulp.value(a[p['id']]) > 0.5 and p['in_current_squad'] == 0]
    transfers_out = [p for p in players if pulp.value(a[p['id']]) < 0.5 and p['in_current_squad'] == 1]

    
    # Print results
    print(f'Transfers in: {list(t["name"] for t in transfers_in)}')
    print(f'Transfers out: {list(t["name"] for t in transfers_out)}')
    
    # Prepare a mapping of positions
    position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
    grouped_data = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

    # Populate grouped data based on position
    for player in selected_squad:
        player_name = player['name']
        prediction = player['pred_points']
        actual_points = player['actual_points']
        player_position = player['position']
        play_chance = player['play_chance']
        grouped_data[player_position].append([player_name, f"{prediction:.2f}", f'{actual_points}', f"{play_chance}"])

    # Combine to one table
    combined_data = []
    for position, players_pos in grouped_data.items():
        combined_data.append([f"--- {position} ---", "", "", ""])
        for player in players_pos:
            combined_data.append(player)

    # Print the combined table
    print('New squad')
    print(tabulate(combined_data,headers=['Player Name', 'Predicted Score', 'Actual points', 'Chance of playing'],
                   tablefmt="pretty"))
    
    # Prepare a mapping of positions
    position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
    grouped_data = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

    # Populate grouped data based on position
    for player in starting_11:
        player_name = player['name']
        prediction = player['pred_points']
        actual_points = player['actual_points']
        player_position = player['position']
        play_chance = player['play_chance']
        grouped_data[player_position].append([player_name, f"{prediction:.2f}", f'{actual_points}', f"{play_chance}"])

    # Combine to one table
    combined_data = []
    for position, players_pos in grouped_data.items():
        combined_data.append([f"--- {position} ---", "", "", ""])
        for player in players_pos:
            combined_data.append(player)

    # Print the combined table
    print('Starting 11')
    print(tabulate(combined_data,headers=['Player Name', 'Predicted Score', 'Actual points', 'Chance of playing'],
                   tablefmt="pretty"))
    
    
    
    # Update parameters
    for player in transfers_in:
        bank_balance -= player['cost']
        free_transfers -= 1
        squad.append(player['id'])
    for player in transfers_out:
        bank_balance += player['cost']
        squad.remove(player['id'])
        
    free_transfers = min(max(free_transfers, 0) + 1, 5)
    
    print(f'New bank balance = {bank_balance}')
    print(f'New free transfers = {free_transfers}')
    print(f'Squad size = {len(squad)}')
    
    highest_prediction = 0
    for player in starting_11:
        if player['pred_points'] > highest_prediction:
            highest_prediction = player['pred_points']
    
    # Calculate points
    gw_points = 0
    for player in starting_11:
        player_points = df_gw[df_gw.element == int(player['id'])].iloc[0]['total_points']
        if player['pred_points'] == highest_prediction:
            print(f'Captain is {player["name"]}')
            player_points *= 2
        gw_points += player_points
    
    print(f'Total points in gameweek: {gw_points}')
    total_points += gw_points
    
    
    # Train new model with added data
    X = np.concatenate((X, X_gw), axis=0)
    y = np.concatenate((y, y_gw), axis=0)
    print("Training data (X):", X.shape)
    print("Target values (y):", y.shape)
    
    # Split the data into training and testing sets
    y_clipped = np.clip(y, 0, 12)
    X_train, X_val, y_train, y_val = train_test_split(X, y_clipped, test_size=0.2)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Put more weight on high-scoring players
    weights = np.where(y_train > 2, 1, 1.0)

    # Train model
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=5)
    model.fit(X_train, y_train, sample_weight=weights)

    # Predict and evaluate
    predictions = model.predict(X_val)
    mse = mean_squared_error(y_val, predictions)
    print(f'Mean Squared Error: {mse:.2f}')
    
        
    print('\n\n')

Training data (X): (76000, 195)
Target values (y): (76000,)
Mean Squared Error: 5.35
--- Gameweek 1 ---
(344, 195)
Mean Squared Error: 6.34
Transfers in: ['Harry Wilson', 'Andrew Robertson', 'Bukayo Saka', 'Mason Holgate', 'Dan Burn', 'Jordan Pickford', 'Alexander Isak', 'Jacob Murphy', 'Michael Keane', 'William Saliba', 'Daniel Jebbison', 'Erling Haaland', 'Phil Foden', 'Kepa Arrizabalaga', 'James Maddison']
Transfers out: []
New squad
+-------------------+-----------------+---------------+-------------------+
|    Player Name    | Predicted Score | Actual points | Chance of playing |
+-------------------+-----------------+---------------+-------------------+
|    --- GK ---     |                 |               |                   |
|  Jordan Pickford  |      4.12       |       1       |        90         |
| Kepa Arrizabalaga |      0.00       |       0       |         0         |
|    --- DEF ---    |                 |               |                   |
| Andrew Robertson  |      

Mean Squared Error: 5.28



--- Gameweek 4 ---
(351, 195)
Mean Squared Error: 8.10
Transfers in: ['Cristian Romero']
Transfers out: ['Dan Burn']
New squad
+------------------------------------+-----------------+---------------+-------------------+
|            Player Name             | Predicted Score | Actual points | Chance of playing |
+------------------------------------+-----------------+---------------+-------------------+
|             --- GK ---             |                 |               |                   |
|          Jordan Pickford           |      3.94       |       2       |        90         |
|         Kepa Arrizabalaga          |      0.00       |       0       |         0         |
|            --- DEF ---             |                 |               |                   |
|          Andrew Robertson          |      4.62       |       1       |        74         |
|           Joško Gvardiol           |      4.01       |       1       |        45         |
|       

Mean Squared Error: 5.28



--- Gameweek 6 ---
(351, 195)
Mean Squared Error: 9.19
Transfers in: ['Kai Havertz']
Transfers out: ['Alexander Isak']
New squad
+------------------------------------+-----------------+---------------+-------------------+
|            Player Name             | Predicted Score | Actual points | Chance of playing |
+------------------------------------+-----------------+---------------+-------------------+
|             --- GK ---             |                 |               |                   |
|          Jordan Pickford           |      4.12       |       2       |        90         |
|         Kepa Arrizabalaga          |      3.45       |       2       |        90         |
|            --- DEF ---             |                 |               |                   |
|           Joško Gvardiol           |      4.24       |       9       |        90         |
|           Mason Holgate            |      0.00       |       0       |         0         |
|     

Mean Squared Error: 5.39



--- Gameweek 8 ---
(351, 195)
Mean Squared Error: 7.83
Transfers in: ['Son Heung-Min']
Transfers out: ['Bukayo Saka']
New squad
+------------------------------------+-----------------+---------------+-------------------+
|            Player Name             | Predicted Score | Actual points | Chance of playing |
+------------------------------------+-----------------+---------------+-------------------+
|             --- GK ---             |                 |               |                   |
|         Kepa Arrizabalaga          |      3.34       |       6       |        90         |
|             Matz Sels              |      3.75       |      11       |        90         |
|            --- DEF ---             |                 |               |                   |
|          Cristian Romero           |      4.14       |       2       |        90         |
|           Mason Holgate            |      0.00       |       0       |         0         |
|      

Mean Squared Error: 5.32



--- Gameweek 10 ---
(351, 195)
Mean Squared Error: 6.79
Transfers in: ['Bruno Guimarães Rodriguez Moura', 'Morgan Gibbs-White', 'Virgil van Dijk']
Transfers out: ['Jacob Murphy', 'Andrew Robertson', 'James Maddison']
New squad
+------------------------------------+-----------------+---------------+-------------------+
|            Player Name             | Predicted Score | Actual points | Chance of playing |
+------------------------------------+-----------------+---------------+-------------------+
|             --- GK ---             |                 |               |                   |
|         Kepa Arrizabalaga          |      0.00       |       0       |         0         |
|             Matz Sels              |      3.76       |       6       |        90         |
|            --- DEF ---             |                 |               |                   |
|           William Saliba           |      4.61       |       2       |        90         |
|

Mean Squared Error: 5.36





In [55]:
total_points

524

In [67]:
weight_3 = []

for _ in range(10):
    # Load training data
    X = np.load('data/xgboost/X_train_old_seasons.npy')
    y = np.load('data/xgboost/y_train_old_seasons.npy')

    # Split and scale data
    y_clipped = np.clip(y, 0, 12)
    X_train, X_val, y_train, y_val = train_test_split(X, y_clipped, test_size=0.2)#, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Put more weight on high-scoring players
    weights = np.where(y_train > 2, 3, 1.0)

    # Train model
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=5)
    model.fit(X_train, y_train, sample_weight=weights)

    # Predict and evaluate
    predictions = model.predict(X_val)
    mse = mean_squared_error(y_val, predictions)

    squad = []
    bank_balance = 1000
    free_transfers = 15
    position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
    total_points = 0


    for i in range(1,12):
        # Get current gameweek
        df_gw = df_current[df_current.GW == i]

        # Initialize X,y,played,players
        X_gw = []
        y_gw = []
        played_gw = []
        player_ids = []
        player_names = {}
        player_positions = {}
        player_teams = {}
        player_minutes = {}

        # Iterate through all players in that gameweek
        for row in df_gw.iloc:
            df_temp = df_prev[df_prev.name == row['name']][-3:]

            # If the player has less than 3 previous appearances, skip
            if len(df_temp) < 3:
                continue

            player_ids.append(str(row['element']))
            player_names[str(row['element'])] = row['name']
            player_positions[str(row['element'])] = position_mapping[row['pos_id']]
            player_teams[str(row['element'])] = row['team_id']
            player_minutes[str(row['element'])] = row['minutes']

            # If player played no minutes in all games
            if row['minutes']+df_temp.iloc[0]['minutes']+df_temp.iloc[1]['minutes']+df_temp.iloc[2]['minutes']==0:
                played_gw.append(0)
            else:
                played_gw.append(1)

            # Get one-hot encoding for players team, opp team, home/away and position
            player_team = one_hot_encode_team(row['team_id'], 27)
            opponent_team = one_hot_encode_team(row['opp_team_id'], 27)
            home_away = one_hot_encode_team(row['was_home'], 2)
            position = one_hot_encode_team(row['pos_id'], 4)

            # Combine into one vector
            combined_stats = np.concatenate([player_team, opponent_team, home_away, position])

            # Get points scored (y value)
            points = row['total_points']

            # Get player stats from previous games
            for j in range(3):
                row_temp = df_temp.iloc[2-j]

                # Get home/away encoding
                home_away = one_hot_encode_team(row_temp['was_home'], 2)

                # Add player team goals and opponent team goals
                if row_temp.was_home:
                    score = np.array(row_temp[['team_h_score', 'team_a_score']].astype(float).values)
                else:
                    score = np.array(row_temp[['team_a_score','team_h_score']].astype(float).values)

                # Select the wanted stats and convert to float numpy array
                selected_stats = np.array(row_temp[keys_to_select].astype(float).values) 

                # Combine into one vector
                combined_stats = np.concatenate([combined_stats, home_away, score, selected_stats])

            # Append combined stats to X and points (y value) to y
            X_gw.append(combined_stats)
            y_gw.append(points)

        # Convert lists to numpy arrays for training/testing
        X_gw = np.array(X_gw)
        y_gw = np.array(y_gw)
        played_gw = np.array(played_gw)

        # Make predictions
        X_test = scaler.transform(X_gw)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_gw, predictions)

        # Populate player info
        players = []
        for key in player_ids:
            p = {"id": key,
                 "name": player_names[key],
                 "position": player_positions[key],
                 "team": player_teams[key],
                 "cost": df_gw[df_gw.element == int(key)].iloc[0]['value'],
                 "play_chance": player_minutes[key],
                 "pred_points": predictions[player_ids.index(key)],
                 "actual_points": df_gw[df_gw.element == int(key)].iloc[0]['total_points'],
                }

            # Add in squad or not
            if key in squad:
                p["in_current_squad"] = 1
            else:
                p["in_current_squad"] = 0

            # Update prediction if the player played 0 minutes
            if p['play_chance'] == 0:
                p['pred_points'] = 0
            players.append(p)

        # Create problem
        prob = pulp.LpProblem("FPL_Team_Optimization", pulp.LpMaximize)

        # Decision variables
        a = {p['id']: pulp.LpVariable(f"a_{p['id']}", cat="Binary") for p in players}  # Squad selection
        b = {p['id']: pulp.LpVariable(f"b_{p['id']}", cat="Binary") for p in players}  # Starting 11 selection
        extra_transfers = pulp.LpVariable("extra_transfers_var", lowBound=0, cat="Continuous")

        # Objective function: Maximize expected points of the starting 11 minus cost of extra transfers
        prob += (pulp.lpSum(b[p['id']] * p['pred_points'] for p in players) - 4 * extra_transfers)

        # Constraints
        # Squad size and composition
        prob += pulp.lpSum(a[p['id']] for p in players) == 15
        prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "GK") == 2
        prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "DEF") == 5
        prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "MID") == 5
        prob += pulp.lpSum(a[p['id']] for p in players if p['position'] == "FWD") == 3

        # Max 3 players per team
        teams = set(p['team'] for p in players)
        for team in teams:
            prob += pulp.lpSum(a[p['id']] for p in players if p['team'] == team) <= 3

        # Starting 11 composition
        prob += pulp.lpSum(b[p['id']] for p in players) == 11
        prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "GK") == 1
        prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "DEF") >= 3
        prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "MID") >= 2
        prob += pulp.lpSum(b[p['id']] for p in players if p['position'] == "FWD") >= 1

        # Starting 11 must be in squad
        for p in players:
            prob += b[p['id']] <= a[p['id']]

        # Transfer cost and budget
        total_cost = pulp.lpSum(p['cost'] * (a[p['id']] - p['in_current_squad']) for p in players)
        prob += total_cost <= bank_balance

        # Add
        total_transfers = pulp.lpSum(a[p['id']] for p in players if p['in_current_squad'] == 0)
        prob += extra_transfers >= total_transfers - free_transfers
        prob += extra_transfers >= 0

        # Solve
        prob.solve()


        if prob.status != pulp.LpStatusOptimal:
            raise ValueError("Optimization did not find a feasible solution.")

        # Extract results
        selected_squad = [p for p in players if pulp.value(a[p['id']]) > 0.5]
        starting_11 = [p for p in players if pulp.value(b[p['id']]) > 0.5]
        transfers_in = [p for p in players if pulp.value(a[p['id']]) > 0.5 and p['in_current_squad'] == 0]
        transfers_out = [p for p in players if pulp.value(a[p['id']]) < 0.5 and p['in_current_squad'] == 1]


        # Update parameters
        for player in transfers_in:
            bank_balance -= player['cost']
            free_transfers -= 1
            squad.append(player['id'])
        for player in transfers_out:
            bank_balance += player['cost']
            squad.remove(player['id'])

        free_transfers = min(max(free_transfers, 0) + 1, 5)

        highest_prediction = 0
        for player in starting_11:
            if player['pred_points'] > highest_prediction:
                highest_prediction = player['pred_points']

        # Calculate points
        gw_points = 0
        for player in starting_11:
            player_points = df_gw[df_gw.element == int(player['id'])].iloc[0]['total_points']
            if player['pred_points'] == highest_prediction:
                player_points *= 2
            gw_points += player_points

        total_points += gw_points


        # Train new model with added data
        X = np.concatenate((X, X_gw), axis=0)
        y = np.concatenate((y, y_gw), axis=0)

        # Split the data into training and testing sets
        y_clipped = np.clip(y, 0, 12)
        X_train, X_val, y_train, y_val = train_test_split(X, y_clipped, test_size=0.2)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Put more weight on high-scoring players
        weights = np.where(y_train > 2, 3, 1.0)

        # Train model
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=5)
        model.fit(X_train, y_train, sample_weight=weights)

        # Predict and evaluate
        predictions = model.predict(X_val)
        mse = mean_squared_error(y_val, predictions)
    
    print(total_points)
    weight_3.append(total_points)

617
529
682
664
611
606
522
575
627
677


In [64]:
weight_1, np.mean(weight_1), np.std(weight_1)

([604, 556, 565, 515, 558, 614, 523, 575, 685, 672], 586.7, 54.36552216248824)

In [65]:
weight_15, np.mean(weight_15), np.std(weight_15)

([531, 588, 601, 562, 588, 634, 563, 589, 583, 621], 586.0, 28.124722220850465)

In [66]:
weight_2, np.mean(weight_2), np.std(weight_2)

([633, 541, 572, 558, 569, 575, 576, 602, 587, 626], 583.9, 27.4534150881088)

In [68]:
weight_3, np.mean(weight_3), np.std(weight_3)

([617, 529, 682, 664, 611, 606, 522, 575, 627, 677], 611.0, 53.3516635167077)