In [1]:
import json
import requests
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers, layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fantasy Premier League (FPL) Player Data Breakdown

### 1. Player Availability & Status
- **chance\_of\_playing\_next\_round**: Probability (percentage) that the player will play in the next round (gameweek).
- **chance\_of\_playing\_this\_round**: Probability (percentage) that the player will play in the current round.
- **status**: Player’s current status, where `a` means available, `i` means injured, `d` means doubtful, etc.
- **news**: Relevant news about the player (e.g., injuries).
- **news\_added**: Timestamp indicating when the player's news was last updated.

### 2. Player Identification
- **id**: Unique identifier for the player within the FPL system.
- **code**: Another internal player identifier used by FPL.
- **first\_name**: Player’s first name.
- **second\_name**: Player’s last name.
- **web\_name**: The name displayed on FPL, typically a simplified form, e.g., G. Jesus.
- **team**: The ID of the Premier League team the player belongs to.
- **team\_code**: Alternative code for the player’s team (linked to `team`).
- **element\_type**: The player’s position:
  - 1: Goalkeeper
  - 2: Defender
  - 3: Midfielder
  - 4: Forward

### 3. Player Price & Transfers
- **now\_cost**: Player’s current price in FPL (multiplied by 10). For example, 68 means £6.8m.
- **cost\_change\_start**: Change in player’s price since the start of the season.
- **cost\_change\_start\_fall**: Total decrease in price since the start of the season.
- **cost\_change\_event**: Change in price in the current gameweek.
- **cost\_change\_event\_fall**: Decrease in price in the current gameweek.
- **transfers\_in**: Total number of transfers in for this player during the season.
- **transfers\_in\_event**: Transfers in during the current gameweek.
- **transfers\_out**: Total number of transfers out during the season.
- **transfers\_out\_event**: Transfers out in the current gameweek.

### 4. Performance in Current Season
- **total\_points**: Total points the player has accumulated this season.
- **points\_per\_game**: Average points scored per game played.
- **form**: Player’s recent form over the last 30 days.
- **ep\_next**: Expected points in the next round.
- **ep\_this**: Expected points in the current round.
- **value\_form**: Value metric combining form and price (form per £m).
- **value\_season**: Player’s season value based on points per £m.
- **event\_points**: Points scored in the most recent gameweek.

### 5. Player Game Statistics
- **minutes**: Total minutes played.
- **goals\_scored**: Total number of goals scored.
- **assists**: Total number of assists provided.
- **clean\_sheets**: Total clean sheets (mainly for defenders and goalkeepers).
- **goals\_conceded**: Goals conceded (defenders and goalkeepers).
- **own\_goals**: Own goals scored.
- **penalties\_saved**: Penalties saved (for goalkeepers).
- **penalties\_missed**: Penalties missed.
- **yellow\_cards**: Total yellow cards received.
- **red\_cards**: Total red cards received.
- **saves**: Total number of saves (for goalkeepers).
- **bonus**: Bonus points earned.
- **bps**: Bonus Points System score used for calculating bonus points.

### 6. Influence, Creativity, Threat, and ICT Index
These metrics come from FPL’s proprietary system to quantify player contributions:

- **influence**: Measures player’s impact on the game (e.g., goals, assists, key passes).
- **creativity**: Player’s ability to create goal-scoring opportunities (e.g., key passes).
- **threat**: Player’s likelihood of scoring goals (e.g., shots taken).
- **ict\_index**: Combines Influence, Creativity, and Threat into one metric.

### 7. Gameweek Performance Statistics
- **starts**: Number of games started.
- **expected\_goals**: Expected goals based on chances.
- **expected\_assists**: Expected assists based on chances created.
- **expected\_goal\_involvements**: Expected goals + assists.
- **expected\_goals\_conceded**: Expected goals conceded.

### 8. Per 90 Metrics
These metrics scale player stats to per-90-minute intervals:

- **expected\_goals\_per\_90**: Expected goals per 90 minutes.
- **expected\_assists\_per\_90**: Expected assists per 90 minutes.
- **expected\_goal\_involvements\_per\_90**: Expected goal involvements per 90 minutes.
- **expected\_goals\_conceded\_per\_90**: Expected goals conceded per 90 minutes.
- **goals\_conceded\_per\_90**: Goals conceded per 90 minutes.
- **saves\_per\_90**: Saves per 90 minutes (for goalkeepers).
- **starts\_per\_90**: Starts per 90 minutes.

### 9. Rankings
Rankings compare the player to others in similar positions or across the entire league:

- **influence\_rank**: Overall influence rank among all players.
- **influence\_rank\_type**: Influence rank among players in the same position.
- **creativity\_rank**: Overall creativity rank among all players.
- **creativity\_rank\_type**: Creativity rank among players in the same position.
- **threat\_rank**: Overall threat rank.
- **threat\_rank\_type**: Threat rank among players in the same position.
- **ict\_index\_rank**: Overall ICT Index rank.
- **ict\_index\_rank\_type**: ICT Index rank among players in the same position.
- **now\_cost\_rank**: Rank based on player’s price compared to all players.
- **now\_cost\_rank\_type**: Cost rank among players in the same position.
- **form\_rank**: Form rank compared to all players.
- **form\_rank\_type**: Form rank among players in the same position.
- **points\_per\_game\_rank**: Rank based on points per game.
- **points\_per\_game\_rank\_type**: Points per game rank among players in the same position.
- **selected\_rank**: Rank based on the percentage of managers who own the player.
- **selected\_rank\_type**: Rank among other players in the same position.

### 10. Set Piece Information
These fields relate to whether the player is involved in set-pieces (corners, free kicks, penalties):

- **corners\_and\_indirect\_freekicks\_order**: The player's order in taking corners and indirect free kicks.
- **corners\_and\_indirect\_freekicks\_text**: Notes on the player's role in taking set pieces.
- **direct\_freekicks\_order**: The player's order in taking direct free kicks.
- **direct\_freekicks\_text**: Notes on the player's role in taking direct free kicks.
- **penalties\_order**: The player's order in taking penalties.
- **penalties\_text**: Notes on the player's role in taking penalties.


# Load data previous seasons

### 2023-24

In [2]:
df_teams_23 = pd.read_csv("data/2023-24/teams.csv")
team_dict_23 = {}
for row in df_teams_23.iloc:
    team_dict_23[row['id']] = row['name']

In [3]:
df_2324 = pd.read_csv("data/2023-24/gws/merged_gw.csv")
df_2324['kickoff_date'] = pd.to_datetime(df_2324['kickoff_time']).dt.date
df_2324['name'] = df_2324['name'].map({'Đorđe Petrović': 'Djordje Petrovic'}).fillna(df_2324['name'])
df_2324['opponent_team'] = df_2324['opponent_team'].map(team_dict_23).fillna(df_2324['opponent_team'])
print(len(df_2324))
df_2324.head()

29725


Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,kickoff_date
0,Femi Seriki,DEF,Sheffield Utd,0.5,0,0,0,0,0.0,653,...,0.0,0,0,0,0,40,True,0,1,2023-08-12
1,Jack Hinshelwood,MID,Brighton,1.5,0,0,0,0,0.0,621,...,0.0,0,0,0,0,45,True,0,1,2023-08-12
2,Jadon Sancho,MID,Man Utd,3.0,0,0,4,0,11.3,397,...,8.0,1,0,0,0,70,True,0,1,2023-08-14
3,Rhys Norrington-Davies,DEF,Sheffield Utd,0.1,0,0,0,0,0.0,487,...,0.0,0,0,0,0,40,True,0,1,2023-08-12
4,Vitaly Janelt,MID,Brentford,2.1,0,0,6,0,11.5,105,...,17.0,2,0,0,0,55,True,0,1,2023-08-13


In [4]:
player_mapping = {
    'Aleksandar Mitrovic': 'Aleksandar Mitrović',
    'Álex Moreno': 'Alexandre Moreno Lopera',
    'Alisson': 'Alisson Ramses Becker',
    'Amad Diallo Traore': 'Amad Diallo',
    'Amari&#039;i Bell': "Amari'i Bell",
    'Ameen Al Dakhil': 'Ameen Al-Dakhil',
    'Andreas Pereira': 'Andreas Hoelgebaum Pereira',
    'André Gomes': 'André Tavares Gomes',
    'Anel Ahmedhodzic': 'Anel Ahmedhodžić',
    'Anis Ben Slimane': 'Anis Slimane',
    'Anssumane Fati': 'Anssumane Fati Vieira',
    'Antony': 'Antony Matheus dos Santos',
    'Ben Brereton Díaz': 'Ben Brereton',
    'Ben White': 'Benjamin White',
    'Benoit Badiashile Mukinayi': 'Benoît Badiashile',
    'Benson Manuel': 'Manuel Benson Hedilazio',
    'Bernardo Silva': 'Bernardo Veiga de Carvalho e Silva',
    'Beto': 'Norberto Bercique Gomes Betuncal',
    'Bobby Reid': 'Bobby De Cordova-Reid',
    'Boubacar Traore': 'Boubacar Traoré',
    'Brandon Aguilera': 'Brandon Aguilera Zamora',
    'Bruno Fernandes': 'Bruno Borges Fernandes',
    'Bruno Guimarães': 'Bruno Guimarães Rodriguez Moura',
    'Carlos Vinicius': 'Carlos Vinícius Alves Morais',
    'Casemiro': 'Carlos Henrique Casimiro',
    'Cheick Oumar Doucoure': 'Cheick Doucouré',
    'Chimuanya Ugochukwu': 'Lesley Ugochukwu',
    'Clement Lenglet': 'Clément Lenglet',
    'Cédric Soares': 'Cédric Alves Soares',
    'Danilo': 'Danilo dos Santos de Oliveira',
    'Dara O\'Shea': 'Dara O\'Shea',
    'Dara O&#039;Shea': "Dara O'Shea",
    'Darwin Núñez': 'Darwin Núñez Ribeiro',
    'David Raya': 'David Raya Martin',
    'Deivid Washington': 'Deivid Washington de Souza Eugênio',
    'Destiny Udogie': 'Iyenoma Destiny Udogie',
    'Diego Carlos': 'Diego Carlos Santos Silva',
    'Diogo Dalot': 'Diogo Dalot Teixeira',
    'Diogo Jota': 'Diogo Teixeira da Silva',
    'Douglas Luiz': 'Douglas Luiz Soares de Paulo',
    'Ederson': 'Ederson Santana de Moraes',
    'Edson Álvarez': 'Edson Álvarez Velázquez',
    'Emiliano Martinez': 'Emiliano Martínez Romero',
    'Emile Smith-Rowe': 'Emile Smith Rowe',
    'Estupiñán': 'Pervis Estupiñán',
    'Fabio Silva': 'Fábio Silva',
    'Fábio Vieira': 'Fábio Ferreira Vieira',
    'Facundo Pellistri': 'Facundo Pellistri Rebollo',
    'Felipe': 'Felipe Augusto de Almeida Monteiro',
    'Fode Toure': 'Fodé Ballo-Touré',
    'Gabriel': 'Gabriel dos Santos Magalhães',
    'Gabriel Jesus': 'Gabriel Fernando de Jesus',
    'Gabriel Martinelli': 'Gabriel Martinelli Silva',
    'Hee-Chan Hwang': 'Hwang Hee-chan',
    'Hugo Bueno': 'Hugo Bueno López',
    'Ibrahim Sangare': 'Ibrahim Sangaré',
    'Igor Julio': 'Igor Julio dos Santos de Paulo',
    'Ionut Radu': 'Ionuț Radu',
    'Issa Kabore': 'Issa Kaboré',
    'Ivan Perisic': 'Ivan Perišić',
    'Iyenoma Destiny Udogie': 'Destiny Udogie',
    'Jack Colback': 'Jack Colback',
    'Jefferson Lerma': 'Jefferson Lerma Solís',
    'Jéremy Doku': 'Jérémy Doku',
    'João Gomes': 'João Victor Gomes da Silva',
    'João Palhinha': 'João Palhinha Gonçalves',
    'João Pedro': 'João Pedro Junqueira de Jesus',
    'Joelinton': 'Joelinton Cássio Apolinário de Lira',
    'Johann Berg Gudmundsson': 'Jóhann Berg Gudmundsson',
    'José Sá': 'José Malheiro de Sá',
    'Josko Gvardiol': 'Joško Gvardiol',
    'Jorginho': 'Jorge Luiz Frello Filho',
    'Joseph Gomez': 'Joe Gomez',
    'Kaine Hayden': 'Kaine Kesler-Hayden',
    'Louis Beyer': 'Jordan Beyer',
    'Lucas Paquetá': 'Lucas Tolentino Coelho de Lima',
    'Mads Andersen': 'Mads Juel Andersen',
    'Mads Roerslev': 'Mads Roerslev Rasmussen',
    'Marc Cucurella': 'Marc Cucurella Saseta',
    'Marc Guehi': 'Marc Guéhi',
    'Martin Odegaard': 'Martin Ødegaard',
    'Mateo Kovacic': 'Mateo Kovačić',
    'Matheus Cunha': 'Matheus França de Oliveira',
    'Matheus França': 'Matheus Santos Carneiro Da Cunha',
    'Matheus Nunes': 'Matheus Luiz Nunes',
    'Matthew Cash': 'Matty Cash',
    'Maxime Estève': 'Maxime Esteve',
    'Miguel Almirón': 'Miguel Almirón Rejala',
    'Moisés Caicedo': 'Moisés Caicedo Corozo',
    'Moussa Niakhate': 'Moussa Niakhaté',
    'Murillo': 'Murillo Santiago Costa dos Santos',
    'Naif Aguerd': 'Nayef Aguerd',
    'Nélson Semedo': 'Nélson Cabral Semedo',
    'Neto': 'Norberto Murara Neto',
    'Nicolo Zaniolo': 'Nicolò Zaniolo',
    'Nuno Tavares': 'Nuno Varela Tavares',
    'Odysseas Vlachodimos': 'Odysseas Vlachodimos',
    'Odisseas Vlachodimos': 'Odysseas Vlachodimos',
    'Ola Aina': 'Olu Aina',
    'Pablo Fornals': 'Pablo Fornals Malla',
    'Pape Sarr': 'Pape Matar Sarr',
    'Pedro Neto': 'Pedro Lomba Neto',
    'Philippe Coutinho': 'Philippe Coutinho Correia',
    'Raphael Varane': 'Raphaël Varane',
    'Rayan Ait Nouri': 'Rayan Aït-Nouri',
    'Richarlison': 'Richarlison de Andrade',
    'Rodri': 'Rodrigo Hernandez',
    'Rodrigo Muniz': 'Rodrigo Muniz Carvalho',
    'Romeo Lavia': 'Roméo Lavia',
    'Rúben Dias': 'Rúben Gato Alves Dias',
    'Ryan John Giles': 'Ryan Giles',
    'Said Benrahma': 'Saïd Benrahma',
    'Sasa Lukic': 'Saša Lukić',
    'Son Heung-Min': 'Son Heung-min',
    'Thiago Alcántara': 'Thiago Alcántara do Nascimento',
    'Thiago Silva': 'Thiago Emiliano da Silva',
    'Tomas Soucek': 'Tomáš Souček',
    'Toti': 'Toti António Gomes',
    'Valentino Livramento': 'Tino Livramento',
    'Victor Kristiansen': 'Victor da Silva',
    'Vinicius Souza': 'Vini de Souza Costa',
    'Vladimir Coufal': 'Vladimír Coufal',
    'Wilfred Ndidi': 'Wilfred Ndidi',
    'Willian': 'Willian Borges da Silva',
    'Yehor Yarmolyuk': 'Yegor Yarmoliuk',
    'Youssef Chermiti': 'Youssef Ramalho Chermiti',
    'Zanka': 'Mathias Jorgensen'
}

In [5]:
# Specify the folder containing your CSV files
folder_path = 'data/2023-24/understat'

# Create an empty list to store individual dataframes
dataframes = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Create the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV into a dataframe
        df = pd.read_csv(file_path)
        
        # Extract the player name from the filename (remove .csv)
        player_file_name = filename[:-4]
        
        # Add a new column 'Player' with the player's name
        player_name = (' ').join(player_file_name.split('_')[:-1])
        player_id = player_file_name.split('_')[-1]
        df['name'] = player_name
        df['player_id'] = player_id
        
        # Hardcode for the two Emersons
        if player_name == 'Emerson':
            if player_id == '1245':
                df['name'] = 'Emerson Palmieri dos Santos'
            elif player_id == '7430':
                df['name'] = 'Emerson Leite de Souza Junior'
        
        # Append the dataframe to the list
        dataframes.append(df)

# Combine all the dataframes into one large dataframe
df_understat_23 = pd.concat(dataframes, ignore_index=True)
df_understat_23 = df_understat_23.drop(['assists', 'position'], axis=1)
df_understat_23['date'] = pd.to_datetime(df_understat_23['date']).dt.date
df_understat_23['name'] = df_understat_23['name'].map(player_mapping).fillna(df_understat_23['name'])
df_understat_23.head()

Unnamed: 0,goals,shots,xG,time,h_team,a_team,h_goals,a_goals,date,id,season,roster_id,xA,key_passes,npg,npxG,xGChain,xGBuildup,name,player_id
0,0,0,0.0,90,Manchester City,West Ham,3,1,2024-05-19,22273,2023,663005,0.050117,1,0,0.0,0.050117,0.050117,Aaron Cresswell,534
1,0,0,0.0,1,West Ham,Luton,3,1,2024-05-11,22263,2023,660607,0.0,0,0,0.0,0.0,0.0,Aaron Cresswell,534
2,0,0,0.0,11,Chelsea,West Ham,5,0,2024-05-05,22249,2023,659355,0.0,0,0,0.0,0.0,0.0,Aaron Cresswell,534
3,0,0,0.0,37,Crystal Palace,West Ham,5,2,2024-04-21,22227,2023,655528,0.0,0,0,0.0,0.0,0.0,Aaron Cresswell,534
4,0,0,0.0,34,Wolverhampton Wanderers,West Ham,1,2,2024-04-06,22214,2023,651828,0.0,0,0,0.0,0.0,0.0,Aaron Cresswell,534


In [6]:
df_merged_23 = pd.merge(df_2324, df_understat_23, how='left', left_on=['name', 'kickoff_date'], right_on=['name', 'date'])
print(len(df_merged_23), df_merged_23.goals.isna().sum())

29725 18318


In [7]:
# Remove players who play less than 5 games all season
for player in sorted(df_merged_23.name.unique()):
    total_games = len(df_merged_23[df_merged_23.name==player])
    games_not_played = sum(df_merged_23[df_merged_23.name==player].minutes == 0)
    mins_played = sum(df_merged_23[df_merged_23.name==player].minutes)
    nans = sum(df_merged_23[df_merged_23.name==player].goals.isna())
    
    if total_games - games_not_played < 5:
#         print(total_games,games_not_played,mins_played)
        df_merged_23 = df_merged_23[df_merged_23.name != player]

print(len(df_merged_23), df_merged_23.goals.isna().sum())

18033 6823


In [8]:
merged_names = set(df_merged_23.name.unique())
understat_names = set(df_understat_23.name)
fpl_names = set(df_2324.name)
merged_names-understat_names

set()

In [9]:
# mins_played = []
# for player in sorted(df_merged.name.unique()):
#     min_played = sum(df_merged[df_merged.name==player].minutes)
#     mins_played.append(min_played)
    
# plt.figure()
# plt.hist(mins_played, bins=50)
# plt.show()

### 2022-23

In [10]:
df_teams_22 = pd.read_csv("data/2022-23/teams.csv")
team_dict_22 = {}
for row in df_teams_22.iloc:
    team_dict_22[row['id']] = row['name']

In [11]:
df_2223 = pd.read_csv("data/2022-23/gws/merged_gw.csv")
df_2223['kickoff_date'] = pd.to_datetime(df_2223['kickoff_time']).dt.date
df_2223['opponent_team'] = df_2223['opponent_team'].map(team_dict_22).fillna(df_2223['opponent_team'])
print(len(df_2223))
df_2223.head()

26505


Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,kickoff_date
0,Nathan Redmond,MID,Southampton,1.5,0,0,3,0,0.0,403,...,0.0,1,0,0,0,55,False,0,1,2022-08-06
1,Junior Stanislas,MID,Bournemouth,1.1,0,0,3,0,0.0,58,...,0.0,1,0,0,0,50,True,0,1,2022-08-06
2,Armando Broja,FWD,Chelsea,2.0,0,0,3,0,0.3,150,...,19.0,1,0,0,0,55,False,0,1,2022-08-06
3,Fabian Schär,DEF,Newcastle,2.4,0,3,43,1,14.6,366,...,25.0,15,0,0,0,45,True,0,1,2022-08-06
4,Jonny Evans,DEF,Leicester,1.9,0,0,15,0,1.3,249,...,0.0,1,0,0,0,45,True,0,1,2022-08-07


In [50]:
player_mapping2 = {}

# Load player mapping dict
df_player_mapping = pd.read_csv('data/2022-23/id_dict.csv')
for row in df_player_mapping.iloc:
    if row['Understat_Name'] != row['FPL_Name']:
        player_mapping2[row['Understat_Name']] = row['FPL_Name']
        
# Manually add mistakes
player_mapping2['Lewis O&#039;Brien'] = "Lewis O'Brien"
player_mapping2['Son Heung-Min'] = 'Son Heung-min'
player_mapping2['Diogo Jota'] = 'Diogo Teixeira da Silva'

In [51]:
# Specify the folder containing your CSV files
folder_path = 'data/2022-23/understat'

# Create an empty list to store individual dataframes
dataframes = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Create the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV into a dataframe
        df = pd.read_csv(file_path)
        
        # Extract the player name from the filename (remove .csv)
        player_file_name = filename[:-4]
        
        # Add a new column 'Player' with the player's name
        player_name = (' ').join(player_file_name.split('_')[:-1])
        player_id = player_file_name.split('_')[-1]
        df['name'] = player_name
        df['player_id'] = player_id
        
        # Hardcode for the two Emersons
        if player_name == 'Emerson':
            if player_id == '1245':
                df['name'] = 'Emerson Palmieri dos Santos'
            elif player_id == '7430':
                df['name'] = 'Emerson Leite de Souza Junior'
        
        # Append the dataframe to the list
        dataframes.append(df)

# Combine all the dataframes into one large dataframe
df_understat_22 = pd.concat(dataframes, ignore_index=True)
df_understat_22 = df_understat_22.drop(['assists', 'position'], axis=1)
df_understat_22['date'] = pd.to_datetime(df_understat_22['date']).dt.date
# df_understat_22['name'] = df_understat_22['name'].map(player_mapping).fillna(df_understat_22['name'])
df_understat_22['name'] = df_understat_22['name'].map(player_mapping2).fillna(df_understat_22['name'])
df_understat_22.head()

Unnamed: 0,goals,shots,xG,time,h_team,a_team,h_goals,a_goals,date,id,season,roster_id,xA,key_passes,npg,npxG,xGChain,xGBuildup,name,player_id
0,0,0,0.0,72,Leicester,West Ham,2,1,2023-05-28,18579,2022,601199,0.0,0,0,0.0,0.129769,0.129769,Aaron Cresswell,534
1,0,0,0.0,90,West Ham,Manchester United,1,0,2023-05-07,18550,2022,595569,0.035077,1,0,0.0,0.035077,0.0,Aaron Cresswell,534
2,0,0,0.0,90,Manchester City,West Ham,3,0,2023-05-03,18478,2022,593984,0.0,0,0,0.0,0.051518,0.051518,Aaron Cresswell,534
3,0,0,0.0,39,Crystal Palace,West Ham,4,3,2023-04-29,18536,2022,592169,0.0,0,0,0.0,0.0,0.0,Aaron Cresswell,534
4,0,0,0.0,90,West Ham,Liverpool,1,2,2023-04-26,18526,2022,591622,0.078905,1,0,0.0,0.068012,0.068012,Aaron Cresswell,534


In [52]:
df_merged_22 = pd.merge(df_2223, df_understat_22, how='left', left_on=['name', 'kickoff_date'], right_on=['name', 'date'])
print(len(df_merged_22), df_merged_22.goals.isna().sum())

26505 15094


In [53]:
# Remove players who play less than 5 games all season
for player in sorted(df_merged_22.name.unique()):
    total_games = len(df_merged_22[df_merged_22.name==player])
    games_not_played = sum(df_merged_22[df_merged_22.name==player].minutes == 0)
    mins_played = sum(df_merged_22[df_merged_22.name==player].minutes)
    nans = sum(df_merged_22[df_merged_22.name==player].goals.isna())
    
    if total_games - games_not_played < 5:
#         print(total_games,games_not_played,mins_played)
        df_merged_22 = df_merged_22[df_merged_22.name != player]

print(len(df_merged_22), df_merged_22.goals.isna().sum())

17198 5976


In [49]:
merged_names = set(df_merged_22.name.unique())
understat_names = set(df_understat_22.name)
fpl_names = set(df_2223.name)
merged_names - understat_names

set()

In [None]:
# mins_played = []
# for player in sorted(df_merged_22.name.unique()):
#     min_played = sum(df_merged_22[df_merged_22.name==player].minutes)
#     mins_played.append(min_played)
    
# plt.figure()
# plt.hist(mins_played, bins=50)
# plt.show()

### 2021-22

In [54]:
df_teams_21 = pd.read_csv("data/2021-22/teams.csv")
team_dict_21 = {}
for row in df_teams_21.iloc:
    team_dict_21[row['id']] = row['name']

In [55]:
df_2122 = pd.read_csv("data/2021-22/gws/merged_gw.csv")
df_2122['kickoff_date'] = pd.to_datetime(df_2122['kickoff_time']).dt.date
df_2122['opponent_team'] = df_2122['opponent_team'].map(team_dict_21).fillna(df_2122['opponent_team'])
print(len(df_2122))
df_2122.head()

25447


Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,kickoff_date
0,Eric Bailly,DEF,Man Utd,0.0,0,0,0,0,0.0,286,...,0.0,0,0,0,0,50,True,0,1,2021-08-14
1,Keinan Davis,FWD,Aston Villa,0.4,0,0,0,0,0.0,49,...,0.0,0,0,0,0,45,False,0,1,2021-08-14
2,Ayotomiwa Dele-Bashiru,MID,Watford,0.0,0,0,0,0,0.0,394,...,0.0,0,0,0,0,45,True,0,1,2021-08-14
3,James Ward-Prowse,MID,Southampton,2.3,0,0,20,0,30.5,341,...,0.0,2,0,0,0,65,False,0,1,2021-08-14
4,Bruno Miguel Borges Fernandes,MID,Man Utd,4.4,0,3,61,0,35.9,277,...,59.0,20,0,0,0,120,True,0,1,2021-08-14


In [56]:
player_mapping3 = {
    'Adama Traoré Diarra': 'Adama Traoré',
    'Alex Telles': 'Alex Nicolao Telles',
    'Allan': 'Allan Marques Loureiro',
    'André Tavares Gomes': 'André Filipe Tavares Gomes',
    'Dele Alli': 'Bamidele Alli',
    'Benjamin White': 'Ben White',
    'Ben Chilwell': 'Benjamin Chilwell',
    'Bernardo Veiga de Carvalho e Silva': 'Bernardo Mota Veiga de Carvalho e Silva',
    'Bruno Borges Fernandes': 'Bruno Miguel Borges Fernandes',
    'Cédric Alves Soares': 'Cédric Soares',
    'Chiquinho': 'Francisco Jorge Tomás Oliveira',
    'David De Gea Quintana': 'David de Gea',
    'Diogo Dalot Teixeira': 'José Diogo Dalot Teixeira',
    'Diogo Teixeira da Silva': 'Diogo Jota',
    'Eddie Nketiah': 'Edward Nketiah',
    'Emerson Leite de Souza Junior': 'Emerson Aparecido Leite de Souza Junior',
    'Emiliano Martínez Romero': 'Emiliano Martínez',
    'Fábio Silva': 'Fabio Silva',
    'Fernandinho': 'Fernando Luiz Rosa',
    'Fabián Balbuena': 'Francisco Femenía Far',
    'Trincão': 'Francisco Machado Mota de Castro Trincão',
    'Gabriel dos Santos Magalhães': 'Gabriel Magalhães',
    'Gabriel Martinelli Silva': 'Gabriel Teodoro Martinelli Silva',
    'Hwang Hee-chan': 'Hee-Chan Hwang',
    'Son Heung-min': 'Heung-Min Son',
    'Júnior Firpo': 'Héctor Junior Firpo Adames',
    'Imran Louza': 'Imrân Louza',
    'Jeremy Sarmiento Morante': 'Jeremy Sarmiento',
    'Jóhann Berg Gudmundsson': 'Johann Berg Gudmundsson',
    'Joe Willock': 'Joseph Willock',
    'João Cancelo': 'João Pedro Cavaco Cancelo',
    'Juan Camilo Hernández': 'Juan Camilo Hernández Suárez',
    'Lyanco': 'Lyanco Evangelista Silveira Neves Vojnovic',
    'Lyanco Silveira Neves Vojnovic': 'Lyanco Evangelista Silveira Neves Vojnovic',
    'Marc Cucurella Saseta': 'Marc Cucurella',
    'Matty Cash': 'Matthew Cash',
    'Miguel Almirón Rejala': 'Miguel Almirón',
    'Mohamed Elneny': 'Mohamed Naser El Sayed Elneny',
    'Moisés Caicedo Corozo': 'Moisés Caicedo',
    'Nicolas N&#039;Koulou': 'Nicolas Pépé',
    'Oghenekaro Etebo': 'Oghenekaro Peter Etebo',
    'Oriol Romeu': 'Oriol Romeu Vidal',
    'Pablo Fornals Malla': 'Pablo Fornals',
    'Josh Dasilva': 'Pelenda Joshua Dasilva',
    'Raphinha': 'Raphael Dias Belloli',
    'Rayan Aït-Nouri': 'Rayan Ait Nouri',
    'Ricardo Barbosa Pereira': 'Ricardo Domingos Barbosa Pereira',
    'Ricardo Pereira': 'Ricardo Domingos Barbosa Pereira',
    'Romain Saiss': 'Romain Saïss',
    'Rúben Vinagre': 'Rúben Diogo da Silva Neves',
    'Rúben da Silva Neves': 'Rúben Santos Gato Alves Dias',
    'Samir': 'Samir Caetano de Souza Santos',
    'Sergi Canós Tenés': 'Sergi Canós',
    'Solly March': 'Solomon March',
    'Tanguy NDombele Alvaro': 'Tanguy Ndombele',
}


In [77]:
player_mapping3 = {}

# Load player mapping dict
df_player_mapping = pd.read_csv('data/2021-22/id_dict.csv')
for row in df_player_mapping.iloc:
    if row[' Understat_Name'] != row[' FPL_Name']:
        player_mapping3[row[' Understat_Name']] = row[' FPL_Name']
        
# # Manually add mistakes
player_mapping3['Emerson Leite de Souza Junior'] = 'Emerson Aparecido Leite de Souza Junior'
# player_mapping3['Son Heung-Min'] = 'Son Heung-min'
# player_mapping3['Diogo Jota'] = 'Diogo Teixeira da Silva'

In [78]:
# Specify the folder containing your CSV files
folder_path = 'data/2021-22/understat'

# Create an empty list to store individual dataframes
dataframes = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Create the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV into a dataframe
        df = pd.read_csv(file_path)
        
        # Extract the player name from the filename (remove .csv)
        player_file_name = filename[:-4]
        
        # Add a new column 'Player' with the player's name
        player_name = (' ').join(player_file_name.split('_')[:-1])
        player_id = player_file_name.split('_')[-1]
        df['name'] = player_name
        df['player_id'] = player_id
        
        # Hardcode for the two Emersons
        if player_name == 'Emerson':
            if player_id == '1245':
                df['name'] = 'Emerson Palmieri dos Santos'
            elif player_id == '7430':
                df['name'] = 'Emerson Leite de Souza Junior'
        
        # Append the dataframe to the list
        dataframes.append(df)

# Combine all the dataframes into one large dataframe
df_understat_21 = pd.concat(dataframes, ignore_index=True)
df_understat_21 = df_understat_21.drop(['assists', 'position'], axis=1)
df_understat_21['date'] = pd.to_datetime(df_understat_21['date']).dt.date
# df_understat_21['name'] = df_understat_21['name'].map(player_mapping).fillna(df_understat_21['name'])
# df_understat_21['name'] = df_understat_21['name'].map(player_mapping2).fillna(df_understat_21['name'])
df_understat_21['name'] = df_understat_21['name'].map(player_mapping3).fillna(df_understat_21['name'])
df_understat_21.head()

Unnamed: 0,goals,shots,xG,time,h_team,a_team,h_goals,a_goals,date,id,season,roster_id,xA,key_passes,npg,npxG,xGChain,xGBuildup,name,player_id
0,0,0,0.0,64,Brighton,Wolverhampton Wanderers,0,1,2021-12-15,16538,2021,503282,0.0,0,0,0.0,0.0,0.0,Aaron Connolly,7991
1,0,1,0.036493,34,Southampton,Brighton,1,1,2021-12-04,16521,2021,499911,0.380411,1,0,0.036493,0.416904,0.0,Aaron Connolly,7991
2,0,0,0.0,12,Crystal Palace,Brighton,1,1,2021-09-27,16435,2021,486003,0.0,0,0,0.0,0.0,0.0,Aaron Connolly,7991
3,0,1,0.562996,42,Brighton,Watford,2,0,2021-08-21,16391,2021,477240,0.0,0,0,0.562996,0.591031,0.028035,Aaron Connolly,7991
4,0,1,0.051674,44,Arsenal,Brighton,2,0,2021-05-23,14805,2020,473529,0.010887,1,0,0.051674,0.06256,0.0,Aaron Connolly,7991


In [79]:
df_merged_21 = pd.merge(df_2122, df_understat_21, how='left', left_on=['name', 'kickoff_date'], right_on=['name', 'date'])
print(len(df_merged_21), df_merged_21.goals.isna().sum())

25447 14866


In [80]:
# Remove players who play less than 5 games all season
for player in sorted(df_merged_21.name.unique()):
    total_games = len(df_merged_21[df_merged_21.name==player])
    games_not_played = sum(df_merged_21[df_merged_21.name==player].minutes == 0)
    mins_played = sum(df_merged_21[df_merged_21.name==player].minutes)
    nans = sum(df_merged_21[df_merged_21.name==player].goals.isna())
    
    if total_games - games_not_played < 5:
#         print(total_games,games_not_played,mins_played)
        df_merged_21 = df_merged_21[df_merged_21.name != player]

print(len(df_merged_21), df_merged_21.goals.isna().sum())

16585 6246


In [81]:
merged_names = set(df_merged_21.name.unique())
understat_names = set(df_understat_21.name)
fpl_names = set(df_2223.name)
merged_names - understat_names

set()

### Merge

In [186]:
# Assuming df_merged_21, df_merged_22, df_merged_23 are already defined
df_all = pd.concat([df_merged_21, df_merged_22, df_merged_23], ignore_index=True)

# Optionally, reset the index
df_all.reset_index(drop=True, inplace=True)

# Display the combined DataFrame
print(len(df_all))

51816


In [187]:
# df_all.to_csv("merged_seasons.csv", index=False)

# Make training data

In [212]:
fpl_name_mapping = {
    'Adama Traoré Diarra': 'Adama Traoré',
    'André Filipe Tavares Gomes': 'André Tavares Gomes',
    'Arnaut Danjuma Groeneveld': 'Arnaut Danjuma',
    'Benjamin Chilwell': 'Ben Chilwell',
    'Benjamin White': 'Ben White',
    'Bernardo Mota Veiga de Carvalho e Silva': 'Bernardo Veiga de Carvalho e Silva',
    'Bruno Miguel Borges Fernandes': 'Bruno Borges Fernandes',
    'Cédric Alves Soares': 'Cédric Soares',
    'David de Gea': 'David De Gea Quintana',
    'Diogo Teixeira da Silva': 'Diogo Jota',
    'Emerson Aparecido Leite de Souza Junior': 'Emerson Leite de Souza Junior',
    'Emiliano Martínez Romero': 'Emiliano Martínez',
    'Gabriel Teodoro Martinelli Silva': 'Gabriel Martinelli Silva',
    'Héctor Junior Firpo Adames': 'Junior Firpo Adames',
    'Hee-Chan Hwang': 'Hwang Hee-Chan',
    'Hwang Hee-chan': 'Hwang Hee-Chan',
    'Heung-Min Son': 'Son Heung-Min',
    'Son Heung-min': 'Son Heung-Min',
    'Jeremy Sarmiento Morante': 'Jeremy Sarmiento',
    'João Pedro Cavaco Cancelo': 'João Cancelo',
    'Joseph Gomez': 'Joe Gomez',
    'Joseph Willock': 'Joe Willock',
    'Luis Sinisterra Lucumí': 'Luis Sinisterra',
    'Lyanco Evangelista Silveira Neves Vojnovic': 'Lyanco Silveira Neves Vojnovic',
    'Marc Cucurella Saseta': 'Marc Cucurella',
    'Mateo Kovačić': 'Mateo Kovacic',
    'Matthew Cash': 'Matty Cash',
    'Miguel Almirón Rejala': 'Miguel Almirón',
    'Mohamed Naser El Sayed Elneny': 'Mohamed Elneny',
    'Moisés Caicedo Corozo': 'Moisés Caicedo',
    'Pablo Fornals Malla': 'Pablo Fornals',
    'Rayan Ait Nouri': 'Rayan Aït-Nouri',
    'Ricardo Domingos Barbosa Pereira': 'Ricardo Barbosa Pereira',
    'Rúben Diogo da Silva Neves': 'Rúben da Silva Neves',
    'Rúben Santos Gato Alves Dias': 'Rúben Gato Alves Dias',
    'Sergi Canós Tenés': 'Sergi Canós',
    'Vladimir Coufal': 'Vladimír Coufal',
    
}

In [213]:
df_all['name'] = df_all['name'].map(fpl_name_mapping).fillna(df_all['name'])

In [215]:
# sorted(df_all.name.unique())

In [None]:
df_all.columns

In [None]:
# exp_gls = sorted(df_all.xG)
# plt.figure()
# plt.hist(exp_gls, bins=250)
# plt.show()

In [None]:
# exp_gls = sorted(df_all.expected_goals)
# plt.figure()
# plt.hist(exp_gls, bins=250)
# plt.show()

In [None]:
# keys_to_select = [
#     'goals_scored',
#     'xG',
#     'npxG', 
#     'expected_goals',
#     'xA',
#     'expected_assists', 
#     'key_passes',
#     'xGChain', 
#     'xGBuildup',
#     'expected_goal_involvements', 
#     'expected_goals_conceded',
# ]

### Define key stats and mappings

In [None]:
keys_to_select = [
    # Overall
    'total_points',
    'minutes',
    'transfers_in',
    'transfers_out',

    
    # Attacking
    'goals_scored',
    'assists',
    'shots',
    'xG',
    'xA',
    'key_passes',
    'npg', 
    'npxG', 
    'xGChain', 
    'xGBuildup',
    'expected_assists', 
    'expected_goal_involvements', 
    'expected_goals',
    
    # Defensive
    'clean_sheets',
    'goals_conceded',
    'saves',
    'penalties_saved',
    'expected_goals_conceded',
    'own_goals',
    
    # Cards
    'yellow_cards',
    'red_cards',
    
    # Penalties
    'penalties_missed',
    
    # Bonus and ICT
    'bonus',
    'bps',
    'threat',
    'influence',
    'creativity',
]

In [None]:
# Define team mapping
team_mapping = {
    'Arsenal': 1,
    'Aston Villa': 2,
    'Bournemouth': 3,
    'Brentford': 4,
    'Brighton': 5,
    'Chelsea': 6,
    'Crystal Palace': 7,
    'Everton': 8,
    'Fulham': 9,
    'Ipswich': 10,
    'Leicester': 11,
    'Liverpool': 12,
    'Man City': 13,
    'Man Utd': 14,
    'Newcastle': 15,
    'Nott\'m Forest': 16,
    'Southampton': 17,
    'Spurs': 18,
    'West Ham': 19,
    'Wolves': 20,
    'Burnley': 21,
    'Luton': 22,
    'Sheffield Utd': 23,
    'Leeds': 24,
    'Watford': 25,
    'West Brom': 26,
    'Norwich': 27
}

In [None]:
# Define position mapping
pos_mapping = {'GK': 1,
              'GKP': 1,
              'DEF': 2,
              'MID': 3,
              'FWD': 4}

In [None]:
def one_hot_encode_team(team_id, num_teams=20):
    # Create a zero array of length num_teams
    one_hot = np.zeros(num_teams)
    
    # Set the correct index to 1 (team_id - 1 because team_id starts from 1)
    one_hot[team_id - 1] = 1
    
    return one_hot

In [None]:
# for col in df_all.columns:
#     print(col, sum(df_all[col].isna()))

In [None]:
df_all.opponent_team.value_counts()

In [None]:
# Fill all NaN's with 0
df_all = df_all.fillna(0)

# Add team and position IDs
df_all['team_id'] = df_all['team'].map(team_mapping)
df_all['opp_team_id'] = df_all['opponent_team'].map(team_mapping)
df_all['pos_id'] = df_all['position'].map(pos_mapping)

# Show
df_all.head()

### All seasons combined

In [None]:
# Load data
df_all = pd.read_csv("data/cleaned_merged_seasons.csv")

# Drop NaNs
df_all = df_all.dropna(subset=['team_x'])
df_all = df_all.dropna(subset=keys_to_select_part)

# Add team and position IDs
df_all['team_id'] = df_all['team_x'].map(team_mapping)
df_all['opp_team_id'] = df_all['opp_team_name'].map(team_mapping)
df_all['pos_id'] = df_all['position'].map(pos_mapping)

# Show
df_all.head()

In [None]:
for key in keys_to_select_part:
    num_nans = df_all[key].isna().sum()  # Counts the number of NaNs in each column
    print(f"{key}: {num_nans} NaNs")

In [None]:
X = []
y = []

for player in df_all.name.unique():
    df_temp = df_all[df_all.name == player].copy()
    if len(df_temp) < 6:
        continue
    
    for i in range(5, len(df_temp)):
        
        # Select row for next game
        next_game = df_temp.iloc[i]
        
#         if next_game['minutes'] == 0:
#             continue 
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['team_id'], 27)
        opponent_team = one_hot_encode_team(next_game['opp_team_id'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(next_game['pos_id'], 4)
        
        # Combine into one vector
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Get player stats from previous games
        for j in range(1,6):
            row_temp = df_temp.iloc[i-j]
            
            # Get home/away encoding
            home_away = one_hot_encode_team(row_temp['was_home'], 2)
            
            # Add player team goals and opponent team goals
            if row_temp.was_home:
                score = np.array(row_temp[['team_h_score', 'team_a_score']].astype(float).values)
            else:
                score = np.array(row_temp[['team_a_score','team_h_score']].astype(float).values)
            
            # Select the wanted stats and convert to float numpy array
            selected_stats = np.array(row_temp[keys_to_select].astype(float).values) 
                
            # Combine into one vector
            combined_stats = np.concatenate([combined_stats, home_away, score, selected_stats])
            
        # Append combined stats to X and points (y value) to y
        X.append(combined_stats)
        y.append(points)
        
        
# Convert lists to numpy arrays for training/testing
X = np.array(X)
y = np.array(y)

print("Training data (X):", X.shape)
print("Target values (y):", y.shape)

# Load data 2024-25

### Load overall data

In [None]:
# URL for the FPL API
url = "https://fantasy.premierleague.com/api/bootstrap-static/"

# Make an HTTP GET request to fetch the data
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    print(data.keys())

    # Save the data to a file
    with open("data/overall_data.json", "w") as f:
        json.dump(data, f)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
    
    
# Load the saved JSON data from the file
with open("data/overall_data.json", "r") as f:
    data = json.load(f)

In [None]:
# Extract data
events = data['events']
game_settings = data['game_settings']
phases = data['phases']
teams = data['teams']
total_players = data['total_players']
players = data['elements']
element_stats = data['element_stats']
element_types = data['element_types']

In [None]:
player_overall_stats = {}
for player in players:
    player_id = str(player['id'])
    player_overall_stats[player_id] = player

In [None]:
# Show players with highest xPoints for next gameweek
for player in players:
    if float(player['ep_next']) > 7:
        print(player['web_name'], player['ep_next'])

### Load fixture information

In [None]:
# Get all fixtures from the FPL API
fixtures_url = "https://fantasy.premierleague.com/api/fixtures/"
fixtures_response = requests.get(fixtures_url)

if response.status_code == 200:
    # Parse the fixture data
    fixtures = fixtures_response.json()
    fixtures_data = {str(element['id']): element for element in fixtures}

    # Save the data to a file
    with open("data/fixture_data.json", "w") as f:
        json.dump(fixtures_data, f)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
    
# Load the saved JSON data from the file
with open("data/fixture_data.json", "r") as f:
    fixtures_data = json.load(f)

### Load player stats

In [None]:
player_stats = {}
for player_id in player_overall_stats.keys():
#     print(player_id)
    url = f"https://fantasy.premierleague.com/api/element-summary/{player_id}/"
    response = requests.get(url)
    
    if response.status_code == 200:
        # Get player stats for played gameweeks
        player_history = response.json()['history']
        for i in range(len(player_history)):
            player_fixture = fixtures_data[str(player_history[i]['fixture'])]
            if player_history[i]['was_home'] == True:
                player_history[i]['player_team'] = player_fixture['team_h']
                player_history[i]['team_goals'] = player_fixture['team_h_score']
            else:
                player_history[i]['player_team'] = player_fixture['team_a']
                player_history[i]['team_goals'] = player_fixture['team_a_score']
        player_history = {str(element['round']): element for element in player_history}
        
        # Get player upcoming fixtures
        player_fixtures = response.json()['fixtures']
        player_fixtures = {str(element['event']): element for element in player_fixtures}
        
        # Add to overall player stats dictionary
        player_stats[player_id] = {'fixtures': player_fixtures, 'history': player_history}
    
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        
# Save the data to a file
with open("data/player_data.json", "w") as f:
    json.dump(player_stats, f)

# Load the saved JSON data from the file
with open("data/player_data.json", "r") as f:
    player_stats = json.load(f)

In [None]:
player_stats['1']['history']['1']

### Load gameweek data

In [None]:
gameweeks_data = {}

for i in range(1, 9):
    url = f"https://fantasy.premierleague.com/api/event/{i}/live/"
    response = requests.get(url)
    
    # Convert each gameweek list of players to dict with player ID as key
    elements_list = response.json()['elements']
    elements_dict = {str(element['id']): element for element in elements_list}
    
    if response.status_code == 200:
        gameweeks_data[f'gameweek_{i}'] = elements_dict
    else:
        print(f"Failed to retrieve gameweek {i} data. Status code: {response.status_code}")
        
# Save the data to a file
with open("data/gameweeks_data.json", "w") as f:
    json.dump(gameweeks_data, f)

# Load the saved JSON data from the file
with open("data/gameweeks_data.json", "r") as f:
    gameweeks_data = json.load(f)

# Make training data

In [None]:
keys_to_select = [
    # Overall
    'total_points',
    'starts',
    'minutes',
    
    # Attacking
    'goals_scored',
    'assists',
    'expected_goals', #
    'expected_assists', #
    
    # Defensive
    'clean_sheets',
    'goals_conceded',
    'saves',
    'penalties_saved',
    'expected_goals_conceded', #
    'own_goals',
    
    # Cards
    'yellow_cards',
    'red_cards',
    
    # Penalties
    'penalties_missed',
    
    # Bonus and ICT
    'bonus',
    'bps',
    'threat',
    'influence',
    'creativity',
]

keys_to_select_part = [
    # Overall
    'total_points',
#     'starts',
    'minutes',
    'transfers_in',
    'transfers_out',

    
    # Attacking
    'goals_scored',
    'assists',
#     'expected_goals',
#     'expected_assists',
    
    # Defensive
    'clean_sheets',
    'goals_conceded',
    'saves',
    'penalties_saved',
#     'expected_goals_conceded',
    'own_goals',
    
    # Cards
    'yellow_cards',
    'red_cards',
    
    # Penalties
    'penalties_missed',
    
    # Bonus and ICT
    'bonus',
    'bps',
    'threat',
    'influence',
    'creativity',
]

### Gameweeks 2

In [None]:
# Initialize empty lists for features and target values
X_GW2 = []
y_GW2 = []
ids_GW2 = []

player_ids = gameweeks_data['gameweek_2'].keys()

for player_id in player_ids:
    if '1' in player_stats[player_id]['history'].keys():
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['2']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']

        # Get GW1 stats for player
        gw1_row = player_stats[player_id]['history']['1']
        gw1_home_away = one_hot_encode_team(gw1_row['was_home'], 2)
        if gw1_row['was_home']:
            gw1_score = np.array([gw1_row['team_h_score'], gw1_row['team_a_score']])
        else:
            gw1_score = np.array([gw1_row['team_a_score'], gw1_row['team_h_score']])
        selected_stats = {key: gw1_row[key] for key in keys_to_select_part if key in gw1_row}
        gw1_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        gw1_combined = np.concatenate([gw1_home_away, gw1_score, gw1_stats])
        
        # Combine all stats
        combined_stats = np.concatenate([player_team,
                                        opponent_team,
                                        home_away,
                                        position,
                                        gw1_combined,
                                        gw1_combined,
                                        gw1_combined,
                                        gw1_combined,
                                        gw1_combined
                                        ])
        
        # Append combined stats to X and points (y value) to y
        X_GW2.append(combined_stats)
        y_GW2.append(points)
        ids_GW2.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW2 = np.array(X_GW2)
y_GW2 = np.array(y_GW2)

print("Training data (X):", X_GW2.shape)
print("Target values (y):", y_GW2.shape)

### Gameweek 3

In [None]:
# Initialize empty lists for features and target values
X_GW3 = []
y_GW3 = []
ids_GW3 = []

player_ids = gameweeks_data['gameweek_3'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['1', '2']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['3']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Get GW2 stats for player
        gw2_row = player_stats[player_id]['history']['2']
        gw2_home_away = one_hot_encode_team(gw2_row['was_home'], 2)
        if gw2_row['was_home']:
            gw2_score = np.array([gw2_row['team_h_score'], gw2_row['team_a_score']])
        else:
            gw2_score = np.array([gw2_row['team_a_score'], gw2_row['team_h_score']])
        selected_stats = {key: gw2_row[key] for key in keys_to_select_part if key in gw2_row}
        gw2_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        gw2_combined = np.concatenate([gw2_home_away, gw2_score, gw2_stats])
        
        # Get GW1 stats for player
        gw1_row = player_stats[player_id]['history']['1']
        gw1_home_away = one_hot_encode_team(gw1_row['was_home'], 2)
        if gw1_row['was_home']:
            gw1_score = np.array([gw1_row['team_h_score'], gw1_row['team_a_score']])
        else:
            gw1_score = np.array([gw1_row['team_a_score'], gw1_row['team_h_score']])
        selected_stats = {key: gw1_row[key] for key in keys_to_select_part if key in gw1_row}
        gw1_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        gw1_combined = np.concatenate([gw1_home_away, gw1_score, gw1_stats])

        # Combine all stats
        combined_stats = np.concatenate([player_team,
                                        opponent_team,
                                        home_away,
                                        position,
                                        gw2_combined,
                                        gw1_combined,
                                        gw1_combined,
                                        gw1_combined,
                                        gw1_combined
                                        ])
        
        # Append combined stats to X and points (y value) to y
        X_GW3.append(combined_stats)
        y_GW3.append(points)
        ids_GW3.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW3 = np.array(X_GW3)
y_GW3 = np.array(y_GW3)

print("Training data (X):", X_GW3.shape)
print("Target values (y):", y_GW3.shape)

### Gameweek 4

In [None]:
# Initialize empty lists for features and target values
X_GW4 = []
y_GW4 = []
ids_GW4 = []

player_ids = gameweeks_data['gameweek_4'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['1', '2', '3']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['4']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Combine next game stats
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get GW1 stats for player
        for i in range(3,0,-1):
            gw_row = player_stats[player_id]['history'][str(i)]
            gw_home_away = one_hot_encode_team(gw_row['was_home'], 2)
            if gw_row['was_home']:
                gw_score = np.array([gw_row['team_h_score'], gw_row['team_a_score']])
            else:
                gw_score = np.array([gw_row['team_a_score'], gw_row['team_h_score']])
            selected_stats = {key: gw_row[key] for key in keys_to_select_part if key in gw_row}
            gw_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
            gw_combined = np.concatenate([gw_home_away, gw_score, gw_stats])
            
            # Add gameweek stats to combined
            combined_stats = np.concatenate([combined_stats, gw_combined])
        
        # Fill the remaining ones with duplicates of gw 1
        combined_stats = np.concatenate([combined_stats, gw_combined, gw_combined])
        
        # Append combined stats to X and points (y value) to y
        X_GW4.append(combined_stats)
        y_GW4.append(points)
        ids_GW4.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW4 = np.array(X_GW4)
y_GW4 = np.array(y_GW4)

print("Training data (X):", X_GW4.shape)
print("Target values (y):", y_GW4.shape)

### Gameweek 5

In [None]:
# Initialize empty lists for features and target values
X_GW5 = []
y_GW5 = []
ids_GW5 = []

player_ids = gameweeks_data['gameweek_5'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['1', '2', '3', '4']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['5']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Combine next game stats
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get GW1 stats for player
        for i in range(4,0,-1):
            gw_row = player_stats[player_id]['history'][str(i)]
            gw_home_away = one_hot_encode_team(gw_row['was_home'], 2)
            if gw_row['was_home']:
                gw_score = np.array([gw_row['team_h_score'], gw_row['team_a_score']])
            else:
                gw_score = np.array([gw_row['team_a_score'], gw_row['team_h_score']])
            selected_stats = {key: gw_row[key] for key in keys_to_select_part if key in gw_row}
            gw_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
            gw_combined = np.concatenate([gw_home_away, gw_score, gw_stats])
            
            # Add gameweek stats to combined
            combined_stats = np.concatenate([combined_stats, gw_combined])
        
        # Fill the remaining ones with duplicates of gw 1
        combined_stats = np.concatenate([combined_stats, gw_combined])
        
        # Append combined stats to X and points (y value) to y
        X_GW5.append(combined_stats)
        y_GW5.append(points)
        ids_GW5.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW5 = np.array(X_GW5)
y_GW5 = np.array(y_GW5)

print("Training data (X):", X_GW5.shape)
print("Target values (y):", y_GW5.shape)

### Gameweek 6

In [None]:
# Initialize empty lists for features and target values
X_GW6 = []
y_GW6 = []
ids_GW6 = []

player_ids = gameweeks_data['gameweek_6'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['1', '2', '3', '4', '5']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['6']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Combine next game stats
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get GW1 stats for player
        for i in range(5,0,-1):
            gw_row = player_stats[player_id]['history'][str(i)]
            gw_home_away = one_hot_encode_team(gw_row['was_home'], 2)
            if gw_row['was_home']:
                gw_score = np.array([gw_row['team_h_score'], gw_row['team_a_score']])
            else:
                gw_score = np.array([gw_row['team_a_score'], gw_row['team_h_score']])
            selected_stats = {key: gw_row[key] for key in keys_to_select_part if key in gw_row}
            gw_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
            gw_combined = np.concatenate([gw_home_away, gw_score, gw_stats])
            
            # Add gameweek stats to combined
            combined_stats = np.concatenate([combined_stats, gw_combined])
        
        # Append combined stats to X and points (y value) to y
        X_GW6.append(combined_stats)
        y_GW6.append(points)
        ids_GW6.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW6 = np.array(X_GW6)
y_GW6 = np.array(y_GW6)

print("Training data (X):", X_GW6.shape)
print("Target values (y):", y_GW6.shape)

### Gameweek 7

In [None]:
# Initialize empty lists for features and target values
X_GW7 = []
y_GW7 = []
ids_GW7 = []

player_ids = gameweeks_data['gameweek_7'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['2', '3', '4', '5', '6']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['7']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Combine next game stats
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get GW1 stats for player
        for i in range(6,1,-1):
            gw_row = player_stats[player_id]['history'][str(i)]
            gw_home_away = one_hot_encode_team(gw_row['was_home'], 2)
            if gw_row['was_home']:
                gw_score = np.array([gw_row['team_h_score'], gw_row['team_a_score']])
            else:
                gw_score = np.array([gw_row['team_a_score'], gw_row['team_h_score']])
            selected_stats = {key: gw_row[key] for key in keys_to_select_part if key in gw_row}
            gw_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
            gw_combined = np.concatenate([gw_home_away, gw_score, gw_stats])
            
            # Add gameweek stats to combined
            combined_stats = np.concatenate([combined_stats, gw_combined])
        
        # Append combined stats to X and points (y value) to y
        X_GW7.append(combined_stats)
        y_GW7.append(points)
        ids_GW7.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW7 = np.array(X_GW7)
y_GW7 = np.array(y_GW7)

print("Training data (X):", X_GW7.shape)
print("Target values (y):", y_GW7.shape)

### Gameweek 8

In [None]:
# Initialize empty lists for features and target values
X_GW8 = []
y_GW8 = []
ids_GW8 = []

player_ids = gameweeks_data['gameweek_8'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['3', '4', '5', '6', '7']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['history']['8']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        player_team = one_hot_encode_team(next_game['player_team'], 27)
        opponent_team = one_hot_encode_team(next_game['opponent_team'], 27)
        home_away = one_hot_encode_team(next_game['was_home'], 2)
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = next_game['total_points']
        
        # Combine next game stats
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        # Get GW1 stats for player
        for i in range(7,2,-1):
            gw_row = player_stats[player_id]['history'][str(i)]
            gw_home_away = one_hot_encode_team(gw_row['was_home'], 2)
            if gw_row['was_home']:
                gw_score = np.array([gw_row['team_h_score'], gw_row['team_a_score']])
            else:
                gw_score = np.array([gw_row['team_a_score'], gw_row['team_h_score']])
            selected_stats = {key: gw_row[key] for key in keys_to_select_part if key in gw_row}
            gw_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
            gw_combined = np.concatenate([gw_home_away, gw_score, gw_stats])
            
            # Add gameweek stats to combined
            combined_stats = np.concatenate([combined_stats, gw_combined])
        
        # Append combined stats to X and points (y value) to y
        X_GW8.append(combined_stats)
        y_GW8.append(points)
        ids_GW8.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW8 = np.array(X_GW8)
y_GW8 = np.array(y_GW8)

print("Training data (X):", X_GW8.shape)
print("Target values (y):", y_GW8.shape)

### Gameweek 9 (predictions)

In [None]:
# Initialize empty lists for features and target values
X_GW9 = []
y_GW9 = []
ids_GW9 = []

X_GW10 = []
y_GW10 = []
ids_GW10 = []

X_GW11 = []
y_GW11 = []
ids_GW11 = []

# # # Initialize empty lists for features and target values
# X_test = []
# y_test = []
# player_ids_test = []

player_ids = gameweeks_data['gameweek_8'].keys()

for player_id in player_ids:
    if all(key in player_stats[player_id]['history'].keys() for key in ['4', '5', '6', '7', '8']):
        
        # Get stats for upcoming game
        next_game = player_stats[player_id]['fixtures']['9']
        next_game2 = player_stats[player_id]['fixtures']['10']
        next_game3 = player_stats[player_id]['fixtures']['11']
        
#         if next_game['minutes'] == 0:
#             continue
        
        # Get one-hot encoding for players team, opp team, home/away and position
        if player_overall_stats[player_id]['team'] == fixtures_data[str(next_game['id'])]['team_h']:
            player_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_h'], 27)
            opponent_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_a'], 27)
            home_away = one_hot_encode_team(1, 2)
        elif player_overall_stats[player_id]['team'] == fixtures_data[str(next_game['id'])]['team_a']:
            player_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_a'], 27)
            opponent_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_h'], 27)
            home_away = one_hot_encode_team(0, 2)
        else:
            print('fail')
        position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
        # Get points scored (y value)
        points = float(player_overall_stats[player_id]['ep_next'])
        
         # Combine next game stats
        combined_stats = np.concatenate([player_team, opponent_team, home_away, position])
        
        
        # Get one-hot encoding for players team, opp team, home/away and position
        if player_overall_stats[player_id]['team'] == fixtures_data[str(next_game2['id'])]['team_h']:
            player_team2 = one_hot_encode_team(fixtures_data[str(next_game2['id'])]['team_h'], 27)
            opponent_team2 = one_hot_encode_team(fixtures_data[str(next_game2['id'])]['team_a'], 27)
            home_away2 = one_hot_encode_team(1, 2)
        elif player_overall_stats[player_id]['team'] == fixtures_data[str(next_game2['id'])]['team_a']:
            player_team2 = one_hot_encode_team(fixtures_data[str(next_game2['id'])]['team_a'], 27)
            opponent_team2 = one_hot_encode_team(fixtures_data[str(next_game2['id'])]['team_h'], 27)
            home_away2 = one_hot_encode_team(0, 2)
        else:
            print('fail')
        
         # Combine next game stats
        combined_stats2 = np.concatenate([player_team2, opponent_team2, home_away2, position])
        
        
        
        # Get one-hot encoding for players team, opp team, home/away and position
        if player_overall_stats[player_id]['team'] == fixtures_data[str(next_game3['id'])]['team_h']:
            player_team3 = one_hot_encode_team(fixtures_data[str(next_game3['id'])]['team_h'], 27)
            opponent_team3 = one_hot_encode_team(fixtures_data[str(next_game3['id'])]['team_a'], 27)
            home_away3 = one_hot_encode_team(1, 2)
        elif player_overall_stats[player_id]['team'] == fixtures_data[str(next_game3['id'])]['team_a']:
            player_team3 = one_hot_encode_team(fixtures_data[str(next_game3['id'])]['team_a'], 27)
            opponent_team3 = one_hot_encode_team(fixtures_data[str(next_game3['id'])]['team_h'], 27)
            home_away3 = one_hot_encode_team(0, 2)
        else:
            print('fail')
        
         # Combine next game stats
        combined_stats3 = np.concatenate([player_team, opponent_team, home_away, position])
        
        
        # Get GW1 stats for player
        for i in range(8,3,-1):
            gw_row = player_stats[player_id]['history'][str(i)]
            gw_home_away = one_hot_encode_team(gw_row['was_home'], 2)
            if gw_row['was_home']:
                gw_score = np.array([gw_row['team_h_score'], gw_row['team_a_score']])
            else:
                gw_score = np.array([gw_row['team_a_score'], gw_row['team_h_score']])
            selected_stats = {key: gw_row[key] for key in keys_to_select_part if key in gw_row}
            gw_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
            gw_combined = np.concatenate([gw_home_away, gw_score, gw_stats])
            
            # Add gameweek stats to combined
            combined_stats = np.concatenate([combined_stats, gw_combined])
            combined_stats2 = np.concatenate([combined_stats2, gw_combined])
            combined_stats3 = np.concatenate([combined_stats3, gw_combined])
        
        # Append combined stats to X and points (y value) to y
        X_GW9.append(combined_stats)
        y_GW9.append(points)
        ids_GW9.append(player_id)
        
        X_GW10.append(combined_stats2)
        y_GW10.append(points)
        ids_GW10.append(player_id)
        
        X_GW11.append(combined_stats3)
        y_GW11.append(points)
        ids_GW11.append(player_id)
        
        
#         # Get stats for upcoming game
#         next_game = player_stats[player_id]['fixtures']['8']
# #         if next_game['minutes'] == 0:
# #             continue

#         # Expected points
#         expected_points = float(player_overall_stats[player_id]['ep_next'])
# #         expected_points = np.log(max(float(player_overall_stats[player_id]['ep_next'])+1,1)) / 3


#         # Home and away teams
#         if player_overall_stats[player_id]['team'] == fixtures_data[str(next_game['id'])]['team_h']:
#             player_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_h'])
#             opponent_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_a'])
#             home_away = one_hot_encode_team(1, 2)
#         elif player_overall_stats[player_id]['team'] == fixtures_data[str(next_game['id'])]['team_a']:
#             player_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_a'])
#             opponent_team = one_hot_encode_team(fixtures_data[str(next_game['id'])]['team_h'])
#             home_away = one_hot_encode_team(0, 2)
#         else:
#             print('fail')
        
#         # Get overall player stats
#         position = one_hot_encode_team(player_overall_stats[player_id]['element_type'], 4)
        
#         # Get GW7 stats for player
#         gw7_all_stats = player_stats[player_id]['history']['7']
#         gw7_home_away = one_hot_encode_team(gw7_all_stats['was_home'], 2)
#         selected_stats = {key: gw7_all_stats[key] for key in keys_to_select if key in gw7_all_stats}
#         gw7_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        
#         # Get GW6 stats for player
#         gw6_all_stats = player_stats[player_id]['history']['6']
#         gw6_home_away = one_hot_encode_team(gw6_all_stats['was_home'], 2)
#         selected_stats = {key: gw6_all_stats[key] for key in keys_to_select if key in gw6_all_stats}
#         gw6_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        
#         # Get GW6 stats for player
#         gw6_all_stats = player_stats[player_id]['history']['6']
#         gw6_home_away = one_hot_encode_team(gw6_all_stats['was_home'], 2)
#         selected_stats = {key: gw6_all_stats[key] for key in keys_to_select if key in gw6_all_stats}
#         gw6_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        
#         # Get GW5 stats for player
#         gw5_all_stats = player_stats[player_id]['history']['5']
#         gw5_home_away = one_hot_encode_team(gw5_all_stats['was_home'], 2)
#         selected_stats = {key: gw5_all_stats[key] for key in keys_to_select if key in gw5_all_stats}
#         gw5_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        
#         # Get GW4 stats for player
#         gw4_all_stats = player_stats[player_id]['history']['4']
#         gw4_home_away = one_hot_encode_team(gw4_all_stats['was_home'], 2)
#         selected_stats = {key: gw4_all_stats[key] for key in keys_to_select if key in gw4_all_stats}
#         gw4_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        
#         # Get GW3 stats for player
#         gw3_all_stats = player_stats[player_id]['history']['3']
#         gw3_home_away = one_hot_encode_team(gw3_all_stats['was_home'], 2)
#         selected_stats = {key: gw3_all_stats[key] for key in keys_to_select if key in gw3_all_stats}
#         gw3_stats = np.array([float(item) for item in selected_stats.values()], dtype=float)
        

#         # Combine all stats
#         combined_stats = np.concatenate([player_team,
#                                         opponent_team,
#                                         home_away,
#                                         position,
#                                         gw7_home_away,
#                                         gw7_stats,
#                                         gw6_home_away,
#                                         gw6_stats,
#                                         gw5_home_away,
#                                         gw5_stats,
#                                         gw4_home_away,
#                                         gw4_stats,
#                                         gw3_home_away,
#                                         gw3_stats
#                                         ])
        
#         # Append combined stats to X and points (y value) to y
#         X_test.append(combined_stats)
#         y_test.append(expected_points)
#         player_ids_test.append(player_id)
        
#     else:
#         print(f'{player_id} not in gameweek 1')
        
# Convert lists to numpy arrays for training/testing
X_GW9 = np.array(X_GW9)
y_GW9 = np.array(y_GW9)

X_GW10 = np.array(X_GW10)
y_GW10 = np.array(y_GW10)

X_GW11 = np.array(X_GW11)
y_GW11 = np.array(y_GW11)

print("Training data (X):", X_GW9.shape)
print("Target values (y):", y_GW9.shape)

print("Training data (X):", X_GW10.shape)
print("Target values (y):", y_GW10.shape)

print("Training data (X):", X_GW11.shape)
print("Target values (y):", y_GW11.shape)

# Train model - xgboost

### Imports and data split

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Split the data into training and testing sets
y_clipped = np.clip(y, 0, 12)
# X_train, X_val, y_train, y_val, xP_train, xP_val = train_test_split(X, y_clipped, xP, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y_clipped, test_size=0.2, random_state=42)

# Normalize the features (optional but recommended)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

### Train model

In [None]:
# Train model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=20)
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
print(f'Mean Squared Error: {mse:.2f}')

### Show results - Validation set

In [None]:
# Sort values for a better line plot
sorted_indices = np.argsort(y_val)
sorted_y_val = y_val[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_y_val, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Expected vs True Values for Validation Set')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Sort values for a better line plot
sorted_indices = np.argsort(xP_val)
sorted_xP_val = xP_val[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# sorted_indices = np.argsort(sorted_xP)
# sorted_xP_val = sorted_xP_val[sorted_indices]
# sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_xP_val, label='Expected points', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Predictions vs xP - Validation Set')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Sort values for a better line plot
sorted_indices = np.argsort(y_val)
sorted_y_val = y_val[sorted_indices]
sorted_xP_val = xP_val[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_xP_val, label='Expected Points', color='orange', linewidth=2)
plt.plot(sorted_y_val, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('xP vs True Values - Validation Set')
plt.legend()
plt.grid(True)
plt.show()

### Show results - Test set

In [None]:
# Transform X_test
X_test = scaler.transform(X_test)

# Make predictions on the test data
predictions = model.predict(X_test)

# Highest predictions
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

for i in top_10_indices:
    pp = player_overall_stats[str(player_ids_test[i])]
    print(pp['web_name'], predictions[i], y_test[i])

In [None]:
# Sort values for a better line plot
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs FPL expected values')
plt.legend()
plt.grid(True)
plt.show()# Sort values for a better line plot
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

In [None]:
X2 = X[y_clipped!=0]

In [None]:
y_clipped2 = y_clipped[y_clipped!=0]

In [None]:
len(y_clipped)

In [None]:
for i in range(13):
    print(i, sum(y_clipped==i))

In [None]:
sum(y_clipped==0), sum(y_clipped==1), sum((y_clipped >= 2) &
                                          (y_clipped <= 4)), sum((y_clipped >= 5) &
                                                                 (y_clipped <= 6)), sum(y_clipped > 6)

In [None]:
10000/19646, 10000/10606, 10000/10697, 10000/3192, 10000/3550

# Train model - Neural Network

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization

In [None]:
# Create augmentations of the y values
y_clipped = np.clip(y, 0, 12)
y_log = np.log1p(np.maximum(y_clipped, 0))
y_sqrt = np.minimum(np.sqrt(np.maximum(y, 0))/3,1)
y_sqrt = np.sqrt(np.maximum(y, 0))

In [None]:
def weighted_mae(y_true, y_pred):
    
    # Define weights; adjust according to your target distribution
    weights = tf.where(y_true > 2, 2.0, 1.0)
    return tf.reduce_mean(weights * tf.abs(y_true - y_pred)**2)

def weighted_mae(y_true, y_pred):
    # Compute absolute error
    absolute_error = tf.abs(y_true - y_pred)
    
    # Apply weights based on the true values
    weights = tf.where(y_true > 3.5, 2.0, 1.0)
    weighted_error = (absolute_error * weights)**2
    
    return tf.reduce_mean(weighted_error)

# Define a custom weighted loss function
def weighted_mae(y_true, y_pred):
    # Calculate absolute errors
    abs_error = tf.abs(y_true - y_pred)

    # Define weights based on the conditions
    weights = tf.where(
        tf.equal(y_true, 0), 0.5090094675760969,  # Weight for y=0
        tf.where(
            tf.equal(y_true, 1), 0.9428625306430323,  # Weight for y=1
            tf.where(
                tf.logical_and(y_true >= 2, y_true <= 4), 0.9348415443582313,  # Weight for 2 <= y <= 4
                tf.where(
                    tf.logical_and(y_true >= 5, y_true <= 6), 3.1328320802005014,  # Weight for 5 <= y <= 6
                    tf.where(y_true > 6, 2.816901408450704, 1.0)  # Default weight if no conditions match
                )
            )
        )
    )

    # Calculate the weighted loss
    weighted_loss = abs_error * weights

    # Return the mean of the weighted loss
    return tf.reduce_mean(weighted_loss)

def asymmetric_loss(y_true, y_pred):
    # Calculate the difference between the true and predicted values
    error = y_true - y_pred
    
    # Apply different weights for under-predictions and over-predictions
    under_prediction_loss = tf.where(error > 0, error * 2.0, error)  # Penalize under-prediction more
    over_prediction_loss = tf.where(error <= 0, error * 1.0, error)   # Regular penalty for over-prediction
    
    # Combine the losses and return the mean absolute value
    weighted_error = tf.abs(under_prediction_loss + over_prediction_loss)
    
    return tf.reduce_mean(weighted_error)

def custom_activation(x):
    return tf.clip_by_value(x, 0, 12)

In [None]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X2, y_clipped2, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val, xP_train, xP_val = train_test_split(X, y_clipped, xP, test_size=0.2, random_state=42)

# # Normalize the features (optional but recommended)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

# Create mask
max_values = np.max(X_train, axis=0)
mask = max_values > 1

# Apply scaling only to the selected features
scaler = StandardScaler()
X_train[:, mask] = scaler.fit_transform(X_train[:, mask])  # Scale features in training set
X_val[:, mask] = scaler.transform(X_val[:, mask])  # Scale features in validation set

In [None]:
# Define model
model = keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    BatchNormalization(),
    layers.Dense(1)
])

# Compile the model with a suitable optimizer and loss function
model.compile(optimizer='adam', loss=weighted_mae, metrics=['mae'])
# model.compile(optimizer='adam', loss=asymmetric_loss, metrics=['mae'])
# model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)

# Learning rate scheduler to adjust the learning rate dynamically
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

# Fit the model with early stopping and learning rate reduction
history = model.fit(X_train, y_train, 
                    validation_data=(X_val, y_val), 
                    epochs=100, 
                    batch_size=64, 
#                     callbacks=[early_stopping]
                   )

# Evaluate the model on validation data
test_loss, test_mae = model.evaluate(X_val, y_val)
print(f'Test Mean Absolute Error: {test_mae:.2f}')

# Make predictions and clip them
predictions = model.predict(X_val)
clipped_predictions = np.clip(predictions, 0, 12)  # Clip predictions to range [0, 12]

# Optionally, plot training history (loss and metrics)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

### Show predictions in training set

In [None]:
# Make predictions on the val data
predictions = model.predict(X_train[:1000])

# Sort values for a better line plot
sorted_indices = np.argsort(y_train[:1000])
sorted_y_train = y_train[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_y_train, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Predictions vs True Values for Training Set')
plt.legend()
plt.grid(True)
plt.show()

### Show results on valldation set

In [None]:
# Make predictions on the val data
predictions = model.predict(X_val)

# Sort values for a better line plot
sorted_indices = np.argsort(y_val)
sorted_y_val = y_val[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_y_val, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Predictions vs True Values for Validation Set')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Sort values for a better line plot
sorted_indices = np.argsort(xP_val)
sorted_xP_val = xP_val[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.plot(sorted_xP_val, label='Expected points', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Predictions vs xP - Validation Set')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Sort values for a better line plot
sorted_indices = np.argsort(y_val)
sorted_y_val = y_val[sorted_indices]
sorted_xP_val = xP_val[sorted_indices]


# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_xP_val, label='Expected Points', color='orange', linewidth=2)
plt.plot(sorted_y_val, label='True Values', color='blue', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('xP vs True Values - Validation Set')
plt.legend()
plt.grid(True)
plt.show()

### Test model on current season

In [None]:
# Get test data
X_test = X_GW2.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW2.copy()
player_ids_test = ids_GW2.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs Actual points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW3.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW3.copy()
player_ids_test = ids_GW3.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs Actual points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW4.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW4.copy()
player_ids_test = ids_GW4.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs Actual points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW5.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW5.copy()
player_ids_test = ids_GW5.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs Actual points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW6.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW6.copy()
player_ids_test = ids_GW6.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs Actual points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW7.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW7.copy()
player_ids_test = ids_GW7.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs Actual points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW8.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW8.copy()
player_ids_test = ids_GW8.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Actual score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:10]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs FPL expected values')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW9.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW9.copy()
player_ids_test = ids_GW9.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-15:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t Expected FPL score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:10]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs FPL expected values')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get test data
X_test = X_GW10.copy()
X_test[:, mask] = scaler.transform(X_test[:, mask])
y_test = y_GW10.copy()
player_ids_test = ids_GW10.copy()

# Make predictions on the test data
predictions = model.predict(X_test)
predictions = np.clip(predictions, 0, 24)

# Highest predictions (top 10)
top_10_indices = np.argsort(predictions.flatten())[-10:]
top_10_indices = top_10_indices[np.argsort(predictions.flatten()[top_10_indices])[::-1]]

# Print the top 10 predictions
print("Top 10 highest predicted players:")
for num, i in enumerate(top_10_indices):
    pp = player_overall_stats[str(player_ids_test[i])]
    print(f'Model\'s number {num+1}: {pp["web_name"]} \t Expected score: {predictions[i][0]:.2f} \t FPL expected score: {y_test[i]}')

# Initialize an empty dictionary to store top 5 players for each position
top_players_by_position = {'GK': [], 'DEF': [], 'MID': [], 'FWD': []}

# Define position mapping
position_mapping = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}

# Flatten the predictions array for easier sorting
predictions_flat = predictions.flatten()

# Iterate through each player, filtering by position and appending the data
for i in range(len(predictions)):
    player_id = str(player_ids_test[i])
    pp = player_overall_stats[player_id]
    position = pp['element_type']
    
    # Store the player data as a tuple (predicted_score, actual_score, web_name)
    player_data = (predictions_flat[i], y_test[i], pp['web_name'])
    
    # Append to the respective position list
    top_players_by_position[position_mapping[position]].append(player_data)

# Sort and extract top 5 players for each position
for position, players in top_players_by_position.items():
    # Sort by predicted score in descending order
    players_sorted = sorted(players, key=lambda x: x[0], reverse=True)
    
    # Get top 5 players
    top_5 = players_sorted[:5]
    
    print(f"\nTop 5 for {position}:")
    for num, player in enumerate(top_5, 1):
        predicted_score, actual_score, web_name = player
        print(f"{num}. {web_name} \t Predicted: {predicted_score:.2f} \t Actual: {actual_score}")

# Sort values for a better line plot (actual vs predicted)
sorted_indices = np.argsort(y_test)
sorted_y_test = y_test[sorted_indices]
sorted_predictions = predictions[sorted_indices]

# Plot expected values and true values
plt.figure(figsize=(10, 6))
plt.plot(sorted_y_test, label='FPL EP', color='blue', linewidth=2)
plt.plot(sorted_predictions, label='Predictions', color='orange', linewidth=2)
plt.xlabel('Sample Index')
plt.ylabel('Values')
plt.title('Model expected values vs FPL expected values')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get unique values and their counts
unique_values, counts = np.unique(y, return_counts=True)

# Combine the results into a dictionary for better readability
value_counts = dict(zip(unique_values, counts))

value_counts