In [2]:
# importing sqlite to connect to the database
# importing pandas for data manipulation
# importing requests to get data from the api
import pandas as pd
import sqlite3
import numpy as np
import requests

In [3]:
# Feature engineering for FPL predictions
# Builds predictive features from per-gameweek player data

# One row per (player, gameweek): identity columns from `players` joined to
# the match statistics in `gameweek_data`, ordered by player then gameweek.
query = '''
    SELECT p.player_id, p.name, p.position, p.team,
           g.gameweek, g.minutes, g.goals_scored, g.assists,
           g.clean_sheets, g.bonus, g.total_points
    FROM gameweek_data g
    JOIN players p ON g.player_id = p.player_id
    ORDER BY p.player_id, g.gameweek
'''

# Connect to the SQLite database file
conn = sqlite3.connect('fpl_data.db')
try:
    # Load the query result into a DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query fails (e.g. missing table),
    # so a failed run does not leave the database handle open in the kernel
    conn.close()

# Summary of what was loaded: row count, distinct players, gameweek range
print(f"Loaded {len(df)} records")
print(f"Players: {df['player_id'].nunique()}")
print(f"Gameweeks: {df['gameweek'].min()} to {df['gameweek'].max()}")
# Preview of the first rows
print(df.head())

Loaded 8063 records
Players: 752
Gameweeks: 1 to 11
   player_id  name position     team  gameweek  minutes  goals_scored  \
0          1  Raya       GK  Arsenal         1       90             0   
1          1  Raya       GK  Arsenal         2       90             0   
2          1  Raya       GK  Arsenal         3       90             0   
3          1  Raya       GK  Arsenal         4       90             0   
4          1  Raya       GK  Arsenal         5       90             0   

   assists  clean_sheets  bonus  total_points  
0        0             1      3            10  
1        0             1      0             6  
2        0             0      0             2  
3        0             1      0             6  
4        0             0      0             2  


In [4]:
# Feature: rolling average of points
# captures each player's recent form over their previous `window` games (default 5)
def rollingAVG(df, window=5, fill_value=0):
    """
    Add a per-player rolling average of total points over previous games.

    For every row, `rolling_avg_points` is the mean of that player's
    `total_points` over the `window` rows immediately before it (ordered by
    gameweek). The current row is excluded, so the feature never contains
    the value it is meant to help predict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'player_id', 'gameweek' and 'total_points'.
        The input frame is not modified.
    window : int, default 5
        Number of previous games to average. A value is only produced once a
        player has a full `window` of previous games.
    fill_value : scalar or None, default 0
        Replacement for rows without enough history. The default 0 keeps the
        original behaviour; pass None to keep NaN so "no history yet" can be
        told apart from "averaged 0 points".

    Returns
    -------
    pandas.DataFrame
        A copy of `df` sorted by player_id and gameweek (original index
        labels kept) with the extra column 'rolling_avg_points'.
    """
    # Report the window size being used
    print(f"Rolling average - last {window} games")

    # sort_values returns a new frame, so the caller's DataFrame is untouched
    df = df.sort_values(['player_id', 'gameweek'])

    # Per player: mean over a full window, then shift(1) so each row only
    # sees games strictly before it.
    df['rolling_avg_points'] = df.groupby('player_id')['total_points'].transform(
        lambda x: x.rolling(window=window, min_periods=window).mean().shift(1)
    )

    # Rows without `window` previous games are NaN at this point.
    # Most ML models reject NaN, so they are filled unless the caller opts out.
    if fill_value is not None:
        df['rolling_avg_points'] = df['rolling_avg_points'].fillna(fill_value)

    # Sorted copy with the new feature column
    return df

# Recompute the frame with the rolling-average feature (5-game window)
df = rollingAVG(df, window=5)

# Illustrate the feature on one player
print("\nRolling average data")
print("\nHaaland's form:")
# Rows whose name contains 'Haaland' (missing names count as no match),
# restricted to the columns needed to read the feature; first 10 rows only
eg = df.loc[
    df['name'].str.contains('Haaland', na=False),
    ['name', 'gameweek', 'total_points', 'rolling_avg_points'],
].head(10)
print(eg)

Rolling average - last 5 games

Rolling average data

Haaland's form:
         name  gameweek  total_points  rolling_avg_points
4719  Haaland         1            13                 0.0
4720  Haaland         2             2                 0.0
4721  Haaland         3             9                 0.0
4722  Haaland         4            13                 0.0
4723  Haaland         5             9                 0.0
4724  Haaland         6            16                 9.2
4725  Haaland         7             8                 9.8
4726  Haaland         8            13                11.0
4727  Haaland         9             2                11.8
4728  Haaland        10            13                 9.6


In [5]:
# Persist the engineered features
# NOTE(review): this banner is printed before anything has been written
print("\nFeatures saved")

# CSV copy; index=False keeps the DataFrame index out of the file
df.to_csv('fpl_features.csv', index=False)
print("Saving features to 'fpl_features.csv'")

# Database copy for querying during ML training;
# if_exists='replace' overwrites any existing 'features' table
conn = sqlite3.connect('fpl_data.db')
try:
    df.to_sql('features', conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write fails
    conn.close()
print("Saved to database features")

print("\nFeature engineering done")
print(f"Total records: {len(df)}")
print("Feature: rollingAverages - last 5 games")



Features saved
Saving features to 'fpl_features.csv'
Saved to database features

Feature engineering done
Total records: 8063
Feature: rollingAverages - last 5 games


In [6]:
# Opponent-based feature: predicted performance depends on the opponent a
# player faces in the upcoming gameweek (a weaker opponent should mean a
# higher points prediction).

# First step: map each team id used by the API to its team name
print("Getting team mappings")

# bootstrap-static holds the general FPL data, including the list of teams
bootstrap_url = "https://fantasy.premierleague.com/api/bootstrap-static/"
# timeout stops the cell hanging forever on a stalled connection;
# raise_for_status reports an HTTP error here rather than as a confusing
# JSON decoding failure on the next line
bootstrap_response = requests.get(bootstrap_url, timeout=30)
bootstrap_response.raise_for_status()
bootstrap_data = bootstrap_response.json()

# Dictionary: team id -> team name
GetTeamName = {team['id']: team['name'] for team in bootstrap_data['teams']}

# Number of teams found
print(f"{len(GetTeamName)} teams")

Getting team mappings
20 teams


In [8]:
# Fetch the fixture list: which team plays which opponent in each gameweek
print("Getting fixtures")

# Fixtures endpoint of the FPL API
fixtures_url = "https://fantasy.premierleague.com/api/fixtures/"
# timeout stops the cell hanging forever on a stalled connection;
# raise_for_status reports an HTTP error here rather than as a confusing
# JSON decoding failure on the next line
fixtures_response = requests.get(fixtures_url, timeout=30)
fixtures_response.raise_for_status()
fixtures_data = fixtures_response.json()

# Build two rows per match, one from each team's point of view, so a later
# lookup by (gameweek, team) gives that team's opponent and venue directly
fixtures_list = []
for fixture in fixtures_data:
    # Skip fixtures that have no gameweek ('event') assigned
    if fixture['event']:
        home_team = GetTeamName[fixture['team_h']]
        away_team = GetTeamName[fixture['team_a']]
        # home team's view
        fixtures_list.append({
            'gameweek': fixture['event'],
            'team': home_team,
            'opponent': away_team,
            'is_home': True
        })
        # away team's view
        fixtures_list.append({
            'gameweek': fixture['event'],
            'team': away_team,
            'opponent': home_team,
            'is_home': False
        })

# Convert the rows to a DataFrame. Naming the columns explicitly keeps the
# schema even when no fixture qualified, so the filter below cannot raise
# KeyError on an empty frame.
fixtures_df = pd.DataFrame(
    fixtures_list, columns=['gameweek', 'team', 'opponent', 'is_home']
)

# Row count (two rows per match), then a preview of gameweek 1
print(f"{len(fixtures_df)} fixtures")
print("\nfixtures GW1")
print(fixtures_df[fixtures_df['gameweek'] == 1].head(10))

Getting fixtures
760 fixtures

fixtures GW1
   gameweek         team     opponent  is_home
0         1    Liverpool  Bournemouth     True
1         1  Bournemouth    Liverpool    False
2         1  Aston Villa    Newcastle     True
3         1    Newcastle  Aston Villa    False
4         1     Brighton       Fulham     True
5         1       Fulham     Brighton    False
6         1        Spurs      Burnley     True
7         1      Burnley        Spurs    False
8         1   Sunderland     West Ham     True
9         1     West Ham   Sunderland    False
