In [11]:
# importing sqlite to connect to the database
# importing pandas for data manipulation
# importing requests to get data from the api
import pandas as pd
import sqlite3
import numpy as np
import requests

In [12]:
# Feature Engineering for FPL predictiions
# Creating predictive feature from player data

# Connecting to the database
conn = sqlite3.connect('fpl_data.db')

# Geting all the data
# querying the database to get player information form gameweek table
query = '''
    SELECT p.player_id, p.name, p.position, p.team,
           g.gameweek, g.minutes, g.goals_scored, g.assists,
           g.clean_sheets, g.bonus, g.total_points
    FROM gameweek_data g
    JOIN players p ON g.player_id = p.player_id
    ORDER BY p.player_id, g.gameweek
'''

# storing the data that was got from the query in a dataframe(table)
df = pd.read_sql_query(query, conn)
# closing the connection
conn.close()

# loading the whole dataframe/table
# Count number of players in the dataframe & the gameweek
print(f"Loaded {len(df)} records")
print(f"Players: {df['player_id'].nunique()}")
print(f"Gameweeks: {df['gameweek'].min()} to {df['gameweek'].max()}")
# printing out some of the dataframe table
print(df.head())

Loaded 15760 records
Players: 795
Gameweeks: 1 to 21
   player_id  name position     team  gameweek  minutes  goals_scored  \
0          1  Raya       GK  Arsenal         1       90             0   
1          1  Raya       GK  Arsenal         2       90             0   
2          1  Raya       GK  Arsenal         3       90             0   
3          1  Raya       GK  Arsenal         4       90             0   
4          1  Raya       GK  Arsenal         5       90             0   

   assists  clean_sheets  bonus  total_points  
0        0             1      3            10  
1        0             1      0             6  
2        0             0      0             2  
3        0             1      0             6  
4        0             0      0             2  


In [13]:
# Feature for rolling Averages 
# gets the players recent form from the lat 5 games
def rollingAVG(df, window=5):
    """ 
    Calculating the rolling average of total points for each player

    DataFrame with player data
    window number od previous games to average

    returns the average rolling average points
    """
    # window set in the parameter of the function
    print(f"Rolling average - last {window} games")

    # Sorting the data by player & gameweek
    df = df.sort_values(['player_id', 'gameweek'])
    
    # Calculating the rolling averages for each player, total points column, last 5 games
    # rolling average is calculated after the last 5 games 
    # .mean calculates the average
    # .shift(1) uses the previous games
    df['rolling_avg_points'] = df.groupby('player_id')['total_points'].transform(
        lambda x: x.rolling(window=window, min_periods=window).mean().shift(1)
    )

    # filling nan values with 0
    # e.g. gw 1-5 has no rolling average as not enough games have been played 
    # nan would break ml models so it is replaced with a 0 
    df['rolling_avg_points'] = df['rolling_avg_points'].fillna(0)
    
    # returning the daataframe with rolling averages 
    return df

# storing the data in a dataframe
df = rollingAVG(df, window=5)

# showing how the rolling averages works 
print("\nRolling average data")
print("\nHaaland's form:")
eg = df[df['name'].str.contains('Haaland', na=False)][
    ['name', 'gameweek', 'total_points', 'rolling_avg_points']
].head(10)
print(eg)

# checking the correlation between rolling average & points scored
# looking to see that the players with good form socre more points
print("\nCorrelation with points")
corr = df['rolling_avg_points'].corr(df['total_points'])
print(f"r = {corr:.3f}")

if corr > 0.5:
    print("Strong correlation, good form predicts more points")
elif corr > 0.3:
    print("Moderate correlation, form is a useful predictor")
else:
    print("Weak correlation")

Rolling average - last 5 games

Rolling average data

Haaland's form:
         name  gameweek  total_points  rolling_avg_points
9009  Haaland         1            13                 0.0
9010  Haaland         2             2                 0.0
9011  Haaland         3             9                 0.0
9012  Haaland         4            13                 0.0
9013  Haaland         5             9                 0.0
9014  Haaland         6            16                 9.2
9015  Haaland         7             8                 9.8
9016  Haaland         8            13                11.0
9017  Haaland         9             2                11.8
9018  Haaland        10            13                 9.6

Correlation with points
r = 0.411
Moderate correlation, form is a useful predictor


In [14]:
# saving the engineering features
print("\nFeatures saved")

# saving to a csv file 
# index does not allow it to add extra row numbers 
df.to_csv('fpl_features.csv', index=False)
print("Saving features to 'fpl_features.csv'")

# saving to the database for fast querying in Ml training
# if it exists replace new with the old data
conn = sqlite3.connect('fpl_data.db')
df.to_sql('features', conn, if_exists='replace', index=False)
conn.close()
print("Saved to database features")

print(f"\nFeature engineering done")
print(f"Total records: {len(df)}")
print(f"Feature: rollingAverages - last 5 games")



Features saved
Saving features to 'fpl_features.csv'
Saved to database features

Feature engineering done
Total records: 15760
Feature: rollingAverages - last 5 games


In [15]:
# feature that predicts player performance based on
# the oppenet the face on the upcomig gameweek
# players will predict higher points if they face a weak team

# getting the team name from the api 
# mapping the team id to the team name
print("Getting team mappings")

# getting data that holds all the teams info
bootstrap_url = "https://fantasy.premierleague.com/api/bootstrap-static/"
bootstrap_data = requests.get(bootstrap_url).json()

# creating a dictionary to map the team id's to the team name 
GetTeamName = {team['id']: team['name'] for team in bootstrap_data['teams']}

# printing all the teams
print(f"{len(GetTeamName)} teams")

Getting team mappings
20 teams


In [16]:
# getting the fixtures of what team plays who
print("Getting fixtures")

# getting the fixtures from the FPL API
fixtures_url = "https://fantasy.premierleague.com/api/fixtures/"
fixtures_data = requests.get(fixtures_url).json()

# creating a fixtures dataframe
# to store the data that was got from the query in a dataframe(table)
fixtures_list = []
for fixture in fixtures_data:
    # only has the games that are assigned event
    if fixture['event']:
        # home team's view
        fixtures_list.append({
            'gameweek': fixture['event'],
            'team': GetTeamName[fixture['team_h']],      
            'opponent': GetTeamName[fixture['team_a']],  
            'is_home': True
        })
        # away team's view
        fixtures_list.append({
            'gameweek': fixture['event'],
            'team': GetTeamName[fixture['team_a']], 
            'opponent': GetTeamName[fixture['team_h']],
            'is_home': False
        })

# converting the list to a dataframe table
fixtures_df = pd.DataFrame(fixtures_list)

# amount of fixtures it gets 
# printing out the fixtures for gameweek 1
print(f"{len(fixtures_df)} fixtures")
print("\nfixtures GW1")
print(fixtures_df[fixtures_df['gameweek'] == 1].head(10))

Getting fixtures
760 fixtures

fixtures GW1
   gameweek         team     opponent  is_home
0         1    Liverpool  Bournemouth     True
1         1  Bournemouth    Liverpool    False
2         1  Aston Villa    Newcastle     True
3         1    Newcastle  Aston Villa    False
4         1     Brighton       Fulham     True
5         1       Fulham     Brighton    False
6         1        Spurs      Burnley     True
7         1      Burnley        Spurs    False
8         1   Sunderland     West Ham     True
9         1     West Ham   Sunderland    False


In [17]:
# Seeing how strong a teams defeense is
# a team with a weak defense has a high difficulty rating & vice versa

# get the player data with rolling averages
conn = sqlite3.connect('fpl_data.db')
df = pd.read_sql_query("SELECT * FROM features", conn)
conn.close()

# calculate the average points scored against each team
# the teams that conceded more points are easier to play 
teamDefense = df.groupby('team')['total_points'].mean().reset_index()
teamDefense.columns = ['team', 'avg_points_conceded']

# creating a difficulty rating 1 - 10 for each team
# 1 means the team has a strong defense & dosent concede many points
# while 10 means the team has a weak defense* concedes a lot of points
minPts = teamDefense['avg_points_conceded'].min()
maxPts = teamDefense['avg_points_conceded'].max()

# making the scale from 1-10
teamDefense['difficulty_rating'] = (
    ((maxPts - teamDefense['avg_points_conceded']) / (maxPts - minPts)) * 9 + 1
).round(1)


# for every game, put a difficulty rating on oppopnent based on their defense
# creating a dictionary with the team name and difficulty
opponentDifficulty = dict(zip(teamDefense['team'], teamDefense['difficulty_rating']))

# adding the opponent difficulty to the fixtures
fixtures_df['opponent_difficulty'] = fixtures_df['opponent'].map(opponentDifficulty)

# printing out the fixtures dataframe table for arsenal of their first 5 games
# with the opponent difficulty 
print(fixtures_df[fixtures_df['team'] == 'Arsenal'].head())



    gameweek     team       opponent  is_home  opponent_difficulty
17         1  Arsenal        Man Utd    False                  7.0
30         2  Arsenal          Leeds     True                  5.5
57         3  Arsenal      Liverpool    False                  4.7
60         4  Arsenal  Nott'm Forest     True                  7.9
98         5  Arsenal       Man City     True                  1.0


In [18]:
# adding the oponent difficulty rating to each players gameweek data
# combineing the fixtures with the player data
#
df = df.merge(
    fixtures_df[['gameweek', 'team', 'opponent', 'opponent_difficulty', 'is_home']], 
    # matching by gameweek and team
    on=['gameweek', 'team'],
    # keeing all the players even if no fixture is found
    how='left'
)

# converting is_home from true/false to 1/0 
# boolean to int
df['is_home'] = df['is_home'].astype(int)

# make any missing values default to 5.0 difficulty
# & making any missing is_home values 0
# handles edge cases where data is missing
df['opponent_difficulty'] = df['opponent_difficulty'].fillna(5.0)
df['is_home'] = df['is_home'].fillna(0).astype(int)

# statistics of home & away records
records = len(df)
home_records = df['is_home'].sum()
away_records = records - home_records

print(f"Records: {records}")
print(f"Home games: {home_records} ({home_records/records*100:.1f}%)")
print(f"Away games: {away_records} ({away_records/records*100:.1f}%)")

# checking home advantage
home_avg = df[df['is_home'] == 1]['total_points'].mean()
away_avg = df[df['is_home'] == 0]['total_points'].mean()
print(f"\nAverage points at home: {home_avg:.2f}")
print(f"Average points away: {away_avg:.2f}")
print(f"Home advantage: +{home_avg - away_avg:.2f} points")

# e.g. for haaland
print("\nHaaland's fixtures with opponent difficulty")
haaland = df[df['name'].str.contains('Haaland', na=False)][
    ['name', 'gameweek', 'team', 'opponent', 'total_points', 
     'rolling_avg_points', 'opponent_difficulty']
].head(10)
print(haaland)

# checking the correlation between oponent difficulty & points scored
# looking to see that the players that face easy opponents score more points
print("\nCorrelation with points:")

opp_corr = df['opponent_difficulty'].corr(df['total_points'])
print(f"opponent difficulty: r = {opp_corr:.3f}")

if opp_corr > 0.0:
    print("+ correlation shows easier opponents lead to more points!")
elif opp_corr < 0.0:
    print("- correlation shows an unexpected pattern")
else:
    print("Weak correlation means the feature is not implemented properly")

# checking is_home correlation to see if players get more points when they play at home
is_home_corr = df['is_home'].corr(df['total_points'])
print(f"is_home: r = {is_home_corr:.3f}")

if is_home_corr > 0.05:
    print("Players score more points at home")
elif is_home_corr > 0.0:
    print("Slight home advantage detected")
else:
    print("No home advantage found")
""""
# saving the dataset with all features
print("\nSaving features")

# saving to the database for fast querying in Ml training
# if it exists replace new with the old data
conn = sqlite3.connect('fpl_data.db')
df.to_sql('features', conn, if_exists='replace', index=False)
conn.close()

print("Saved features")
print(f"Total records: {len(df)}")
"""

Records: 15760
Home games: 7870 (49.9%)
Away games: 7890 (50.1%)

Average points at home: 1.30
Average points away: 1.13
Home advantage: +0.17 points

Haaland's fixtures with opponent difficulty
         name  gameweek      team     opponent  total_points  \
9009  Haaland         1  Man City       Wolves            13   
9010  Haaland         2  Man City        Spurs             2   
9011  Haaland         3  Man City     Brighton             9   
9012  Haaland         4  Man City      Man Utd            13   
9013  Haaland         5  Man City      Arsenal             9   
9014  Haaland         6  Man City      Burnley            16   
9015  Haaland         7  Man City    Brentford             8   
9016  Haaland         8  Man City      Everton            13   
9017  Haaland         9  Man City  Aston Villa             2   
9018  Haaland        10  Man City  Bournemouth            13   

      rolling_avg_points  opponent_difficulty  
9009                 0.0                 10.0  
9010

'"\n# saving the dataset with all features\nprint("\nSaving features")\n\n# saving to the database for fast querying in Ml training\n# if it exists replace new with the old data\nconn = sqlite3.connect(\'fpl_data.db\')\ndf.to_sql(\'features\', conn, if_exists=\'replace\', index=False)\nconn.close()\n\nprint("Saved features")\nprint(f"Total records: {len(df)}")\n'

In [19]:
# feature to show that the minutes a player plays impacts their score
# a player who dosent play will score 0 points
# printing the minutes stats
print("Minutes stats")
print(df['minutes'].describe())

# checking the correlation between minutes played & points scored
# looking to see that players who play more score more points
print("\nCorrelation with points:")
corr = df['minutes'].corr(df['total_points'])
print(f"r = {corr:.3f}")

if corr > 0.5:
    print("Strong correlation, the more minutes played the more points scored")
else:
    print("Weak correlation")

# showing that the minutes played works
print("\nHaaland's minutes played v points scored")
haaland_minutes = df[df['name'].str.contains('Haaland', na=False)][
    ['name', 'gameweek', 'minutes', 'total_points']
].head(10)
print(haaland_minutes)


Minutes stats
count    15760.000000
mean        26.247462
std         37.676007
min          0.000000
25%          0.000000
50%          0.000000
75%         68.000000
max         90.000000
Name: minutes, dtype: float64

Correlation with points:
r = 0.682
Strong correlation, the more minutes played the more points scored

Haaland's minutes played v points scored
         name  gameweek  minutes  total_points
9009  Haaland         1       72            13
9010  Haaland         2       90             2
9011  Haaland         3       90             9
9012  Haaland         4       86            13
9013  Haaland         5       75             9
9014  Haaland         6       90            16
9015  Haaland         7       90             8
9016  Haaland         8       90            13
9017  Haaland         9       90             2
9018  Haaland        10       81            13


In [20]:
# price feature to show that expensive players score more points
# than less expensive players
print("Price")

# fpl api provides currennt player prices in the bootstrap-static endpoint
# the prices are stored as 'new_cost' in tenths
# a players new_cost that is 100 means they cost £10.0m
try:
    # getting data that holds all the players info
    print("\nGetting player prices from FPL API")
    url = "https://fantasy.premierleague.com/api/bootstrap-static/"
    response = requests.get(url)
    data = response.json()

    # extracting player prices from the api response
    # creating a dictionary to map the player id's to the player price
    print("Processing price data")
    price_dict = {}
    for player in data['elements']:
        player_id = player['id']
        # didviding by 10 to get the price in millions
        price = player['now_cost'] / 10.0
        price_dict[player_id] = price

    # adding the price column to the dataframe by mapping player_id to price_dict
    df['price'] = df['player_id'].map(price_dict)

    # handling missing values
    # some players may not have a price in the api data
    # filling any missing prices with the average price
    missing = df['price'].isna().sum()
    if missing > 0:
        avg_price = df['price'].mean()
        df['price'] = df['price'].fillna(avg_price)
        print(f"Filled {missing} missing prices with average price £{avg_price:.1f}m")

    # checking o see if the data was added correctly
    print(f"Range: £{df['price'].min():.1f}m - £{df['price'].max():.1f}m")
    print(f"Average: £{df['price'].mean():.1f}m")
    print(f"Median: £{df['price'].median():.1f}m")

    # check price distribution
    # showing how many players are in each price range
    # £4-6m budget players, £6-8m mid-range, £8-12m expensive, £12m+ elite
    print(f"\nPrice distribution:")
    print(f"Budget (£4-6m): {len(df[df['price'] < 6.0])} records")
    print(f"Mid (£6-8m): {len(df[(df['price'] >= 6.0) & (df['price'] < 8.0)])} records")
    print(f"Premium (£8-10m): {len(df[(df['price'] >= 8.0) & (df['price'] < 10.0)])} records")
    print(f"Elite (£10m+): {len(df[df['price'] >= 10.0])} records")

    # checking the correlation between player price & points scored
    # looking to see that expensive players score more points
    corr = df['price'].corr(df['total_points'])
    print(f"\nCorrelation with points: r = {corr:.3f}")

    # correlation strength
    if corr > 0.3:
        print("Strong correlation - expensive players score more!")
    elif corr > 0.2:
        print("Moderate correlation")
    else:
        print("Weak correlation")

    # Calculating average points per price bracket
    # shows the linear relationship between price & points scored
    print("\nAverage points per price bracket:")
    brackets = [(0, 5), (5, 6), (6, 7), (7, 8), (8, 10), (10, 20)]
    for low, high in brackets:
        bracket_df = df[(df['price'] >= low) & (df['price'] < high)]
        avg_points = bracket_df['total_points'].mean()
        print(f"£{low}-{high}m: {avg_points:.2f} points over {len(bracket_df)} records")

    # showing the top 5 most expensive players in df
    print("\nTop 5 most expensive players:")
    # drop_duplicates to show each player once (not every gameweek)
    top5 = df[['name', 'price']].drop_duplicates('name').nlargest(5, 'price')
    for _, row in top5.iterrows():
        print(f"{row['name']}: £{row['price']:.1f}m")
    
    # showing example data for haaland
    print("\nHaaland's price v points scored")
    haaland_price = df[df['name'].str.contains('Haaland', na=False)][
        ['name', 'gameweek', 'price', 'total_points']
    ].head(5)
    print(haaland_price.to_string(index=False))

except Exception as e:
    # if fpl api cannot be accessed 
    # adding a price colum filled with 0 tin  prevent breaking code
    # allowing the rest of the feature engineering to work
    print(f"Error accessing FPL API: {e}")
    df['price'] = 0

Price

Getting player prices from FPL API


Processing price data
Range: £3.7m - £15.1m
Average: £4.9m
Median: £4.7m

Price distribution:
Budget (£4-6m): 13902 records
Mid (£6-8m): 1522 records
Premium (£8-10m): 231 records
Elite (£10m+): 105 records

Correlation with points: r = 0.307
Strong correlation - expensive players score more!

Average points per price bracket:
£0-5m: 0.71 points over 10038 records
£5-6m: 1.87 points over 3864 records
£6-7m: 2.18 points over 1021 records
£7-8m: 3.20 points over 501 records
£8-10m: 2.82 points over 231 records
£10-20m: 3.71 points over 105 records

Top 5 most expensive players:
Haaland: £15.1m
M.Salah: £14.0m
Palmer: £10.4m
Isak: £10.3m
Saka: £10.1m

Haaland's price v points scored
   name  gameweek  price  total_points
Haaland         1   15.1            13
Haaland         2   15.1             2
Haaland         3   15.1             9
Haaland         4   15.1            13
Haaland         5   15.1             9
