In [1]:
#LOAD GOAL STATS

import soccerdata as sd
understat = sd.Understat(leagues="FRA-Ligue 1", seasons="2023/2024")
team_match_stats = understat.read_team_match_stats()
team_match_stats.head()

import pandas as pd

# Assuming 'team_match_stats' is your DataFrame loaded from the code you've provided

# Calculating total xG for teams when they are at home
home_goals = team_match_stats.groupby('home_team')['home_goals'].sum().rename('total_home_goals_scored')

# Calculating total xG for teams when they are away
away_goals = team_match_stats.groupby('away_team')['away_goals'].sum().rename('total_away_goals_scored')

# Calculating total xGA (expected goals against) for teams when they are at home
home_goals_allowed = team_match_stats.groupby('home_team')['away_goals'].sum().rename('total_home_goals_allowed')

# Calculating total xGA for teams when they are away
away_goals_allowed = team_match_stats.groupby('away_team')['home_goals'].sum().rename('total_away_goals_allowed')

# Merging the above series into a single DataFrame
teams_goals_goalsallowed = pd.concat([home_goals, away_goals, home_goals_allowed, away_goals_allowed], axis=1)

# Filling NaN values with 0, in case some teams didn't play home or away games yet
teams_goals_goalsallowed = teams_goals_goalsallowed.fillna(0)

# Calculating overall total xG and xGA
teams_goals_goalsallowed['total_goals'] = teams_goals_goalsallowed['total_home_goals_scored'] + teams_goals_goalsallowed['total_away_goals_scored']
teams_goals_goalsallowed['total_goals_allowed'] = teams_goals_goalsallowed['total_home_goals_allowed'] + teams_goals_goalsallowed['total_away_goals_allowed']

teams_goals_goalsallowed


Unnamed: 0,total_home_goals_scored,total_away_goals_scored,total_home_goals_allowed,total_away_goals_allowed,total_goals,total_goals_allowed
Brest,24,25,13,20,49,33
Clermont Foot,14,12,26,28,26,54
Le Havre,17,16,17,25,33,42
Lens,25,17,16,18,42,34
Lille,32,16,12,19,48,31
Lorient,21,16,30,33,37,63
Lyon,23,23,28,26,46,54
Marseille,28,19,12,26,47,38
Metz,17,17,25,29,34,54
Monaco,26,36,14,28,62,42


In [2]:
#SET UP LAST 5 GAME GOAL STATS

import pandas as pd

# Assuming team_match_stats already loaded and contains the necessary columns

# Identify all unique teams
teams = pd.concat([team_match_stats['home_team'], team_match_stats['away_team']]).unique()

# Prepare a list to collect each team's average goals scored and allowed for home and away games
team_averages = []

# Iterate through each team
for team in teams:
    # Filter home games involving the current team
    team_home_games = team_match_stats[team_match_stats['home_team'] == team]
    # Sort home games by date in descending order and select the last 5
    team_last_5_home_games = team_home_games.sort_values(by='date', ascending=False).head(5)
    
    # Filter away games involving the current team
    team_away_games = team_match_stats[team_match_stats['away_team'] == team]
    # Sort away games by date in descending order and select the last 5
    team_last_5_away_games = team_away_games.sort_values(by='date', ascending=False).head(5)
    
    # Calculate average goals scored and allowed for home games
    team_home_goals_avg = team_last_5_home_games['home_goals'].mean()
    team_home_goals_allowed_avg = team_last_5_home_games['away_goals'].mean()
    
    # Calculate average goals scored and allowed for away games
    team_away_goals_avg = team_last_5_away_games['away_goals'].mean()
    team_away_goals_allowed_avg = team_last_5_away_games['home_goals'].mean()
    
    # Append the averages to the list
    team_averages.append({
        'Team': team,
        'Average Home Goals Last 5': team_home_goals_avg,
        'Average Home Goals Allowed Last 5': team_home_goals_allowed_avg,
        'Average Away Goals Last 5': team_away_goals_avg,
        'Average Away Goals Allowed Last 5': team_away_goals_allowed_avg
    })

# Convert the list of averages into a DataFrame
teams_goals_df = pd.DataFrame(team_averages)

# Display the DataFrame
teams_goals_df


Unnamed: 0,Team,Average Home Goals Last 5,Average Home Goals Allowed Last 5,Average Away Goals Last 5,Average Away Goals Allowed Last 5
0,Brest,1.2,1.2,2.4,1.8
1,Marseille,2.0,1.2,1.6,1.8
2,Nice,1.4,1.4,1.8,1.2
3,Paris Saint Germain,2.2,1.6,2.8,0.6
4,Clermont Foot,1.6,2.2,0.4,1.4
5,Montpellier,2.0,1.8,1.8,0.8
6,Nantes,0.4,2.6,0.8,0.8
7,Rennes,2.2,2.0,1.6,1.4
8,Strasbourg,1.2,1.6,1.2,1.4
9,Metz,1.6,2.2,1.4,1.8


In [3]:
#CALCULATE TOTAL MATCHES PLAYED FOR EACH TEAM

# Count the number of home matches played by each team
home_matches_played = team_match_stats.groupby('home_team')['home_ppda'].count().rename('home_matches_played')

# Count the number of away matches played by each team
away_matches_played = team_match_stats.groupby('away_team')['away_ppda'].count().rename('away_matches_played')

# Merging these counts into the teams_xg_xga DataFrame
teams_goals_goalsallowed = teams_goals_goalsallowed.merge(home_matches_played, how='left', left_index=True, right_index=True)
teams_goals_goalsallowed = teams_goals_goalsallowed.merge(away_matches_played, how='left', left_index=True, right_index=True)

# Filling NaN values with 0 in case some teams didn't play home or away games
teams_goals_goalsallowed['home_matches_played'] = teams_goals_goalsallowed['home_matches_played'].fillna(0)
teams_goals_goalsallowed['away_matches_played'] = teams_goals_goalsallowed['away_matches_played'].fillna(0)

# Calculating the total matches played by adding home and away matches
teams_goals_goalsallowed['matches_played'] = teams_goals_goalsallowed['home_matches_played'] + teams_goals_goalsallowed['away_matches_played']

teams_goals_goalsallowed


Unnamed: 0,total_home_goals_scored,total_away_goals_scored,total_home_goals_allowed,total_away_goals_allowed,total_goals,total_goals_allowed,home_matches_played,away_matches_played,matches_played
Brest,24,25,13,20,49,33,16,16,32
Clermont Foot,14,12,26,28,26,54,16,16,32
Le Havre,17,16,17,25,33,42,16,16,32
Lens,25,17,16,18,42,34,16,16,32
Lille,32,16,12,19,48,31,16,16,32
Lorient,21,16,30,33,37,63,16,16,32
Lyon,23,23,28,26,46,54,16,16,32
Marseille,28,19,12,26,47,38,16,15,31
Metz,17,17,25,29,34,54,16,16,32
Monaco,26,36,14,28,62,42,16,16,32


In [4]:
#CALCULATE LEAGUE WIDE AVERAGES


# Calculate the averages for each specified column
averages = teams_goals_goalsallowed.mean()

# Creating a new DataFrame with the averages to append it properly with a label
averages_df = pd.DataFrame([averages], index=['Average'])

# Append the averages row to your original DataFrame
teams_goals_goalsallowed_averaged = pd.concat([teams_goals_goalsallowed, averages_df])

teams_goals_goalsallowed_averaged

Unnamed: 0,total_home_goals_scored,total_away_goals_scored,total_home_goals_allowed,total_away_goals_allowed,total_goals,total_goals_allowed,home_matches_played,away_matches_played,matches_played
Brest,24.0,25.0,13.0,20.0,49.0,33.0,16.0,16.0,32.0
Clermont Foot,14.0,12.0,26.0,28.0,26.0,54.0,16.0,16.0,32.0
Le Havre,17.0,16.0,17.0,25.0,33.0,42.0,16.0,16.0,32.0
Lens,25.0,17.0,16.0,18.0,42.0,34.0,16.0,16.0,32.0
Lille,32.0,16.0,12.0,19.0,48.0,31.0,16.0,16.0,32.0
Lorient,21.0,16.0,30.0,33.0,37.0,63.0,16.0,16.0,32.0
Lyon,23.0,23.0,28.0,26.0,46.0,54.0,16.0,16.0,32.0
Marseille,28.0,19.0,12.0,26.0,47.0,38.0,16.0,15.0,31.0
Metz,17.0,17.0,25.0,29.0,34.0,54.0,16.0,16.0,32.0
Monaco,26.0,36.0,14.0,28.0,62.0,42.0,16.0,16.0,32.0


In [5]:
#CALCULATE PER GAME AVERAGES FOR GOAL STATS

import numpy as np

# Create a new DataFrame for per match averages
teams_per_match_averages = pd.DataFrame(index=teams_goals_goalsallowed_averaged.index)

# Calculate per match averages for home and away xG and xGA
teams_per_match_averages['per_match_home_goals'] = teams_goals_goalsallowed_averaged['total_home_goals_scored'] / teams_goals_goalsallowed_averaged['home_matches_played']
teams_per_match_averages['per_match_away_goals'] = teams_goals_goalsallowed_averaged['total_away_goals_scored'] / teams_goals_goalsallowed_averaged['away_matches_played']
teams_per_match_averages['per_match_home_goals_allowed'] = teams_goals_goalsallowed_averaged['total_home_goals_allowed'] / teams_goals_goalsallowed_averaged['home_matches_played']
teams_per_match_averages['per_match_away_goals_allowed'] = teams_goals_goalsallowed_averaged['total_away_goals_allowed'] / teams_goals_goalsallowed_averaged['away_matches_played']

# Calculate overall total xG and xGA per match
teams_per_match_averages['total_goals_per_match'] = teams_goals_goalsallowed_averaged['total_goals'] / teams_goals_goalsallowed_averaged['matches_played']
teams_per_match_averages['total_goals_allowed_per_match'] = teams_goals_goalsallowed_averaged['total_goals_allowed'] / teams_goals_goalsallowed_averaged['matches_played']

# Replace any potential infinite values with NaN (in case of division by zero) and then fill with 0
teams_per_match_averages.replace([np.inf, -np.inf], np.nan, inplace=True)
teams_per_match_averages.fillna(0, inplace=True)

# Calculate the average for each column
average_stats = teams_per_match_averages.mean()

# Append the 'Average' row with these averages to the DataFrame
teams_per_match_averages.loc['Average'] = average_stats

teams_per_match_averages


Unnamed: 0,per_match_home_goals,per_match_away_goals,per_match_home_goals_allowed,per_match_away_goals_allowed,total_goals_per_match,total_goals_allowed_per_match
Brest,1.5,1.5625,0.8125,1.25,1.53125,1.03125
Clermont Foot,0.875,0.75,1.625,1.75,0.8125,1.6875
Le Havre,1.0625,1.0,1.0625,1.5625,1.03125,1.3125
Lens,1.5625,1.0625,1.0,1.125,1.3125,1.0625
Lille,2.0,1.0,0.75,1.1875,1.5,0.96875
Lorient,1.3125,1.0,1.875,2.0625,1.15625,1.96875
Lyon,1.4375,1.4375,1.75,1.625,1.4375,1.6875
Marseille,1.75,1.266667,0.75,1.733333,1.516129,1.225806
Metz,1.0625,1.0625,1.5625,1.8125,1.0625,1.6875
Monaco,1.625,2.25,0.875,1.75,1.9375,1.3125


In [7]:
#GET LIST OF UPCOMING GAMES

# Load the full schedule
schedule_df = understat.read_schedule()

# Convert the 'date' column to datetime format, then format it to keep only the date part
schedule_df['date'] = pd.to_datetime(schedule_df['date']).dt.date

import pandas as pd

# Assuming 'schedule_df' is your DataFrame and it's already loaded

# Make sure the 'date' column is in datetime format
schedule_df['date'] = pd.to_datetime(schedule_df['date'])

# Define the start and end dates for filtering
start_date = pd.to_datetime('2024-05-10')
end_date = pd.to_datetime('2024-05-14')

# Filter the DataFrame for games within the date range, and use .copy() to avoid SettingWithCopyWarning
filtered_games = schedule_df[(schedule_df['date'] >= start_date) & (schedule_df['date'] <= end_date)].copy()

# Create a new column 'game_matchup' by concatenating 'home_team', 'vs.', and 'away_team'
filtered_games['game_matchup'] = filtered_games['home_team'] + ' vs. ' + filtered_games['away_team']

# Now, create a separate DataFrame that contains only the 'game_matchup' column
games_list_df = filtered_games[['game_matchup']].reset_index(drop=True)

# Display the new DataFrame to verify
print(games_list_df)

                       game_matchup
0                   Brest vs. Reims
1                 Nice vs. Le Havre
2            Clermont Foot vs. Lyon
3             Marseille vs. Lorient
4            Montpellier vs. Monaco
5                  Nantes vs. Lille
6  Paris Saint Germain vs. Toulouse
7                   Rennes vs. Lens
8               Strasbourg vs. Metz


In [8]:
#CREATE HOME AND AWAY ATTACK "SCORES" BASED ON THEIR AVERAGES COMAPRED TO LEAGUE WIDE

import pandas as pd

# Assuming 'teams_per_match_averages' and 'games_list_df' are already defined

# Initialize an empty list for storing game metrics along with the matchup information
games_metrics_list = []

# Retrieve the 'Average' row for league-wide average stats
avg_stats = teams_per_match_averages.loc['Average']

# Looping through each game to calculate metrics using 'teams_per_match_averages'
for index, row in games_list_df.iterrows():
    # Extract home and away team names
    home_team, away_team = row['game_matchup'].split(' vs. ')
    
    # Retrieve team stats from 'teams_per_match_averages'
    home_team_stats = teams_per_match_averages.loc[home_team]
    away_team_stats = teams_per_match_averages.loc[away_team]
    
    # Calculate the metrics using the per match averages
    home_attack = home_team_stats['per_match_home_goals'] / avg_stats['per_match_home_goals']
    away_defence = away_team_stats['per_match_away_goals_allowed'] / avg_stats['per_match_away_goals_allowed']
    away_attack = away_team_stats['per_match_away_goals'] / avg_stats['per_match_away_goals']
    home_defence = home_team_stats['per_match_home_goals_allowed'] / avg_stats['per_match_home_goals_allowed']
    
    # Append the calculated metrics for this game to the list along with the game_matchup
    games_metrics_list.append({
        'Game Matchup': row['game_matchup'],
        'Home Attack': home_attack, 
        'Away Defence': away_defence,
        'Away Attack': away_attack, 
        'Home Defence': home_defence
    })

# Once all games are processed, create the DataFrame from the list
games_metrics_df = pd.DataFrame(games_metrics_list)
# Assuming 'games_metrics_df' already exists and contains the 'Game Matchup' column

# Splitting 'Game Matchup' into 'Home Team' and 'Away Team'
games_metrics_df[['Home Team', 'Away Team']] = games_metrics_df['Game Matchup'].str.split(' vs. ', expand=True)

# Merge home team averages
games_metrics_df = games_metrics_df.merge(
    teams_goals_df[['Team', 'Average Home Goals Last 5', 'Average Home Goals Allowed Last 5']],
    left_on='Home Team',
    right_on='Team',
    how='left'
).drop(columns=['Team'])  # Dropping the redundant 'Team' column after merge

# Merge away team averages
games_metrics_df = games_metrics_df.merge(
    teams_goals_df[['Team', 'Average Away Goals Last 5', 'Average Away Goals Allowed Last 5']],
    left_on='Away Team',
    right_on='Team',
    how='left',
    suffixes=('_home', '_away')
).drop(columns=['Team'])  # Dropping the redundant 'Team' column after merge

games_metrics_df


Unnamed: 0,Game Matchup,Home Attack,Away Defence,Away Attack,Home Defence,Home Team,Away Team,Average Home Goals Last 5,Average Home Goals Allowed Last 5,Average Away Goals Last 5,Average Away Goals Allowed Last 5
0,Brest vs. Reims,1.040055,1.213213,1.101403,0.654046,Brest,Reims,1.2,1.2,1.4,2.2
1,Nice vs. Le Havre,0.878269,1.083226,0.80102,0.482988,Nice,Le Havre,1.4,1.4,1.0,2.0
2,Clermont Foot vs. Lyon,0.606699,1.126555,1.151466,1.308092,Clermont Foot,Lyon,1.6,2.2,2.6,2.0
3,Marseille vs. Lorient,1.213398,1.429859,0.80102,0.603735,Marseille,Lorient,2.0,1.2,0.8,2.0
4,Montpellier vs. Monaco,0.866713,1.213213,1.802295,1.056535,Montpellier,Monaco,2.0,1.8,2.6,1.4
5,Nantes vs. Lille,0.69337,0.823252,0.80102,1.408714,Nantes,Lille,0.4,2.6,1.0,1.2
6,Paris Saint Germain vs. Toulouse,1.776761,0.866581,0.901148,0.955913,Paris Saint Germain,Toulouse,2.2,1.6,1.8,0.8
7,Rennes vs. Lens,1.430076,0.779923,0.851084,1.25778,Rennes,Lens,2.2,2.0,1.4,1.4
8,Strasbourg vs. Metz,0.866713,1.256542,0.851084,1.106847,Strasbourg,Metz,1.2,1.6,1.4,1.8


In [9]:
#FACTOR IN LAST 5 GAME AVERAGES AS ITS OWN SEPARATE WIDE COMPARED TO SEASONAL AVERAGES

import pandas as pd

# Assuming 'teams_per_match_averages', 'games_list_df', and 'teams_xg_xga_df' are already defined

# Retrieve season-long averages
season_avg_home_goals = teams_per_match_averages['per_match_home_goals'].mean()
season_avg_away_goals = teams_per_match_averages['per_match_away_goals'].mean()
season_avg_home_goals_allowed = teams_per_match_averages['per_match_home_goals_allowed'].mean()
season_avg_away_goals_allowed = teams_per_match_averages['per_match_away_goals_allowed'].mean()

# Initialize an empty list for storing game metrics along with the matchup information
games_metrics_list = []

# Loop through each game to calculate metrics using 'teams_per_match_averages'
for index, row in games_list_df.iterrows():
    home_team, away_team = row['game_matchup'].split(' vs. ')
    
    # Retrieve team stats from 'teams_per_match_averages'
    home_team_stats = teams_per_match_averages.loc[home_team]
    away_team_stats = teams_per_match_averages.loc[away_team]
    
    # Retrieve recent form from 'teams_xg_xga_df'
    home_recent_goals_scored = teams_goals_df.loc[teams_goals_df['Team'] == home_team, 'Average Home Goals Last 5'].values[0]
    away_recent_goals_scored = teams_goals_df.loc[teams_goals_df['Team'] == away_team, 'Average Away Goals Last 5'].values[0]
    home_recent_goals_allowed = teams_goals_df.loc[teams_goals_df['Team'] == home_team, 'Average Home Goals Allowed Last 5'].values[0]
    away_recent_goals_allowed = teams_goals_df.loc[teams_goals_df['Team'] == away_team, 'Average Away Goals Allowed Last 5'].values[0]
    
    # Compare recent form to season-long averages to get adjustment factors
    home_attack_factor = home_recent_goals_scored / home_team_stats['per_match_home_goals'] if home_team_stats['per_match_home_goals'] != 0 else 1
    away_defence_factor = away_recent_goals_allowed / away_team_stats['per_match_away_goals_allowed'] if away_team_stats['per_match_away_goals_allowed'] != 0 else 1
    away_attack_factor = away_recent_goals_scored / away_team_stats['per_match_away_goals'] if away_team_stats['per_match_away_goals'] != 0 else 1
    home_defence_factor = home_recent_goals_allowed / home_team_stats['per_match_home_goals_allowed'] if home_team_stats['per_match_home_goals_allowed'] != 0 else 1
    
    # Adjust metrics using the factors
    home_attack_adjusted = home_team_stats['per_match_home_goals'] * home_attack_factor / season_avg_home_goals
    away_defence_adjusted = away_team_stats['per_match_away_goals_allowed'] * away_defence_factor / season_avg_away_goals_allowed
    away_attack_adjusted = away_team_stats['per_match_away_goals'] * away_attack_factor / season_avg_away_goals
    home_defence_adjusted = home_team_stats['per_match_home_goals_allowed'] * home_defence_factor / season_avg_home_goals_allowed
    
    # Append the adjusted metrics for this game to the list
    games_metrics_list.append({
        'Game Matchup': row['game_matchup'],
        'Home Attack': home_attack_adjusted,
        'Away Defence': away_defence_adjusted,
        'Away Attack': away_attack_adjusted,
        'Home Defence': home_defence_adjusted
    })

# Create the DataFrame from the list
games_metrics_df = pd.DataFrame(games_metrics_list)

# Split 'Game Matchup' into 'Home Team' and 'Away Team'
games_metrics_df[['Home Team', 'Away Team']] = games_metrics_df['Game Matchup'].str.split(' vs. ', expand=True)

games_metrics_df


Unnamed: 0,Game Matchup,Home Attack,Away Defence,Away Attack,Home Defence,Home Team,Away Team
0,Brest vs. Reims,0.8321,1.525272,1.121255,0.966077,Brest,Reims
1,Nice vs. Le Havre,0.970783,1.386611,0.800897,1.12709,Nice,Le Havre
2,Clermont Foot vs. Lyon,1.109466,1.386611,2.082331,1.771141,Clermont Foot,Lyon
3,Marseille vs. Lorient,1.386833,1.386611,0.640717,0.966077,Marseille,Lorient
4,Montpellier vs. Monaco,1.386833,0.970628,2.082331,1.449116,Montpellier,Monaco
5,Nantes vs. Lille,0.277367,0.831966,0.800897,2.093167,Nantes,Lille
6,Paris Saint Germain vs. Toulouse,1.525516,0.554644,1.441614,1.288103,Paris Saint Germain,Toulouse
7,Rennes vs. Lens,1.525516,0.970628,1.121255,1.610128,Rennes,Lens
8,Strasbourg vs. Metz,0.8321,1.24795,1.121255,1.288103,Strasbourg,Metz


In [10]:
#GENERATE PROJECTED GOALS LIST

# Initialize an empty list for storing projected goals data
projected_goals_list = []

# Retrieve the league-wide average per match home and away xG
league_avg_per_match_home_goals = teams_per_match_averages.loc['Average', 'per_match_home_goals']
league_avg_per_match_away_goals = teams_per_match_averages.loc['Average', 'per_match_away_goals']

# Iterate over each row in the games_metrics_df to calculate projected goals
for index, row in games_metrics_df.iterrows():
    # Extract home and away team names from the game matchup
    home_team, away_team = row['Game Matchup'].split(' vs. ')

    # Calculate metrics using the teams' attack and defence strengths
    home_attack = row['Home Attack']
    away_defence = row['Away Defence']
    away_attack = row['Away Attack']
    home_defence = row['Home Defence']

    # Calculate projected home and away goals using the league-wide average per game xG
    projected_home_goals = home_attack * away_defence * league_avg_per_match_home_goals
    projected_away_goals = away_attack * home_defence * league_avg_per_match_away_goals

    # Calculate total projected goals
    total_projected_goals = projected_home_goals + projected_away_goals

    # Append the results to the list
    projected_goals_list.append({
        'Game Matchup': row['Game Matchup'],
        'Projected Home Goals': projected_home_goals,
        'Projected Away Goals': projected_away_goals,
        'Total Projected Goals': total_projected_goals
    })

# Convert the list to a DataFrame
projected_goals_df = pd.DataFrame(projected_goals_list)

# Display the DataFrame
projected_goals_df


Unnamed: 0,Game Matchup,Projected Home Goals,Projected Away Goals,Total Projected Goals
0,Brest vs. Reims,1.830448,1.3523,3.182748
1,Nice vs. Le Havre,1.941384,1.126916,3.068301
2,Clermont Foot vs. Lyon,2.218725,4.604258,6.822983
3,Marseille vs. Lorient,2.773406,0.772743,3.546149
4,Montpellier vs. Monaco,1.941384,3.76712,5.708505
5,Nantes vs. Lille,0.332809,2.092845,2.425653
6,Paris Saint Germain vs. Toulouse,1.220299,2.318228,3.538527
7,Rennes vs. Lens,2.135523,2.253833,4.389356
8,Strasbourg vs. Metz,1.497639,1.803066,3.300706


In [11]:
#LOADING IN XG STATS NOW

import soccerdata as sd
understat = sd.Understat(leagues="FRA-Ligue 1", seasons="2023/2024")
team_match_stats = understat.read_team_match_stats()
import pandas as pd

# Assuming 'team_match_stats' is your DataFrame loaded from the code you've provided

# Calculating total xG for teams when they are at home
home_xg = team_match_stats.groupby('home_team')['home_xg'].sum().rename('total_home_xg')

# Calculating total xG for teams when they are away
away_xg = team_match_stats.groupby('away_team')['away_xg'].sum().rename('total_away_xg')

# Calculating total xGA (expected goals against) for teams when they are at home
home_xga = team_match_stats.groupby('home_team')['away_xg'].sum().rename('total_home_xga')

# Calculating total xGA for teams when they are away
away_xga = team_match_stats.groupby('away_team')['home_xg'].sum().rename('total_away_xga')

# Merging the above series into a single DataFrame
teams_xg_xga = pd.concat([home_xg, away_xg, home_xga, away_xga], axis=1)

# Filling NaN values with 0, in case some teams didn't play home or away games yet
teams_xg_xga = teams_xg_xga.fillna(0)

# Calculating overall total xG and xGA
teams_xg_xga['total_xg'] = teams_xg_xga['total_home_xg'] + teams_xg_xga['total_away_xg']
teams_xg_xga['total_xga'] = teams_xg_xga['total_home_xga'] + teams_xg_xga['total_away_xga']


import pandas as pd

# Identify all unique teams
teams = pd.concat([team_match_stats['home_team'], team_match_stats['away_team']]).unique()

# Prepare a list to collect each team's average xG and xGA for home and away games
xg_team_averages = []

# Iterate through each team
for team in teams:
    # Filter home games involving the current team
    team_home_games = team_match_stats[team_match_stats['home_team'] == team]
    # Sort home games by date in descending order and select the last 5
    team_last_5_home_games = team_home_games.sort_values(by='date', ascending=False).head(5)
    
    # Filter away games involving the current team
    team_away_games = team_match_stats[team_match_stats['away_team'] == team]
    # Sort away games by date in descending order and select the last 5
    team_last_5_away_games = team_away_games.sort_values(by='date', ascending=False).head(5)
    
    # Calculate average xG and xGA for home games
    team_home_xg_avg = team_last_5_home_games['home_xg'].mean()
    team_home_xga_avg = team_last_5_home_games['away_xg'].mean()
    
    # Calculate average xG and xGA for away games
    team_away_xg_avg = team_last_5_away_games['away_xg'].mean()
    team_away_xga_avg = team_last_5_away_games['home_xg'].mean()
    
    # Append the averages to the list
    xg_team_averages.append({
        'Team': team,
        'Average Home xG Last 5': team_home_xg_avg,
        'Average Home xGA Last 5': team_home_xga_avg,
        'Average Away xG Last 5': team_away_xg_avg,
        'Average Away xGA Last 5': team_away_xga_avg
    })

# Convert the list of averages into a DataFrame
teams_xg_xga_df = pd.DataFrame(xg_team_averages)

# Display the DataFrame

# Count the number of home matches played by each team
home_matches_played = team_match_stats.groupby('home_team')['home_ppda'].count().rename('home_matches_played')

# Count the number of away matches played by each team
away_matches_played = team_match_stats.groupby('away_team')['away_ppda'].count().rename('away_matches_played')

# Merging these counts into the teams_xg_xga DataFrame
teams_xg_xga = teams_xg_xga.merge(home_matches_played, how='left', left_index=True, right_index=True)
teams_xg_xga = teams_xg_xga.merge(away_matches_played, how='left', left_index=True, right_index=True)

# Filling NaN values with 0 in case some teams didn't play home or away games
teams_xg_xga['home_matches_played'] = teams_xg_xga['home_matches_played'].fillna(0)
teams_xg_xga['away_matches_played'] = teams_xg_xga['away_matches_played'].fillna(0)

# Calculating the total matches played by adding home and away matches
teams_xg_xga['matches_played'] = teams_xg_xga['home_matches_played'] + teams_xg_xga['away_matches_played']

teams_xg_xga



Unnamed: 0,total_home_xg,total_away_xg,total_home_xga,total_away_xga,total_xg,total_xga,home_matches_played,away_matches_played,matches_played
Brest,29.708918,19.865128,14.758133,24.50759,49.574046,39.265723,16,16,32
Clermont Foot,19.838297,13.936614,28.05492,34.78967,33.774911,62.84459,16,16,32
Le Havre,22.716718,13.58552,20.868308,30.178028,36.302238,51.046336,16,16,32
Lens,33.825007,22.461238,20.849289,25.341595,56.286245,46.190884,16,16,32
Lille,34.641453,17.672871,15.890955,22.542001,52.314324,38.432956,16,16,32
Lorient,18.265608,14.687278,26.220587,30.074432,32.952886,56.295019,16,16,32
Lyon,29.275876,22.347663,25.959336,26.777911,51.623539,52.737247,16,16,32
Marseille,32.267001,19.664487,13.414974,19.912842,51.931488,33.327816,16,15,31
Metz,15.359754,16.905196,19.832621,34.717422,32.26495,54.550043,16,16,32
Monaco,30.17571,30.715155,16.762474,27.490906,60.890865,44.25338,16,16,32


In [12]:
#SAME CLEANING OF DATA WE DID FOR GOAL STATS


# Calculate the averages for each specified column
xg_averages = teams_xg_xga.mean()

# Creating a new DataFrame with the averages to append it properly with a label
xg_averages_df = pd.DataFrame([xg_averages], index=['Average'])

# Append the averages row to your original DataFrame
teams_xg_xga_with_averages = pd.concat([teams_xg_xga, xg_averages_df])

teams_xg_xga_with_averages

Unnamed: 0,total_home_xg,total_away_xg,total_home_xga,total_away_xga,total_xg,total_xga,home_matches_played,away_matches_played,matches_played
Brest,29.708918,19.865128,14.758133,24.50759,49.574046,39.265723,16.0,16.0,32.0
Clermont Foot,19.838297,13.936614,28.05492,34.78967,33.774911,62.84459,16.0,16.0,32.0
Le Havre,22.716718,13.58552,20.868308,30.178028,36.302238,51.046336,16.0,16.0,32.0
Lens,33.825007,22.461238,20.849289,25.341595,56.286245,46.190884,16.0,16.0,32.0
Lille,34.641453,17.672871,15.890955,22.542001,52.314324,38.432956,16.0,16.0,32.0
Lorient,18.265608,14.687278,26.220587,30.074432,32.952886,56.295019,16.0,16.0,32.0
Lyon,29.275876,22.347663,25.959336,26.777911,51.623539,52.737247,16.0,16.0,32.0
Marseille,32.267001,19.664487,13.414974,19.912842,51.931488,33.327816,16.0,15.0,31.0
Metz,15.359754,16.905196,19.832621,34.717422,32.26495,54.550043,16.0,16.0,32.0
Monaco,30.17571,30.715155,16.762474,27.490906,60.890865,44.25338,16.0,16.0,32.0


In [13]:
#MORE REPETITIVE NONSENSE

import numpy as np

# Create a new DataFrame for per match averages
xg_teams_per_match_averages = pd.DataFrame(index=teams_xg_xga.index)

# Calculate per match averages for home and away xG and xGA
xg_teams_per_match_averages['per_match_home_xg'] = teams_xg_xga['total_home_xg'] / teams_xg_xga['home_matches_played']
xg_teams_per_match_averages['per_match_away_xg'] = teams_xg_xga['total_away_xg'] / teams_xg_xga['away_matches_played']
xg_teams_per_match_averages['per_match_home_xga'] = teams_xg_xga['total_home_xga'] / teams_xg_xga['home_matches_played']
xg_teams_per_match_averages['per_match_away_xga'] = teams_xg_xga['total_away_xga'] / teams_xg_xga['away_matches_played']

# Calculate overall total xG and xGA per match
xg_teams_per_match_averages['total_xg_per_match'] = teams_xg_xga['total_xg'] / teams_xg_xga['matches_played']
xg_teams_per_match_averages['total_xga_per_match'] = teams_xg_xga['total_xga'] / teams_xg_xga['matches_played']

# Replace any potential infinite values with NaN (in case of division by zero) and then fill with 0
xg_teams_per_match_averages.replace([np.inf, -np.inf], np.nan, inplace=True)
xg_teams_per_match_averages.fillna(0, inplace=True)

# Calculate the average for each column
xg_average_stats = xg_teams_per_match_averages.mean()

# Append the 'Average' row with these averages to the DataFrame
xg_teams_per_match_averages.loc['Average'] = xg_average_stats

xg_teams_per_match_averages


# Display the new DataFrame to verify
print(games_list_df)

                       game_matchup
0                   Brest vs. Reims
1                 Nice vs. Le Havre
2            Clermont Foot vs. Lyon
3             Marseille vs. Lorient
4            Montpellier vs. Monaco
5                  Nantes vs. Lille
6  Paris Saint Germain vs. Toulouse
7                   Rennes vs. Lens
8               Strasbourg vs. Metz


In [14]:
#LITTLE BIT MORE REPETETITVE NONSENSE

import pandas as pd

# Assuming 'teams_per_match_averages' and 'games_list_df' are already defined

# Initialize an empty list for storing game metrics along with the matchup information
xg_games_metrics_list = []

# Retrieve the 'Average' row for league-wide average stats
xg_avg_stats = xg_teams_per_match_averages.loc['Average']

# Looping through each game to calculate metrics using 'teams_per_match_averages'
for index, row in games_list_df.iterrows():
    # Extract home and away team names
    home_team, away_team = row['game_matchup'].split(' vs. ')
    
    # Retrieve team stats from 'teams_per_match_averages'
    xg_home_team_stats = xg_teams_per_match_averages.loc[home_team]
    xg_away_team_stats = xg_teams_per_match_averages.loc[away_team]
    
    # Calculate the metrics using the per match averages
    xg_home_attack = xg_home_team_stats['per_match_home_xg'] / xg_avg_stats['per_match_home_xg']
    xg_away_defence = xg_away_team_stats['per_match_away_xga'] / xg_avg_stats['per_match_away_xga']
    xg_away_attack = xg_away_team_stats['per_match_away_xg'] / xg_avg_stats['per_match_away_xg']
    xg_home_defence = xg_home_team_stats['per_match_home_xga'] / xg_avg_stats['per_match_home_xga']
    
    # Append the calculated metrics for this game to the list along with the game_matchup
    xg_games_metrics_list.append({
        'Game Matchup': row['game_matchup'],
        'Home Attack': xg_home_attack, 
        'Away Defence': xg_away_defence,
        'Away Attack': xg_away_attack, 
        'Home Defence': xg_home_defence
    })

# Once all games are processed, create the DataFrame from the list
xg_games_metrics_df = pd.DataFrame(xg_games_metrics_list)
# Assuming 'games_metrics_df' already exists and contains the 'Game Matchup' column

# Splitting 'Game Matchup' into 'Home Team' and 'Away Team'
xg_games_metrics_df[['Home Team', 'Away Team']] = xg_games_metrics_df['Game Matchup'].str.split(' vs. ', expand=True)

# Merge home team averages
xg_games_metrics_df = xg_games_metrics_df.merge(
    teams_xg_xga_df[['Team', 'Average Home xG Last 5', 'Average Home xGA Last 5']],
    left_on='Home Team',
    right_on='Team',
    how='left'
).drop(columns=['Team'])  # Dropping the redundant 'Team' column after merge

# Merge away team averages
xg_games_metrics_df = xg_games_metrics_df.merge(
    teams_xg_xga_df[['Team', 'Average Away xG Last 5', 'Average Away xGA Last 5']],
    left_on='Away Team',
    right_on='Team',
    how='left',
    suffixes=('_home', '_away')
).drop(columns=['Team'])  # Dropping the redundant 'Team' column after merge


xg_games_metrics_df


Unnamed: 0,Game Matchup,Home Attack,Away Defence,Away Attack,Home Defence,Home Team,Away Team,Average Home xG Last 5,Average Home xGA Last 5,Average Away xG Last 5,Average Away xGA Last 5
0,Brest vs. Reims,1.116594,0.977099,1.111136,0.722915,Brest,Reims,1.69096,1.199274,1.557678,2.050426
1,Nice vs. Le Havre,1.040691,1.134391,0.663177,0.79453,Nice,Le Havre,1.873822,1.710605,0.924668,1.99004
2,Clermont Foot vs. Lyon,0.745612,1.00658,1.090901,1.374247,Clermont Foot,Lyon,1.350101,1.81997,1.976811,1.531529
3,Marseille vs. Lorient,1.212738,1.130497,0.71696,0.657121,Marseille,Lorient,1.904852,1.205141,1.093665,2.485038
4,Montpellier vs. Monaco,1.099521,1.033382,1.499361,1.050526,Montpellier,Monaco,2.097512,1.648906,1.974794,1.635249
5,Nantes vs. Lille,0.758084,0.847353,0.862701,1.519437,Nantes,Lille,1.014146,2.252502,0.938437,1.48337
6,Paris Saint Germain vs. Toulouse,1.233778,1.073698,0.933052,0.652562,Paris Saint Germain,Toulouse,2.269112,0.932542,1.422354,1.215606
7,Rennes vs. Lens,1.230717,0.952589,1.096446,1.032729,Rennes,Lens,2.114336,1.282472,1.742995,1.470353
8,Strasbourg vs. Metz,0.868499,1.305026,0.825227,1.115097,Strasbourg,Metz,1.244087,1.327011,1.599972,2.151079


In [15]:
#BUILDING HOME AND AWAY ATTACK "SCORES" FOR XG NOW USING BOTH SEASONAL AND LAST 5 AVERAGES IN WEIGHTING

import pandas as pd

# Assuming 'teams_per_match_averages', 'games_list_df', and 'teams_xg_xga_df' are already defined

# Retrieve season-long averages
season_avg_home_xg = xg_teams_per_match_averages['per_match_home_xg'].mean()
season_avg_away_xg = xg_teams_per_match_averages['per_match_away_xg'].mean()
season_avg_home_xga = xg_teams_per_match_averages['per_match_home_xga'].mean()
season_avg_away_xga = xg_teams_per_match_averages['per_match_away_xga'].mean()

# Initialize an empty list for storing game metrics along with the matchup information
xg_games_metrics_list = []

# Loop through each game to calculate metrics using 'teams_per_match_averages'
for index, row in games_list_df.iterrows():
    home_team, away_team = row['game_matchup'].split(' vs. ')
    
    # Retrieve team stats from 'teams_per_match_averages'
    xg_home_team_stats = xg_teams_per_match_averages.loc[home_team]
    xg_away_team_stats = xg_teams_per_match_averages.loc[away_team]
    
    # Retrieve recent form from 'teams_xg_xga_df'
    home_recent_xg = teams_xg_xga_df.loc[teams_xg_xga_df['Team'] == home_team, 'Average Home xG Last 5'].values[0]
    away_recent_xg = teams_xg_xga_df.loc[teams_xg_xga_df['Team'] == away_team, 'Average Away xG Last 5'].values[0]
    home_recent_xga = teams_xg_xga_df.loc[teams_xg_xga_df['Team'] == home_team, 'Average Home xGA Last 5'].values[0]
    away_recent_xga = teams_xg_xga_df.loc[teams_xg_xga_df['Team'] == away_team, 'Average Away xGA Last 5'].values[0]
    
    # Compare recent form to season-long averages to get adjustment factors
    xg_home_attack_factor = home_recent_xg / xg_home_team_stats['per_match_home_xg'] if xg_home_team_stats['per_match_home_xg'] != 0 else 1
    xg_away_defence_factor = away_recent_xga / xg_away_team_stats['per_match_away_xga'] if xg_away_team_stats['per_match_away_xga'] != 0 else 1
    xg_away_attack_factor = away_recent_xg / xg_away_team_stats['per_match_away_xg'] if xg_away_team_stats['per_match_away_xg'] != 0 else 1
    xg_home_defence_factor = home_recent_xga / xg_home_team_stats['per_match_home_xga'] if xg_home_team_stats['per_match_home_xga'] != 0 else 1
    
    # Adjust metrics using the factors
    xg_home_attack_adjusted = xg_home_team_stats['per_match_home_xg'] * xg_home_attack_factor / season_avg_home_xg
    xg_away_defence_adjusted = xg_away_team_stats['per_match_away_xga'] * xg_away_defence_factor / season_avg_away_xga
    xg_away_attack_adjusted = xg_away_team_stats['per_match_away_xg'] * xg_away_attack_factor / season_avg_away_xg
    xg_home_defence_adjusted = xg_home_team_stats['per_match_home_xga'] * xg_home_defence_factor / season_avg_home_xga
    
    # Append the adjusted metrics for this game to the list
    xg_games_metrics_list.append({
        'Game Matchup': row['game_matchup'],
        'XG Home Attack': xg_home_attack_adjusted,
        'XG Away Defence': xg_away_defence_adjusted,
        'XG Away Attack': xg_away_attack_adjusted,
        'XG Home Defence': xg_home_defence_adjusted
    })

# Create the DataFrame from the list
xg_games_metrics_df = pd.DataFrame(xg_games_metrics_list)

# Split 'Game Matchup' into 'Home Team' and 'Away Team'
xg_games_metrics_df[['Home Team', 'Away Team']] = xg_games_metrics_df['Game Matchup'].str.split(' vs. ', expand=True)

xg_games_metrics_df


Unnamed: 0,Game Matchup,XG Home Attack,XG Away Defence,XG Away Attack,XG Home Defence,Home Team,Away Team
0,Brest vs. Reims,1.016862,1.233207,1.216609,0.939927,Brest,Reims
1,Nice vs. Le Havre,1.126826,1.196888,0.722203,1.340681,Nice,Le Havre
2,Clermont Foot vs. Lyon,0.811885,0.921122,1.543969,1.426395,Clermont Foot,Lyon
3,Marseille vs. Lorient,1.145486,1.494599,0.854196,0.944525,Marseille,Lorient
4,Montpellier vs. Monaco,1.261342,0.983503,1.542394,1.292325,Montpellier,Monaco
5,Nantes vs. Lille,0.609858,0.892157,0.732957,1.765391,Nantes,Lille
6,Paris Saint Germain vs. Toulouse,1.364534,0.731113,1.110916,0.730877,Paris Saint Germain,Toulouse
7,Rennes vs. Lens,1.271459,0.884328,1.361349,1.005133,Rennes,Lens
8,Strasbourg vs. Metz,0.748134,1.293744,1.249643,1.040041,Strasbourg,Metz


In [16]:
#GENERATE XG PROJECTED LIST

# Initialize an empty list for storing projected goals data
projected_xg_list = []

# Retrieve the league-wide average per match home and away xG
league_avg_per_match_home_xg = xg_teams_per_match_averages.loc['Average', 'per_match_home_xg']
league_avg_per_match_away_xg = xg_teams_per_match_averages.loc['Average', 'per_match_away_xg']

# Iterate over each row in the games_metrics_df to calculate projected goals
for index, row in xg_games_metrics_df.iterrows():
    # Extract home and away team names from the game matchup
    home_team, away_team = row['Game Matchup'].split(' vs. ')

    # Calculate metrics using the teams' attack and defence strengths
    xg_home_attack_adjusted = row['XG Home Attack']
    xg_away_defence_adjusted = row['XG Away Defence']
    xg_away_attack_adjusted = row['XG Away Attack']
    xg_home_defence_adjusted = row['XG Home Defence']

    # Calculate projected home and away goals using the league-wide average per game xG
    projected_home_xg = xg_home_attack_adjusted * xg_away_defence_adjusted * league_avg_per_match_home_xg
    projected_away_xg = xg_away_attack_adjusted * xg_home_defence_adjusted * league_avg_per_match_away_xg

    # Calculate total projected goals
    total_projected_xg = projected_home_xg + projected_away_xg

    # Append the results to the list
    projected_xg_list.append({
        'Game Matchup': row['Game Matchup'],
        'Projected Home Goals': projected_home_xg,
        'Projected Away Goals': projected_away_xg,
        'Total Projected Goals': total_projected_xg
    })

# Convert the list to a DataFrame
projected_xg_df = pd.DataFrame(projected_xg_list)

# Display the DataFrame
projected_xg_df


Unnamed: 0,Game Matchup,Projected Home Goals,Projected Away Goals,Total Projected Goals
0,Brest vs. Reims,2.085303,1.464104,3.549407
1,Nice vs. Le Havre,2.242755,1.239685,3.48244
2,Clermont Foot vs. Lyon,1.243607,2.819713,4.06332
3,Marseille vs. Lorient,2.846991,1.032994,3.879985
4,Montpellier vs. Monaco,2.062909,2.552075,4.614984
5,Nantes vs. Lille,0.904777,1.656708,2.561485
6,Paris Saint Germain vs. Toulouse,1.658978,1.039566,2.698543
7,Rennes vs. Lens,1.869766,1.751942,3.621708
8,Strasbourg vs. Metz,1.60953,1.664036,3.273566


In [17]:
#GENERATING PROJECTED GOALS LIST NOW USING BOTH SETS OF STATS, 70% WEIGHT ON XG STATS AND 30% ON GOALS

# Weighting factors
weight_goals = 0.3
weight_xg = 0.7

# Extension to include separate projections for goals and xG

# Initialize additional lists for storing individual projected goals data
combined_projected_goals_list = []
projected_goals_list = []
projected_xg_list = []

# Iterate over each row in the games_metrics_df to calculate individual and combined projected goals
for index, row in games_metrics_df.iterrows():
    game_matchup = row['Game Matchup']
    
    # Find the corresponding xG metrics for the same game matchup
    xg_row = xg_games_metrics_df[xg_games_metrics_df['Game Matchup'] == game_matchup].iloc[0]
    
    # Individual projections
    projected_goals_home = row['Home Attack'] * row['Away Defence'] * league_avg_per_match_home_goals
    projected_goals_away = row['Away Attack'] * row['Home Defence'] * league_avg_per_match_away_goals
    projected_xg_home = xg_row['XG Home Attack'] * xg_row['XG Away Defence'] * league_avg_per_match_home_goals
    projected_xg_away = xg_row['XG Away Attack'] * xg_row['XG Home Defence'] * league_avg_per_match_away_goals
    
    # Combine metrics using the weights for both goals and xG
    combined_home_attack = (row['Home Attack'] * weight_goals) + (xg_row['XG Home Attack'] * weight_xg)
    combined_away_defence = (row['Away Defence'] * weight_goals) + (xg_row['XG Away Defence'] * weight_xg)
    combined_away_attack = (row['Away Attack'] * weight_goals) + (xg_row['XG Away Attack'] * weight_xg)
    combined_home_defence = (row['Home Defence'] * weight_goals) + (xg_row['XG Home Defence'] * weight_xg)
    
    # Calculate combined projected home and away goals
    combined_projected_home_goals = combined_home_attack * combined_away_defence * league_avg_per_match_home_goals
    combined_projected_away_goals = combined_away_attack * combined_home_defence * league_avg_per_match_away_goals

    # Calculate total projected goals for individual and combined projections
    total_projected_goals_combined = combined_projected_home_goals + combined_projected_away_goals
    total_projected_goals = projected_goals_home + projected_goals_away
    total_projected_xg = projected_xg_home + projected_xg_away

    # Append the results to the lists
    projected_goals_list.append({'Game Matchup': game_matchup, 'Projected Goals Home': projected_goals_home, 'Projected Goals Away': projected_goals_away, 'Total Projected Goals': total_projected_goals})
    projected_xg_list.append({'Game Matchup': game_matchup, 'Projected XG Home': projected_xg_home, 'Projected XG Away': projected_xg_away, 'Total Projected XG': total_projected_xg})
    combined_projected_goals_list.append({'Game Matchup': game_matchup, 'Combined Projected Home Goals': combined_projected_home_goals, 'Combined Projected Away Goals': combined_projected_away_goals, 'Combined Total Projected Goals': total_projected_goals_combined})

# Convert the lists to DataFrames
projected_goals_df = pd.DataFrame(projected_goals_list)
projected_xg_df = pd.DataFrame(projected_xg_list)
combined_projected_goals_df = pd.DataFrame(combined_projected_goals_list)



In [18]:
projected_goals_df

Unnamed: 0,Game Matchup,Projected Goals Home,Projected Goals Away,Total Projected Goals
0,Brest vs. Reims,1.830448,1.3523,3.182748
1,Nice vs. Le Havre,1.941384,1.126916,3.068301
2,Clermont Foot vs. Lyon,2.218725,4.604258,6.822983
3,Marseille vs. Lorient,2.773406,0.772743,3.546149
4,Montpellier vs. Monaco,1.941384,3.76712,5.708505
5,Nantes vs. Lille,0.332809,2.092845,2.425653
6,Paris Saint Germain vs. Toulouse,1.220299,2.318228,3.538527
7,Rennes vs. Lens,2.135523,2.253833,4.389356
8,Strasbourg vs. Metz,1.497639,1.803066,3.300706


In [19]:
projected_xg_df

Unnamed: 0,Game Matchup,Projected XG Home,Projected XG Away,Total Projected XG
0,Brest vs. Reims,1.808559,1.427585,3.236143
1,Nice vs. Le Havre,1.945115,1.208763,3.153878
2,Clermont Foot vs. Lyon,1.078565,2.749381,3.827946
3,Marseille vs. Lorient,2.469161,1.007228,3.476389
4,Montpellier vs. Monaco,1.789136,2.488419,4.277555
5,Nantes vs. Lille,0.784702,1.615385,2.400087
6,Paris Saint Germain vs. Toulouse,1.438811,1.013636,2.452447
7,Rennes vs. Lens,1.621626,1.708244,3.329869
8,Strasbourg vs. Metz,1.395926,1.62253,3.018456


In [20]:
combined_projected_goals_df

Unnamed: 0,Game Matchup,Combined Projected Home Goals,Combined Projected Away Goals,Combined Total Projected Goals
0,Brest vs. Reims,1.831469,1.405653,3.237122
1,Nice vs. Le Havre,1.952962,1.188616,3.141578
2,Clermont Foot vs. Lyon,1.37866,3.257187,4.635846
3,Marseille vs. Lorient,2.568328,0.938089,3.506417
4,Montpellier vs. Monaco,1.8353,2.849835,4.685135
5,Nantes vs. Lille,0.643073,1.752785,2.395857
6,Paris Saint Germain vs. Toulouse,1.381862,1.356703,2.738565
7,Rennes vs. Lens,1.769154,1.910001,3.679156
8,Strasbourg vs. Metz,1.427605,1.685041,3.112645


In [21]:
#SIMULATE RESULTS 1000 TIMES TO GENERATE WIN PROBABILITY PERCENTAGES

import numpy as np

# Number of simulations to run for each game
n_simulations = 10000

# Initialize lists to store the win/draw probabilities
home_win_probs = []
away_win_probs = []
draw_probs = []

for index, row in combined_projected_goals_df.iterrows():
    home_goals_proj = row['Combined Projected Home Goals']
    away_goals_proj = row['Combined Projected Away Goals']

    # Simulate match outcomes
    home_goals_sim = np.random.poisson(home_goals_proj, n_simulations)
    away_goals_sim = np.random.poisson(away_goals_proj, n_simulations)

    # Calculate outcomes
    home_wins = np.sum(home_goals_sim > away_goals_sim)
    away_wins = np.sum(away_goals_sim > home_goals_sim)
    draws = np.sum(home_goals_sim == away_goals_sim)

    # Calculate probabilities
    home_win_prob = home_wins / n_simulations
    away_win_prob = away_wins / n_simulations
    draw_prob = draws / n_simulations

    # Append probabilities to lists
    home_win_probs.append(home_win_prob)
    away_win_probs.append(away_win_prob)
    draw_probs.append(draw_prob)

# Add the calculated probabilities to the projected_goals_df DataFrame
combined_projected_goals_df['Home Win Probability'] = home_win_probs
combined_projected_goals_df['Away Win Probability'] = away_win_probs
combined_projected_goals_df['Draw Probability'] = draw_probs

# Convert probabilities to percentage format and append '%' sign
combined_projected_goals_df['Home Win Probability'] = (combined_projected_goals_df['Home Win Probability'] * 100).apply(lambda x: f'{x:.2f}%')
combined_projected_goals_df['Away Win Probability'] = (combined_projected_goals_df['Away Win Probability'] * 100).apply(lambda x: f'{x:.2f}%')
combined_projected_goals_df['Draw Probability'] = (combined_projected_goals_df['Draw Probability'] * 100).apply(lambda x: f'{x:.2f}%')

# Display the updated DataFrame with formatted probabilities
combined_projected_goals_df



Unnamed: 0,Game Matchup,Combined Projected Home Goals,Combined Projected Away Goals,Combined Total Projected Goals,Home Win Probability,Away Win Probability,Draw Probability
0,Brest vs. Reims,1.831469,1.405653,3.237122,47.78%,29.31%,22.91%
1,Nice vs. Le Havre,1.952962,1.188616,3.141578,55.50%,23.05%,21.45%
2,Clermont Foot vs. Lyon,1.37866,3.257187,4.635846,12.42%,73.99%,13.59%
3,Marseille vs. Lorient,2.568328,0.938089,3.506417,72.96%,11.21%,15.83%
4,Montpellier vs. Monaco,1.8353,2.849835,4.685135,23.08%,60.00%,16.92%
5,Nantes vs. Lille,0.643073,1.752785,2.395857,13.02%,64.79%,22.19%
6,Paris Saint Germain vs. Toulouse,1.381862,1.356703,2.738565,37.90%,35.77%,26.33%
7,Rennes vs. Lens,1.769154,1.910001,3.679156,36.13%,42.14%,21.73%
8,Strasbourg vs. Metz,1.427605,1.685041,3.112645,32.81%,43.60%,23.59%


In [22]:
#LOADING IN ODDS DATA FOR LIST OF GAMES IN GAMES LIST DF

import requests
# An api key is emailed to you when you sign up to a plan
# Get a free API key at https://api.the-odds-api.com/
API_KEY = 'fbfe4cd6eb74292626a2d30469872e96'

SPORT = 'soccer' # use the sport_key from the /sports endpoint below, or use 'upcoming' to see the next 8 games across all sports

REGIONS = 'us' # uk | us | eu | au. Multiple can be specified if comma delimited

MARKETS = 'h2h,spreads' # h2h | spreads | totals. Multiple can be specified if comma delimited

ODDS_FORMAT = 'decimal' # decimal | american

DATE_FORMAT = 'iso' # iso | unix

import requests

# Your API key for the odds API
api_key = 'fbfe4cd6eb74292626a2d30469872e96'

# Define the endpoint URL
url = 'https://api.the-odds-api.com/v4/sports/soccer_france_ligue_one/odds/'

# Parameters for the API request
params = {
    'apiKey': api_key,
    'regions': 'us',  # Adjust if targeting a different region
    'markets': 'h2h',  # Head-to-head odds; adjust if looking for different market types
    'bookmakers': 'bovada',
    'oddsFormat': 'decimal'
}

# Make the GET request
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Convert the response to JSON
    odds_data = response.json()
    # Process your data here
    print(odds_data)
else:
    print(f"Error fetching data: {response.status_code}")

# Initialize an empty list to hold the simplified odds data
simplified_odds_list = []

# Iterate over the odds data
for event in odds_data:
    # Extract the essential information
    event_name = event.get('sport_title', 'No sport title')
    home_team = event.get('home_team', 'No home team')
    away_team = event.get('away_team', 'No away team')
    start_time = event.get('commence_time', 'No start time')
    
    # Initialize default odds
    home_team_odds = 'N/A'
    away_team_odds = 'N/A'
    draw_odds = 'N/A'  # Initialize draw odds

    # If there are bookmakers data available
    if event.get('bookmakers'):
        # Look for the Bovada bookmaker in the list
        for bookmaker in event['bookmakers']:
            if bookmaker['key'].lower() == 'bovada':  # Ensure case-insensitive matching
                # Assuming 'markets' is a list and we're interested in the first item (h2h)
                markets = bookmaker.get('markets', [])
                if markets:
                    # Assuming the structure of 'outcomes' in the first market
                    outcomes = markets[0].get('outcomes', [])
                    for outcome in outcomes:
                        # Assign odds based on the team or if it's a draw
                        if outcome['name'] == home_team:
                            home_team_odds = outcome.get('price', 'N/A')
                        elif outcome['name'] == away_team:
                            away_team_odds = outcome.get('price', 'N/A')
                        elif outcome['name'].lower() == 'draw':  # Checking for a draw
                            draw_odds = outcome.get('price', 'N/A')
                
                # Once we've found Bovada and extracted the odds, no need to check other bookmakers
                break

    # Create a simplified representation of the event including draw odds
    simplified_event = f"{event_name}: {home_team} vs. {away_team}, Date: {start_time}, Home Odds: {home_team_odds}, Away Odds: {away_team_odds}, Draw Odds: {draw_odds}"
    
    # Add to the list
    simplified_odds_list.append(simplified_event)

# Now, 'simplified_odds_list' contains a more readable version of the odds data, including draw odds
for item in simplified_odds_list:
    print(item)

odds_data_list = []

for event in odds_data:
    home_team = event.get('home_team')
    away_team = event.get('away_team')
    start_time = event.get('commence_time')  # Assuming datetime format or additional parsing might be needed
    
    # Initialize odds
    home_odds = 'N/A'
    away_odds = 'N/A'
    draw_odds = 'N/A'  # Initialize draw odds variable

    # Iterate through bookmakers to find Bovada and extract the odds
    for bookmaker in event.get('bookmakers', []):
        if bookmaker['key'].lower() == 'bovada':  # Ensure case-insensitive matching
            markets = bookmaker.get('markets', [])
            if markets:
                outcomes = markets[0].get('outcomes', [])
                for outcome in outcomes:
                    # Check and assign odds based on the team or if it's a draw
                    if outcome['name'] == home_team:
                        home_odds = outcome.get('price', 'N/A')
                    elif outcome['name'] == away_team:
                        away_odds = outcome.get('price', 'N/A')
                    elif outcome['name'].lower() == 'draw':  # Check for draw odds
                        draw_odds = outcome.get('price', 'N/A')

    # Append the data including draw odds to the list
    odds_data_list.append({
        'Game Matchup': f"{home_team} vs. {away_team}",
        'Home Odds': home_odds,
        'Away Odds': away_odds,
        'Draw Odds': draw_odds,  # Include draw odds
        'Start Time': start_time
    })

# Convert the list to a DataFrame
odds_df = pd.DataFrame(odds_data_list)
name_replacements = {
    'RC Lens': 'Lens',
    'AS Monaco': 'Monaco',
    'SC Freiburg': 'Freiburg',
    'Stade de Reims': 'Reims',
    'Clermont': 'Clermont Foot'
}
import pandas as pd

# Assuming odds_df is already defined

# Apply replacements in the 'Game Matchup' column
for original_name, new_name in name_replacements.items():
    odds_df['Game Matchup'] = odds_df['Game Matchup'].str.replace(original_name, new_name, regex=False)

# Assuming 'projected_goals_df' exists and has a 'Game Matchup' column
merged_df = pd.merge(combined_projected_goals_df, odds_df, on='Game Matchup', how='left')

# Now 'merged_df' contains both the projected probabilities and the odds
merged_df




[{'id': '22603591f09094d288c12d179d51677a', 'sport_key': 'soccer_france_ligue_one', 'sport_title': 'Ligue 1 - France', 'commence_time': '2024-05-10T19:00:00Z', 'home_team': 'Brest', 'away_team': 'Stade de Reims', 'bookmakers': [{'key': 'bovada', 'title': 'Bovada', 'last_update': '2024-05-09T07:55:14Z', 'markets': [{'key': 'h2h', 'last_update': '2024-05-09T07:55:14Z', 'outcomes': [{'name': 'Brest', 'price': 1.69}, {'name': 'Stade de Reims', 'price': 4.7}, {'name': 'Draw', 'price': 4.1}]}]}]}, {'id': '6628de4abd8f55adeed78e8c2170b68f', 'sport_key': 'soccer_france_ligue_one', 'sport_title': 'Ligue 1 - France', 'commence_time': '2024-05-10T19:00:00Z', 'home_team': 'Nice', 'away_team': 'Le Havre', 'bookmakers': [{'key': 'bovada', 'title': 'Bovada', 'last_update': '2024-05-09T07:55:14Z', 'markets': [{'key': 'h2h', 'last_update': '2024-05-09T07:55:14Z', 'outcomes': [{'name': 'Le Havre', 'price': 6.0}, {'name': 'Nice', 'price': 1.53}, {'name': 'Draw', 'price': 4.2}]}]}]}, {'id': '0d213e9fda164

Unnamed: 0,Game Matchup,Combined Projected Home Goals,Combined Projected Away Goals,Combined Total Projected Goals,Home Win Probability,Away Win Probability,Draw Probability,Home Odds,Away Odds,Draw Odds,Start Time
0,Brest vs. Reims,1.831469,1.405653,3.237122,47.78%,29.31%,22.91%,1.69,4.7,4.1,2024-05-10T19:00:00Z
1,Nice vs. Le Havre,1.952962,1.188616,3.141578,55.50%,23.05%,21.45%,1.53,6.0,4.2,2024-05-10T19:00:00Z
2,Clermont Foot vs. Lyon,1.37866,3.257187,4.635846,12.42%,73.99%,13.59%,5.2,1.59,4.5,2024-05-12T19:00:00Z
3,Marseille vs. Lorient,2.568328,0.938089,3.506417,72.96%,11.21%,15.83%,1.4,7.0,5.2,2024-05-12T19:00:00Z
4,Montpellier vs. Monaco,1.8353,2.849835,4.685135,23.08%,60.00%,16.92%,3.7,1.83,4.2,2024-05-12T19:00:00Z
5,Nantes vs. Lille,0.643073,1.752785,2.395857,13.02%,64.79%,22.19%,3.9,1.95,3.55,2024-05-12T19:00:00Z
6,Paris Saint Germain vs. Toulouse,1.381862,1.356703,2.738565,37.90%,35.77%,26.33%,1.41,6.0,5.6,2024-05-12T19:00:00Z
7,Rennes vs. Lens,1.769154,1.910001,3.679156,36.13%,42.14%,21.73%,2.5,2.75,3.45,2024-05-12T19:00:00Z
8,Strasbourg vs. Metz,1.427605,1.685041,3.112645,32.81%,43.60%,23.59%,2.2,3.15,3.65,2024-05-12T19:00:00Z


In [23]:
#CREATING IMPLIED ODDS BASED OFF THE WIN PROBABILITY

# Convert percentage strings to decimal probabilities
merged_df['Home Win Probability'] = merged_df['Home Win Probability'].str.rstrip('%').astype(float) / 100
merged_df['Away Win Probability'] = merged_df['Away Win Probability'].str.rstrip('%').astype(float) / 100
merged_df['Draw Probability'] = merged_df['Draw Probability'].str.rstrip('%').astype(float) / 100
# Calculate implied odds
merged_df['Implied Home Odds'] = 1 / merged_df['Home Win Probability']
merged_df['Implied Away Odds'] = 1 / merged_df['Away Win Probability']
merged_df['Implied Draw Odds'] = 1 / merged_df['Draw Probability']
# Create a new DataFrame with implied odds
implied_odds_df = merged_df[['Game Matchup', 'Implied Home Odds', 'Implied Away Odds', 'Implied Draw Odds', 'Home Odds', 'Away Odds', 'Draw Odds']].copy()


In [29]:
# Ensure the data types are float for the calculations
columns_to_convert = ['Implied Home Odds', 'Home Odds', 'Implied Away Odds', 'Away Odds', 'Implied Draw Odds', 'Draw Odds']
implied_odds_df[columns_to_convert] = implied_odds_df[columns_to_convert].astype(float)

# Calculate edge for home win
implied_odds_df['Home Win Edge'] = round((1 / implied_odds_df['Implied Home Odds']) * implied_odds_df['Home Odds'] - 1, 2)

# Calculate edge for away win
implied_odds_df['Away Win Edge'] = round((1 / implied_odds_df['Implied Away Odds']) * implied_odds_df['Away Odds'] - 1, 2)

# Calculate edge for draw
implied_odds_df['Draw Edge'] = round((1 / implied_odds_df['Implied Draw Odds']) * implied_odds_df['Draw Odds'] - 1, 2)

# Round implied odds to 2 decimal points
implied_odds_df['Implied Home Odds'] = round(implied_odds_df['Implied Home Odds'], 2)
implied_odds_df['Implied Away Odds'] = round(implied_odds_df['Implied Away Odds'], 2)
implied_odds_df['Implied Draw Odds'] = round(implied_odds_df['Implied Draw Odds'], 2)


In [30]:
#MERGING ODDS DATA BACK WITH ORIGINAL DATAFRAME FOR PROJECTIONS

# Merge the implied odds DataFrame with the projected goals DataFrame on the 'Game Matchup' column
implied_odds_with_goals = implied_odds_df.merge(combined_projected_goals_df, on='Game Matchup', how='left')

# Display the DataFrame with projected goals added
implied_odds_with_goals


Unnamed: 0,Game Matchup,Implied Home Odds,Implied Away Odds,Implied Draw Odds,Home Odds,Away Odds,Draw Odds,Home Win Edge,Away Win Edge,Draw Edge,Combined Projected Home Goals,Combined Projected Away Goals,Combined Total Projected Goals,Home Win Probability,Away Win Probability,Draw Probability
0,Brest vs. Reims,2.09,3.41,4.36,1.69,4.7,4.1,-0.19,0.38,-0.06,1.831469,1.405653,3.237122,47.78%,29.31%,22.91%
1,Nice vs. Le Havre,1.8,4.34,4.66,1.53,6.0,4.2,-0.15,0.38,-0.1,1.952962,1.188616,3.141578,55.50%,23.05%,21.45%
2,Clermont Foot vs. Lyon,8.05,1.35,7.36,5.2,1.59,4.5,-0.35,0.18,-0.39,1.37866,3.257187,4.635846,12.42%,73.99%,13.59%
3,Marseille vs. Lorient,1.37,8.92,6.32,1.4,7.0,5.2,0.02,-0.22,-0.18,2.568328,0.938089,3.506417,72.96%,11.21%,15.83%
4,Montpellier vs. Monaco,4.33,1.67,5.91,3.7,1.83,4.2,-0.15,0.1,-0.29,1.8353,2.849835,4.685135,23.08%,60.00%,16.92%
5,Nantes vs. Lille,7.68,1.54,4.51,3.9,1.95,3.55,-0.49,0.26,-0.21,0.643073,1.752785,2.395857,13.02%,64.79%,22.19%
6,Paris Saint Germain vs. Toulouse,2.64,2.8,3.8,1.41,6.0,5.6,-0.47,1.15,0.47,1.381862,1.356703,2.738565,37.90%,35.77%,26.33%
7,Rennes vs. Lens,2.77,2.37,4.6,2.5,2.75,3.45,-0.1,0.16,-0.25,1.769154,1.910001,3.679156,36.13%,42.14%,21.73%
8,Strasbourg vs. Metz,3.05,2.29,4.24,2.2,3.15,3.65,-0.28,0.37,-0.14,1.427605,1.685041,3.112645,32.81%,43.60%,23.59%


In [31]:
#MAKING THE ODDS AMERICAN FORMAT

def decimal_to_american(decimal_odds):
    """
    Convert decimal odds to American odds, adding a '+' sign for positive odds.
    
    Parameters:
    - decimal_odds: float representing the decimal odds.
    
    Returns:
    - str: The American odds as a string, with '+' for positive odds.
    """
    if decimal_odds >= 2.00:
        american_odds = int((decimal_odds - 1) * 100)
    else:
        american_odds = int(-100 / (decimal_odds - 1))
    
    # Add '+' for positive odds
    if american_odds > 0:
        return f'+{american_odds}'
    else:
        return str(american_odds)

# Example usage on a DataFrame column
implied_odds_with_goals['Implied Home Odds American'] = implied_odds_with_goals['Implied Home Odds'].apply(decimal_to_american)
implied_odds_with_goals['Implied Away Odds American'] = implied_odds_with_goals['Implied Away Odds'].apply(decimal_to_american)
implied_odds_with_goals['Implied Draw Odds American'] = implied_odds_with_goals['Implied Draw Odds'].apply(decimal_to_american)
implied_odds_with_goals['Draw Odds American'] = implied_odds_with_goals['Draw Odds'].apply(decimal_to_american)
implied_odds_with_goals['Home Odds American'] = implied_odds_with_goals['Home Odds'].apply(decimal_to_american)
implied_odds_with_goals['Away Odds American'] = implied_odds_with_goals['Away Odds'].apply(decimal_to_american)
# Optionally, display the DataFrame to verify the conversion
implied_odds_with_goals[['Implied Home Odds', 'Implied Home Odds American', 
        'Implied Away Odds', 'Implied Away Odds American', 
        'Implied Draw Odds', 'Implied Draw Odds American']]


Unnamed: 0,Implied Home Odds,Implied Home Odds American,Implied Away Odds,Implied Away Odds American,Implied Draw Odds,Implied Draw Odds American
0,2.09,108,3.41,241,4.36,336
1,1.8,-125,4.34,334,4.66,366
2,8.05,705,1.35,-285,7.36,636
3,1.37,-270,8.92,792,6.32,532
4,4.33,333,1.67,-149,5.91,491
5,7.68,668,1.54,-185,4.51,351
6,2.64,164,2.8,179,3.8,280
7,2.77,177,2.37,137,4.6,359
8,3.05,204,2.29,129,4.24,324


In [32]:
#SETTING UP AND CLEANING NEW DF

# Create a new DataFrame by copying the original DataFrame
new_df = implied_odds_with_goals.copy()

# Replace decimal odds columns with their American odds counterparts
new_df['Implied Home Odds'] = new_df['Implied Home Odds American']
new_df['Implied Away Odds'] = new_df['Implied Away Odds American']
new_df['Implied Draw Odds'] = new_df['Implied Draw Odds American']
new_df['Draw Odds'] = new_df['Draw Odds American']
new_df['Home Odds'] = new_df['Home Odds American']
new_df['Away Odds'] = new_df['Away Odds American']

# Drop the American odds columns as they've been swapped in, if desired
new_df.drop(columns=['Implied Home Odds American', 'Implied Away Odds American', 
                     'Implied Draw Odds American', 'Draw Odds American', 
                     'Home Odds American', 'Away Odds American'], inplace=True)

# Optionally, rename columns back to original if preferred, for clarity
# This step is optional and may be desired for presentation or further processing
column_renames = {
    'Implied Home Odds': 'Implied Home Odds',
    'Implied Away Odds': 'Implied Away Odds',
    'Implied Draw Odds': 'Implied Draw Odds',
    'Home Odds': 'Home Odds',
    'Away Odds': 'Away Odds',
    'Draw Odds': 'Draw Odds',
}
new_df.rename(columns=column_renames, inplace=True)

# Define the new column order
new_column_order = [
    'Game Matchup',
    'Combined Projected Home Goals',
    'Combined Projected Away Goals',
    'Combined Total Projected Goals',
    'Home Win Probability',
    'Away Win Probability',
    'Draw Probability',
    'Implied Home Odds',
    'Implied Away Odds',
    'Implied Draw Odds',
    'Home Odds',
    'Away Odds',
    'Draw Odds',
    'Home Win Edge',
    'Away Win Edge',
    'Draw Edge'
]

# Reorder the DataFrame according to the new column order
new_df = new_df[new_column_order]

# Round the "Combined Total Projected Goals" column to two decimal places
new_df['Combined Total Projected Goals'] = new_df['Combined Total Projected Goals'].round(2)
new_df['Combined Projected Home Goals'] = new_df['Combined Projected Home Goals'].round(2)
new_df['Combined Projected Away Goals'] = new_df['Combined Projected Away Goals'].round(2)
new_df['League'] = 'Ligue Une'

# Display the DataFrame to verify the changes
print(new_df)


# Now new_df contains the American odds in place of the decimal odds,
# with all other columns unchanged.


                       Game Matchup  Combined Projected Home Goals  \
0                   Brest vs. Reims                           1.83   
1                 Nice vs. Le Havre                           1.95   
2            Clermont Foot vs. Lyon                           1.38   
3             Marseille vs. Lorient                           2.57   
4            Montpellier vs. Monaco                           1.84   
5                  Nantes vs. Lille                           0.64   
6  Paris Saint Germain vs. Toulouse                           1.38   
7                   Rennes vs. Lens                           1.77   
8               Strasbourg vs. Metz                           1.43   

   Combined Projected Away Goals  Combined Total Projected Goals  \
0                           1.41                            3.24   
1                           1.19                            3.14   
2                           3.26                            4.64   
3                          

In [33]:
# Save the DataFrame to a CSV file
new_df.to_csv('LIGUE_UNE_odds_5-11.csv', index=False)
