### EPL 24/25 Model Input set-up ###

In [442]:
import pandas as pd

# Load the new odds data
odds_data = pd.read_csv('epl_average_odds.csv')

# Display the first few rows of the data
odds_data


Unnamed: 0,match,outcome,odds
0,Arsenal vs Wolverhampton Wanderers,Arsenal,1.172222
1,Arsenal vs Wolverhampton Wanderers,Draw,7.716667
2,Arsenal vs Wolverhampton Wanderers,Wolverhampton Wanderers,15.055556
3,Brentford vs Crystal Palace,Brentford,2.425556
4,Brentford vs Crystal Palace,Crystal Palace,2.914444
5,Brentford vs Crystal Palace,Draw,3.4
6,Chelsea vs Manchester City,Chelsea,3.944444
7,Chelsea vs Manchester City,Draw,3.922222
8,Chelsea vs Manchester City,Manchester City,1.857778
9,Everton vs Brighton and Hove Albion,Brighton and Hove Albion,2.6575


In [443]:
odds_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   match    30 non-null     object 
 1   outcome  30 non-null     object 
 2   odds     30 non-null     float64
dtypes: float64(1), object(2)
memory usage: 848.0+ bytes


In [444]:
import pandas as pd

# Step 1: Extract home and away teams
odds_data[['home_team', 'away_team']] = odds_data['match'].str.split(' vs ', expand=True)



In [445]:
odds_data

Unnamed: 0,match,outcome,odds,home_team,away_team
0,Arsenal vs Wolverhampton Wanderers,Arsenal,1.172222,Arsenal,Wolverhampton Wanderers
1,Arsenal vs Wolverhampton Wanderers,Draw,7.716667,Arsenal,Wolverhampton Wanderers
2,Arsenal vs Wolverhampton Wanderers,Wolverhampton Wanderers,15.055556,Arsenal,Wolverhampton Wanderers
3,Brentford vs Crystal Palace,Brentford,2.425556,Brentford,Crystal Palace
4,Brentford vs Crystal Palace,Crystal Palace,2.914444,Brentford,Crystal Palace
5,Brentford vs Crystal Palace,Draw,3.4,Brentford,Crystal Palace
6,Chelsea vs Manchester City,Chelsea,3.944444,Chelsea,Manchester City
7,Chelsea vs Manchester City,Draw,3.922222,Chelsea,Manchester City
8,Chelsea vs Manchester City,Manchester City,1.857778,Chelsea,Manchester City
9,Everton vs Brighton and Hove Albion,Brighton and Hove Albion,2.6575,Everton,Brighton and Hove Albion


In [446]:


# Initialize the new columns with NaN values
odds_data['Home_odds'] = None
odds_data['Draw_odds'] = None
odds_data['Away_odds'] = None

# Populate the new columns based on the conditions provided
for idx, row in odds_data.iterrows():
    if row['outcome'] == row['home_team']:
        odds_data.at[idx, 'Home_odds'] = row['odds']
    elif row['outcome'] == row['away_team']:
        odds_data.at[idx, 'Away_odds'] = row['odds']
    elif row['outcome'] == 'Draw':
        odds_data.at[idx, 'Draw_odds'] = row['odds']

# Now, we need to group by the match, home_team, and away_team to consolidate the rows
consolidated_odds_df = odds_data.groupby(['match', 'home_team', 'away_team']).agg({
    'Home_odds': 'max',
    'Draw_odds': 'max',
    'Away_odds': 'max'
}).reset_index()

# Display the consolidated DataFrame
consolidated_odds_df

Unnamed: 0,match,home_team,away_team,Home_odds,Draw_odds,Away_odds
0,Arsenal vs Wolverhampton Wanderers,Arsenal,Wolverhampton Wanderers,1.172222,7.716667,15.055556
1,Brentford vs Crystal Palace,Brentford,Crystal Palace,2.425556,3.4,2.914444
2,Chelsea vs Manchester City,Chelsea,Manchester City,3.944444,3.922222,1.857778
3,Everton vs Brighton and Hove Albion,Everton,Brighton and Hove Albion,2.66,3.36125,2.6575
4,Ipswich Town vs Liverpool,Ipswich Town,Liverpool,8.552222,5.897778,1.317778
5,Leicester City vs Tottenham Hotspur,Leicester City,Tottenham Hotspur,5.216667,4.388889,1.587778
6,Manchester United vs Fulham,Manchester United,Fulham,1.637143,4.121429,5.057143
7,Newcastle United vs Southampton,Newcastle United,Southampton,1.32875,5.6675,8.2825
8,Nottingham Forest vs Bournemouth,Nottingham Forest,Bournemouth,2.422222,3.446667,2.886667
9,West Ham United vs Aston Villa,West Ham United,Aston Villa,2.425556,3.664444,2.733333


In [447]:
# Load the Bet Tracking DataFrame
bet_tracking_df = pd.read_csv('/Users/lkimball/Desktop/Flatiron/CapstoneProject/Bet Tracking - Model Inputs.csv')  # Adjust the file path as needed



In [448]:
# Unique team names from both DataFrames
bet_tracking_teams = set(bet_tracking_df['home_team']).union(set(bet_tracking_df['away_team']))
odds_teams = set(consolidated_odds_df['home_team']).union(set(consolidated_odds_df['away_team']))

# Find teams in bet_tracking_df that are not in odds_df
mismatched_teams = bet_tracking_teams.difference(odds_teams)

# Find teams in odds_df that are not in bet_tracking_df
odds_mismatched_teams = odds_teams.difference(bet_tracking_teams)

# Display the mismatched teams
print("Teams in Bet Tracking DataFrame not in Odds DataFrame:", mismatched_teams)
print("Teams in Odds DataFrame not in Bet Tracking DataFrame:", odds_mismatched_teams)


Teams in Bet Tracking DataFrame not in Odds DataFrame: {"Nott'm Forest", 'Man City', 'West Ham', 'Ipswich', 'Brighton', 'Wolves', 'Newcastle', 'Man Utd', 'Spurs', 'Leicester'}
Teams in Odds DataFrame not in Bet Tracking DataFrame: {'Leicester City', 'Brighton and Hove Albion', 'Newcastle United', 'Manchester United', 'Ipswich Town', 'Nottingham Forest', 'West Ham United', 'Manchester City', 'Wolverhampton Wanderers', 'Tottenham Hotspur'}


In [449]:
# Mapping the team names in Bet Tracking DataFrame to match the Odds DataFrame
team_name_mapping = {
    'Brighton': 'Brighton and Hove Albion',
    'Spurs': 'Tottenham Hotspur',
    'Newcastle': 'Newcastle United',
    'Man City': 'Manchester City',
    "Nott'm Forest": 'Nottingham Forest',
    'Man Utd': 'Manchester United',
    'Leicester': 'Leicester City',
    'Ipswich': 'Ipswich Town',
    'Wolves': 'Wolverhampton Wanderers',
    'West Ham': 'West Ham United'
}

# Apply the mapping to the home_team and away_team columns in Bet Tracking DataFrame
bet_tracking_df['home_team'] = bet_tracking_df['home_team'].replace(team_name_mapping)
bet_tracking_df['away_team'] = bet_tracking_df['away_team'].replace(team_name_mapping)



# Now you can try merging the DataFrames again
merged_df = pd.merge(bet_tracking_df, consolidated_odds_df, on=['home_team', 'away_team'], how='left')

# Display the merged DataFrame to check the results
merged_df.head()


Unnamed: 0,season,week,home_team,away_team,home_team_elo,away_team_elo,home_starters,away_starters,home_team_strength,away_team_strength,...,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds,match,Home_odds,Draw_odds,Away_odds
0,2425,1,Manchester United,Fulham,,,,,,,...,0.0,0.0,0.0,,,,Manchester United vs Fulham,1.637143,4.121429,5.057143
1,2425,1,Ipswich Town,Liverpool,,,,,,,...,0.0,0.0,0.0,,,,Ipswich Town vs Liverpool,8.552222,5.897778,1.317778
2,2425,1,Arsenal,Wolverhampton Wanderers,,,,,,,...,0.0,0.0,0.0,,,,Arsenal vs Wolverhampton Wanderers,1.172222,7.716667,15.055556
3,2425,1,Everton,Brighton and Hove Albion,,,,,,,...,0.0,0.0,0.0,,,,Everton vs Brighton and Hove Albion,2.66,3.36125,2.6575
4,2425,1,Newcastle United,Southampton,,,,,,,...,0.0,0.0,0.0,,,,Newcastle United vs Southampton,1.32875,5.6675,8.2825


In [450]:
# Delete the original Pinnacle odds columns
merged_df = merged_df.drop(columns=[
    'Pinnacle Closing Home Win Odds', 
    'Pinnacle Closing Draw Odds', 
    'Pinnacle Closing Away Win Odds',
    'match'
])

# Rename the new odds columns
merged_df = merged_df.rename(columns={
    'Home_odds': 'Pinnacle Closing Home Win Odds',
    'Draw_odds': 'Pinnacle Closing Draw Odds',
    'Away_odds': 'Pinnacle Closing Away Win Odds'
})

merged_df

Unnamed: 0,season,week,home_team,away_team,home_team_elo,away_team_elo,home_starters,away_starters,home_team_strength,away_team_strength,...,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,2425,1,Manchester United,Fulham,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.637143,4.121429,5.057143
1,2425,1,Ipswich Town,Liverpool,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.552222,5.897778,1.317778
2,2425,1,Arsenal,Wolverhampton Wanderers,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.172222,7.716667,15.055556
3,2425,1,Everton,Brighton and Hove Albion,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66,3.36125,2.6575
4,2425,1,Newcastle United,Southampton,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.32875,5.6675,8.2825
5,2425,1,Nottingham Forest,Bournemouth,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.422222,3.446667,2.886667
6,2425,1,West Ham United,Aston Villa,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.664444,2.733333
7,2425,1,Brentford,Crystal Palace,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.4,2.914444
8,2425,1,Chelsea,Manchester City,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.944444,3.922222,1.857778
9,2425,1,Leicester City,Tottenham Hotspur,,,,,,,...,,,,,,,,5.216667,4.388889,1.587778


## Scraping Lineups ##

In [451]:
import re
import logging
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import Dict

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(name)s] [%(levelname)s] - %(message)s', )
logger = logging.getLogger(__name__)

def create_lineup_dictionary(raw_data) -> Dict[int, str]:
    '''
    Pull the lineup data and put it into an unformatted dictionary.
    :param raw_data: Extracts of HTML from fantasyfootballscout that contain the lineup information.
    :return: A dictionary of the format {0: [player1, player2... player11], ... 20: [...] }
    '''
    logger.info('---- Creating lineup dictionary (unformatted) ----')
    lineup_dictionary = {}
    key = 0

    for team in raw_data:
        lineup_dictionary[key] = team.text.split('  ')[1:]
        logger.info(f'Lineup {key}: {lineup_dictionary[key]}')
        assert len(lineup_dictionary[key]) == 11
        key += 1

    logger.info(f'Final unformatted lineups: {lineup_dictionary}')
    assert len(lineup_dictionary) == 20
    logger.info('---- Created lineup dictionary (unformatted) successfully ---- \n')
    return lineup_dictionary

def create_team_mapper(raw_data) -> Dict[int, str]:
    '''
    Create a dictionary which maps the unformatted lineup data to their respective team names.
    :param raw_data: Extracts of HTML from fantasyfootballscout that contain the team names.
    :return: A dictionary of the format {0: team1, 1: team2, ... 19: team20 }
    '''
    logger.info('---- Creating mapping function ----')
    team_map = {}
    for team in range(2, 22):
        team_map[team-2] = raw_data[team].text.split('Next')[0].strip()
        logger.info(f'{team_map[team-2]} mapped to {team-2}')

    logger.info(f'Final mapping function: {team_map}')
    assert len(team_map) == 20
    logger.info('---- Created mapping function successfully ---- \n')
    return team_map

def create_final_dictionary(lineups_data: Dict[int, str], mapping_dictionary: Dict[int, str]) -> pd.DataFrame:
    '''
    This function uses the mapping dictionary to assign the team names to each predicted lineup
    and store the data in a DataFrame.
    :param lineups_data: A dictionary with keys that match mapping_dictionary to obtain the respective team names.
    :param mapping_dictionary: A dictionary with keys that match the lineups_data to map team names to these values.
    :return: A DataFrame with columns 'team_name' and 'starting_lineup'.
    '''
    logger.info('---- Creating final formatted lineup dictionary ----')
    
    # Create a list of tuples (team_name, starting_lineup)
    formatted_data = [(mapping_dictionary[key], lineups_data[key]) for key in range(0, 20)]
    
    # Convert the list to a DataFrame
    df = pd.DataFrame(formatted_data, columns=['team_name', 'starting_lineup'])
    
    logger.info('---- Created final DataFrame successfully ---- \n')
    return df

def pull_fantasyfootballscout_lineups(url='https://www.fantasyfootballscout.co.uk/team-news/') -> pd.DataFrame:
    '''
    This function scrapes the latest predicted lineups from fantasyfootballscout.
    :param url: The url for where the predicted lineups are located.
    :return: A DataFrame with columns 'team_name' and 'starting_lineup'.
    '''
    # Scrape the raw HTML and pass it into BeautifulSoup
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")

    # Raw data related to the teams predicted lineup / formation
    lineups_raw = soup.find_all("div", {"class": re.compile('formation.*')})
    # Raw data which includes team names
    team_names_raw = soup.find_all('header')

    # Parse all of the raw data
    lineup_dictionary = create_lineup_dictionary(raw_data=lineups_raw)
    team_names_mapper = create_team_mapper(raw_data=team_names_raw)

    # Match the team lineups to their respective team names and create a DataFrame
    final_df = create_final_dictionary(lineups_data=lineup_dictionary,
                                       mapping_dictionary=team_names_mapper)

    return final_df

if __name__ == '__main__':
    logger.info('---- Pulling lineup predictions from Fantasy Football Scout ---- \n')
    lineups_df = pull_fantasyfootballscout_lineups()
    logger.info('---- Lineup predictions successfully pulled from Fantasy Football Scout ---- \n')
    print(lineups_df)  # Print the final DataFrame to the console


                   team_name  \
0                    Arsenal   
1                Aston Villa   
2                Bournemouth   
3                  Brentford   
4   Brighton and Hove Albion   
5                    Chelsea   
6             Crystal Palace   
7                    Everton   
8                     Fulham   
9               Ipswich Town   
10            Leicester City   
11                 Liverpool   
12           Manchester City   
13         Manchester United   
14          Newcastle United   
15         Nottingham Forest   
16               Southampton   
17         Tottenham Hotspur   
18           West Ham United   
19   Wolverhampton Wanderers   

                                      starting_lineup  
0   [Raya Martin, White, Saliba, Gabriel, Zinchenk...  
1   [Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...  
2   [Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...  
3   [Flekken, Roerslev, Collins, Pinnock, Ajer, Je...  
4   [Steele, Veltman, van Hecke, Dunk, Barco, W

In [452]:
lineups_df

Unnamed: 0,team_name,starting_lineup
0,Arsenal,"[Raya Martin, White, Saliba, Gabriel, Zinchenk..."
1,Aston Villa,"[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B..."
2,Bournemouth,"[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ..."
3,Brentford,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je..."
4,Brighton and Hove Albion,"[Steele, Veltman, van Hecke, Dunk, Barco, Wief..."
5,Chelsea,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F..."
6,Crystal Palace,"[Henderson, Richards, Andersen, Riad, Muñoz, H..."
7,Everton,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk..."
8,Fulham,"[Leno, Castagne, Diop, Bassey, Robinson, Lukic..."
9,Ipswich Town,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson..."


In [453]:
# Assuming merged_df and lineups_df are already defined from the previous steps

# Create dictionaries for quick lookup of lineups by team name
lineup_dict = lineups_df.set_index('team_name')['starting_lineup'].to_dict()

# Populate home_starters and away_starters based on the team names
merged_df['home_starters'] = merged_df['home_team'].map(lineup_dict)
merged_df['away_starters'] = merged_df['away_team'].map(lineup_dict)



In [454]:
merged_df

Unnamed: 0,season,week,home_team,away_team,home_team_elo,away_team_elo,home_starters,away_starters,home_team_strength,away_team_strength,...,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,2425,1,Manchester United,Fulham,,,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.637143,4.121429,5.057143
1,2425,1,Ipswich Town,Liverpool,,,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.552222,5.897778,1.317778
2,2425,1,Arsenal,Wolverhampton Wanderers,,,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.172222,7.716667,15.055556
3,2425,1,Everton,Brighton and Hove Albion,,,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66,3.36125,2.6575
4,2425,1,Newcastle United,Southampton,,,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.32875,5.6675,8.2825
5,2425,1,Nottingham Forest,Bournemouth,,,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.422222,3.446667,2.886667
6,2425,1,West Ham United,Aston Villa,,,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.664444,2.733333
7,2425,1,Brentford,Crystal Palace,,,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.4,2.914444
8,2425,1,Chelsea,Manchester City,,,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.944444,3.922222,1.857778
9,2425,1,Leicester City,Tottenham Hotspur,,,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",,,...,,,,,,,,5.216667,4.388889,1.587778


In [455]:
import pandas as pd

# Load the MBAPPE ratings for the 23/24 season
mbappe_2324 = pd.read_csv('/Users/lkimball/Desktop/Flatiron/CapstoneProject/MBAPPE_Ratings/ratings_23-24.csv')



# Step 1: Extract the primary team name from mbappe_2324
mbappe_2324['Primary_Team'] = mbappe_2324['Team'].apply(lambda x: x.split(',')[0].strip().lower())

# Step 2: Normalize team names in merged_df and ensure consistency
merged_df['home_team'] = merged_df['home_team'].str.lower().str.strip()
merged_df['away_team'] = merged_df['away_team'].str.lower().str.strip()

# Check for mismatches again after primary team extraction
teams_in_merged = set(merged_df['home_team']).union(set(merged_df['away_team']))
teams_in_mbappe = set(mbappe_2324['Primary_Team'])

mismatched_teams_in_merged = teams_in_merged - teams_in_mbappe
mismatched_teams_in_mbappe = teams_in_mbappe - teams_in_merged

print("Teams in merged_df not in mbappe_2324:", mismatched_teams_in_merged)
print("Teams in mbappe_2324 not in merged_df:", mismatched_teams_in_mbappe)



Teams in merged_df not in mbappe_2324: {'west ham united', 'brighton and hove albion', 'southampton', 'ipswich town', 'leicester city', 'wolverhampton wanderers'}
Teams in mbappe_2324 not in merged_df: {'westerlo', 'real betis', 'borussia mönchengladbach', 'hoffenheim', 'valencia', 'go ahead eagles', 'mallorca', 'anderlecht', 'rb leipzig', 'heidenheim', 'lille', 'nantes', 'celtic', 'hellas verona', 'braga', 'brighton', 'cagliari', 'getafe', 'olympiacos', 'fc copenhagen', 'rwd molenbeek', 'casa pia', 'famalicão', 'celta vigo', 'oh leuven', 'luton town', 'almere city', 'eintracht frankfurt', 'athletic bilbao', 'aris limassol fc', 'metz', 'tsc bačka top', 'fortuna sittard', 'almería', 'farense', 'vitória', 'köln', 'paris saint-germain', 'antwerp', 'monza', 'union saint-gilloise', 'fiorentina', 'darmstadt 98', 'strasbourg', 'salernitana', 'vitesse', 'granada', 'barcelona', 'rb salzburg', 'utrecht', 'mechelen', 'las palmas', 'girona', 'volendam', 'nec nijmegen', 'rkc waalwijk', 'cádiz', 'ko

In [456]:
# Step 3: Map any remaining inconsistent team names
team_name_mapping = {
    'leicester city': 'leicester',  # Leicester City -> Leicester
    'wolverhampton wanderers': 'wolves',  # Wolverhampton Wanderers -> Wolves
    'west ham united': 'west ham',  # West Ham United -> West Ham
    'ipswich town': 'ipswich',  # Ipswich Town -> Ipswich
    'brighton and hove albion': 'brighton',  # Brighton and Hove Albion -> Brighton
    'southampton': 'southampton',  # Southampton -> Southampton (no change)
    'spurs': 'tottenham hotspur',  # Spurs -> Tottenham Hotspur
    'newcastle': 'newcastle united',  # Newcastle -> Newcastle United
    'man city': 'manchester city',  # Man City -> Manchester City
    'man utd': 'manchester united',  # Man Utd -> Manchester United
    "nott'm forest": 'nottingham forest'  # Nott'm Forest -> Nottingham Forest
}


# Apply the mapping to the merged_df
merged_df['home_team'] = merged_df['home_team'].replace(team_name_mapping)
merged_df['away_team'] = merged_df['away_team'].replace(team_name_mapping)

# Re-check for mismatches
teams_in_merged = set(merged_df['home_team']).union(set(merged_df['away_team']))
teams_in_mbappe = set(mbappe_2324['Primary_Team'])

mismatched_teams_in_merged = teams_in_merged - teams_in_mbappe
mismatched_teams_in_mbappe = teams_in_mbappe - teams_in_merged

print("After mapping - Teams in merged_df not in mbappe_2324:", mismatched_teams_in_merged)
print("After mapping - Teams in mbappe_2324 not in merged_df:", mismatched_teams_in_mbappe)


After mapping - Teams in merged_df not in mbappe_2324: {'leicester', 'southampton', 'ipswich'}
After mapping - Teams in mbappe_2324 not in merged_df: {'westerlo', 'real betis', 'borussia mönchengladbach', 'hoffenheim', 'valencia', 'go ahead eagles', 'mallorca', 'anderlecht', 'rb leipzig', 'heidenheim', 'lille', 'nantes', 'celtic', 'hellas verona', 'braga', 'cagliari', 'getafe', 'olympiacos', 'fc copenhagen', 'rwd molenbeek', 'casa pia', 'famalicão', 'celta vigo', 'oh leuven', 'luton town', 'almere city', 'eintracht frankfurt', 'athletic bilbao', 'aris limassol fc', 'metz', 'tsc bačka top', 'fortuna sittard', 'almería', 'farense', 'vitória', 'köln', 'paris saint-germain', 'antwerp', 'monza', 'union saint-gilloise', 'fiorentina', 'darmstadt 98', 'strasbourg', 'salernitana', 'vitesse', 'granada', 'barcelona', 'rb salzburg', 'utrecht', 'mechelen', 'las palmas', 'girona', 'volendam', 'nec nijmegen', 'rkc waalwijk', 'cádiz', 'kortrijk', 'real madrid', 'twente', 'rks raków', 'lazio', 'stade b

In [457]:
print(mbappe_2324.columns)


Index(['Player', 'MBAPPE', 'O-MBAPPE', 'D-MBAPPE', 'eLPA', 'Born', 'Age',
       'Competition', 'Team', 'Nation', 'Primary Position',
       'Secondary Positions', 'SCA%', 'PRG%', 'TO%', 'FIN%', 'INT%', 'TKL%',
       'SP%', 'AER%', 'USG%', 'DENY%', 'Primary_Team'],
      dtype='object')


In [458]:
def calculate_team_strength(players, team_name, mbappe_df):
    if players is None or team_name is None:
        return None

    # Convert player names to lowercase and strip spaces for matching
    players = [player.lower().strip() for player in players]
    team_name = team_name.lower().strip()

    # Filter the MBAPPE DataFrame by matching both the team name and player last names
    team_ratings = mbappe_df[mbappe_df['Primary_Team'].str.lower() == team_name]

    # Extract the last names from the 'Player' column in the mbappe_df
    team_ratings['Player_last_name'] = team_ratings['Player'].str.split().str[-1].str.lower()

    # Match the player last names with the starters
    matched_players = team_ratings[team_ratings['Player_last_name'].isin(players)]

    # Return the sum of MBAPPE values for the matched players
    return matched_players['MBAPPE'].sum() if not matched_players.empty else None

# Calculate MBAPPE values for home and away teams
merged_df['home_team_strength'] = merged_df.apply(
    lambda row: calculate_team_strength(row['home_starters'], row['home_team'], mbappe_2324), axis=1
)

merged_df['away_team_strength'] = merged_df.apply(
    lambda row: calculate_team_strength(row['away_starters'], row['away_team'], mbappe_2324), axis=1
)

# Display the result to verify
print(merged_df[['home_team', 'home_team_strength', 'away_team', 'away_team_strength']].head())


           home_team  home_team_strength    away_team  away_team_strength
0  manchester united               0.282       fulham               0.330
1            ipswich                 NaN    liverpool               0.755
2            arsenal               0.954       wolves               0.020
3            everton               0.302     brighton               0.329
4   newcastle united               0.432  southampton                 NaN


In [459]:
merged_df

Unnamed: 0,season,week,home_team,away_team,home_team_elo,away_team_elo,home_starters,away_starters,home_team_strength,away_team_strength,...,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,2425,1,manchester united,fulham,,,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",0.282,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.637143,4.121429,5.057143
1,2425,1,ipswich,liverpool,,,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",,0.755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.552222,5.897778,1.317778
2,2425,1,arsenal,wolves,,,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",0.954,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.172222,7.716667,15.055556
3,2425,1,everton,brighton,,,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",0.302,0.329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66,3.36125,2.6575
4,2425,1,newcastle united,southampton,,,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",0.432,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.32875,5.6675,8.2825
5,2425,1,nottingham forest,bournemouth,,,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",0.188,0.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.422222,3.446667,2.886667
6,2425,1,west ham,aston villa,,,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",0.176,0.565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.664444,2.733333
7,2425,1,brentford,crystal palace,,,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",0.319,0.089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.4,2.914444
8,2425,1,chelsea,manchester city,,,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",0.543,0.772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.944444,3.922222,1.857778
9,2425,1,leicester,tottenham hotspur,,,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",,0.528,...,,,,,,,,5.216667,4.388889,1.587778


In [460]:
# Fill NaN values with 0 in the team strength columns
merged_df['home_team_strength'].fillna(0, inplace=True)
merged_df['away_team_strength'].fillna(0, inplace=True)

merged_df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['home_team_strength'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['away_team_strength'].fillna(0, inplace=True)


Unnamed: 0,season,week,home_team,away_team,home_team_elo,away_team_elo,home_starters,away_starters,home_team_strength,away_team_strength,...,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,2425,1,manchester united,fulham,,,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",0.282,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.637143,4.121429,5.057143
1,2425,1,ipswich,liverpool,,,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",0.0,0.755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.552222,5.897778,1.317778
2,2425,1,arsenal,wolves,,,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",0.954,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.172222,7.716667,15.055556
3,2425,1,everton,brighton,,,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",0.302,0.329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66,3.36125,2.6575
4,2425,1,newcastle united,southampton,,,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",0.432,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.32875,5.6675,8.2825
5,2425,1,nottingham forest,bournemouth,,,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",0.188,0.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.422222,3.446667,2.886667
6,2425,1,west ham,aston villa,,,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",0.176,0.565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.664444,2.733333
7,2425,1,brentford,crystal palace,,,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",0.319,0.089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.4,2.914444
8,2425,1,chelsea,manchester city,,,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",0.543,0.772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.944444,3.922222,1.857778
9,2425,1,leicester,tottenham hotspur,,,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",0.0,0.528,...,,,,,,,,5.216667,4.388889,1.587778


In [461]:
import soccerdata as sd

#instantiating the class
elo = sd.ClubElo()
print(elo.__doc__)

Provides pd.DataFrames from CSV API at http://api.clubelo.com.

    Data will be downloaded as necessary and cached locally in
    ``~/soccerdata/data/ClubElo``.

    Since the source does not provide league names, this class will not filter
    by league. League names will be inserted from the other sources where
    available. Leagues that are only covered by clubelo.com will have NaN
    values.

    Parameters
    ----------
    proxy : 'tor' or dict or list(dict) or callable, optional
        Use a proxy to hide your IP address. Valid options are:
            - "tor": Uses the Tor network. Tor should be running in
              the background on port 9050.
            - dict: A dictionary with the proxy to use. The dict should be
              a mapping of supported protocols to proxy addresses. For example::

                  {
                      'http': 'http://10.10.1.10:3128',
                      'https': 'http://10.10.1.10:1080',
                  }

            - list(

In [462]:
from datetime import datetime

# Get today's date-time in the format required by the method
today = datetime.now().strftime('%Y-%m-%d')

# Fetch the Elo ratings as of today's date-time
current_elo = elo.read_by_date(today)

# Display the first few rows to inspect the data
print(current_elo.head())


             rank country  level          elo       from         to  \
team                                                                  
Man City      1.0     ENG      1  2050.554688 2024-05-20 2024-08-18   
Real Madrid   2.0     ESP      1  1997.407959 2024-06-02 2024-08-18   
Inter         3.0     ITA      1  1964.768311 2024-05-30 2024-08-17   
Arsenal       4.0     ENG      1  1946.879517 2024-05-23 2024-08-17   
Leverkusen    5.0     GER      1  1925.011475 2024-06-02 2024-08-23   

                         league  
team                             
Man City     ENG-Premier League  
Real Madrid         ESP-La Liga  
Inter               ITA-Serie A  
Arsenal      ENG-Premier League  
Leverkusen       GER-Bundesliga  


In [463]:
# Step 1: Filter the elo_df for Premier League teams
premier_league_elo = current_elo[current_elo['league'] == 'ENG-Premier League']

# Step 2: Extract team names from both DataFrames
teams_in_merged_df = set(merged_df['home_team'].unique()).union(set(merged_df['away_team'].unique()))
teams_in_elo_df = set(premier_league_elo.index)

# Step 3: Identify mismatches between the two sets of team names
teams_not_in_elo = teams_in_merged_df - teams_in_elo_df
teams_not_in_merged = teams_in_elo_df - teams_in_merged_df

print("Teams in merged_df not in Elo DataFrame:", teams_not_in_elo)
print("Teams in Elo DataFrame not in merged_df:", teams_not_in_merged)

# After this, we can proceed to create the necessary mappings for any mismatches.


Teams in merged_df not in Elo DataFrame: {'ipswich', 'crystal palace', 'manchester city', 'manchester united', 'chelsea', 'brighton', 'bournemouth', 'wolves', 'tottenham hotspur', 'nottingham forest', 'everton', 'fulham', 'southampton', 'liverpool', 'brentford', 'arsenal', 'newcastle united', 'west ham', 'leicester', 'aston villa'}
Teams in Elo DataFrame not in merged_df: {'Everton', 'Southampton', 'Arsenal', 'Forest', 'Brighton', 'Crystal Palace', 'Man United', 'West Ham', 'Wolves', 'Ipswich', 'Chelsea', 'Liverpool', 'Man City', 'Bournemouth', 'Newcastle', 'Brentford', 'Aston Villa', 'Fulham', 'Tottenham', 'Leicester'}


In [464]:
team_mapping = {
    'tottenham hotspur': 'Tottenham',
    'manchester united': 'Man United',
    'fulham': 'Fulham',
    'aston villa': 'Aston Villa',
    'chelsea': 'Chelsea',
    'everton': 'Everton',
    'brighton': 'Brighton',
    'nottingham forest': 'Forest',
    'west ham': 'West Ham',
    'bournemouth': 'Bournemouth',
    'manchester city': 'Man City',
    'leicester': 'Leicester',
    'ipswich': 'Ipswich',
    'arsenal': 'Arsenal',
    'brentford': 'Brentford',
    'wolves': 'Wolves',
    'crystal palace': 'Crystal Palace',
    'liverpool': 'Liverpool',
    'newcastle united': 'Newcastle',
    'southampton': 'Southampton'
}

# Apply the mapping to the home_team and away_team columns
merged_df['home_team'] = merged_df['home_team'].replace(team_mapping)
merged_df['away_team'] = merged_df['away_team'].replace(team_mapping)

# Now, you can proceed to merge the Elo ratings with the merged_df based on the mapped team names


In [465]:
merged_df

Unnamed: 0,season,week,home_team,away_team,home_team_elo,away_team_elo,home_starters,away_starters,home_team_strength,away_team_strength,...,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,2425,1,Man United,Fulham,,,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",0.282,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.637143,4.121429,5.057143
1,2425,1,Ipswich,Liverpool,,,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",0.0,0.755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.552222,5.897778,1.317778
2,2425,1,Arsenal,Wolves,,,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",0.954,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.172222,7.716667,15.055556
3,2425,1,Everton,Brighton,,,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",0.302,0.329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66,3.36125,2.6575
4,2425,1,Newcastle,Southampton,,,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",0.432,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.32875,5.6675,8.2825
5,2425,1,Forest,Bournemouth,,,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",0.188,0.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.422222,3.446667,2.886667
6,2425,1,West Ham,Aston Villa,,,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",0.176,0.565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.664444,2.733333
7,2425,1,Brentford,Crystal Palace,,,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",0.319,0.089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.425556,3.4,2.914444
8,2425,1,Chelsea,Man City,,,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",0.543,0.772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.944444,3.922222,1.857778
9,2425,1,Leicester,Tottenham,,,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",0.0,0.528,...,,,,,,,,5.216667,4.388889,1.587778


In [466]:
# Drop the existing ELO columns if they exist
if 'home_team_elo' in merged_df.columns:
    merged_df.drop(columns=['home_team_elo'], inplace=True)
if 'away_team_elo' in merged_df.columns:
    merged_df.drop(columns=['away_team_elo'], inplace=True)

# Now, merge the Elo ratings with the merged_df
merged_df = merged_df.merge(premier_league_elo[['elo']], left_on='home_team', right_index=True, how='left')
merged_df.rename(columns={'elo': 'home_team_elo'}, inplace=True)

merged_df = merged_df.merge(premier_league_elo[['elo']], left_on='away_team', right_index=True, how='left')
merged_df.rename(columns={'elo': 'away_team_elo'}, inplace=True)

# Final check to ensure the merge was successful
print(merged_df[['home_team', 'home_team_elo', 'away_team', 'away_team_elo']].head())


    home_team  home_team_elo    away_team  away_team_elo
0  Man United    1779.007568       Fulham    1716.248901
1     Ipswich    1568.301392    Liverpool    1900.663574
2     Arsenal    1946.879517       Wolves    1677.831177
3     Everton    1706.826660     Brighton    1713.139038
4   Newcastle    1801.777100  Southampton    1599.588501


In [467]:
merged_df.columns

Index(['season', 'week', 'home_team', 'away_team', 'home_starters',
       'away_starters', 'home_team_strength', 'away_team_strength',
       'home_xG_to_date', 'away_xG_to_date', 'home_xG_against_to_date',
       'away_xG_against_to_date', 'home_goals_scored', 'away_goals_scored',
       'home_goals_scored_to_date', 'away_goals_scored_to_date',
       'home_goals_conceded_to_date', 'away_goals_conceded_to_date',
       'home_points_to_date', 'away_points_to_date', 'home_form', 'away_form',
       'Pinnacle Closing Home Win Odds', 'Pinnacle Closing Draw Odds',
       'Pinnacle Closing Away Win Odds', 'home_team_elo', 'away_team_elo'],
      dtype='object')

In [468]:
# Step 1: Impute NaN values with 0s
merged_df.fillna(0, inplace=True)

# Step 2: Drop unnecessary columns
# Specify the columns you want to drop
columns_to_drop = ['season','week','home_team','away_team','home_starters','away_starters']  # Replace with actual column names

# Drop the specified columns and store the result in a new DataFrame
model_df = merged_df.drop(columns=columns_to_drop, axis=1)

# Proceed to model prediction (from the previous step)


In [469]:
model_df

Unnamed: 0,home_team_strength,away_team_strength,home_xG_to_date,away_xG_to_date,home_xG_against_to_date,away_xG_against_to_date,home_goals_scored,away_goals_scored,home_goals_scored_to_date,away_goals_scored_to_date,...,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds,home_team_elo,away_team_elo
0,0.282,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.637143,4.121429,5.057143,1779.007568,1716.248901
1,0.0,0.755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.552222,5.897778,1.317778,1568.301392,1900.663574
2,0.954,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.172222,7.716667,15.055556,1946.879517,1677.831177
3,0.302,0.329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.66,3.36125,2.6575,1706.82666,1713.139038
4,0.432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.32875,5.6675,8.2825,1801.7771,1599.588501
5,0.188,0.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.422222,3.446667,2.886667,1647.685791,1691.102661
6,0.176,0.565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.425556,3.664444,2.733333,1726.226807,1770.394653
7,0.319,0.089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.425556,3.4,2.914444,1711.060181,1759.694092
8,0.543,0.772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.944444,3.922222,1.857778,1810.097046,2050.554688
9,0.0,0.528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.216667,4.388889,1.587778,1643.654053,1790.5354


## Inputing data into the model ##

In [470]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

# Load the model
with open('final_ensemble_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Manually specify the columns used during training
original_columns = [
    'home_team_elo', 'away_team_elo', 'home_team_strength',
    'away_team_strength', 'home_xG_to_date', 'away_xG_to_date',
    'home_xG_against_to_date', 'away_xG_against_to_date',
    'home_goals_scored_to_date', 'away_goals_scored_to_date',
    'home_goals_conceded_to_date', 'away_goals_conceded_to_date',
    'home_points_to_date', 'away_points_to_date', 'home_form', 'away_form',
    'Pinnacle Closing Home Win Odds', 'Pinnacle Closing Draw Odds', 'Pinnacle Closing Away Win Odds'
]

# Align the columns of the new data to the original training columns
aligned_new_data = model_df.reindex(columns=original_columns, fill_value=0)

# Standardize the features
scaler = StandardScaler()
aligned_new_data_scaled = scaler.fit_transform(aligned_new_data)

# Predict using the loaded model
predicted_probabilities = loaded_model.predict_proba(aligned_new_data_scaled)

# Convert to DataFrame for easier interpretation
predicted_probabilities_df = pd.DataFrame(predicted_probabilities, columns=['Home Win', 'Draw', 'Away Win'])

print(predicted_probabilities_df.head())


   Home Win      Draw  Away Win
0  0.600091  0.225419  0.174490
1  0.062090  0.193727  0.744183
2  0.810854  0.145313  0.043833
3  0.384044  0.272188  0.343767
4  0.672653  0.190404  0.136943


In [471]:
# Add the predicted probabilities to the merged_df
merged_df['Home Win Probability'] = predicted_probabilities_df['Home Win']
merged_df['Draw Probability'] = predicted_probabilities_df['Draw']
merged_df['Away Win Probability'] = predicted_probabilities_df['Away Win']

# Check the updated DataFrame
merged_df


Unnamed: 0,season,week,home_team,away_team,home_starters,away_starters,home_team_strength,away_team_strength,home_xG_to_date,away_xG_to_date,...,home_form,away_form,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds,home_team_elo,away_team_elo,Home Win Probability,Draw Probability,Away Win Probability
0,2425,1,Man United,Fulham,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",0.282,0.33,0.0,0.0,...,0.0,0.0,1.637143,4.121429,5.057143,1779.007568,1716.248901,0.600091,0.225419,0.17449
1,2425,1,Ipswich,Liverpool,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",0.0,0.755,0.0,0.0,...,0.0,0.0,8.552222,5.897778,1.317778,1568.301392,1900.663574,0.06209,0.193727,0.744183
2,2425,1,Arsenal,Wolves,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",0.954,0.02,0.0,0.0,...,0.0,0.0,1.172222,7.716667,15.055556,1946.879517,1677.831177,0.810854,0.145313,0.043833
3,2425,1,Everton,Brighton,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",0.302,0.329,0.0,0.0,...,0.0,0.0,2.66,3.36125,2.6575,1706.82666,1713.139038,0.384044,0.272188,0.343767
4,2425,1,Newcastle,Southampton,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",0.432,0.0,0.0,0.0,...,0.0,0.0,1.32875,5.6675,8.2825,1801.7771,1599.588501,0.672653,0.190404,0.136943
5,2425,1,Forest,Bournemouth,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",0.188,0.37,0.0,0.0,...,0.0,0.0,2.422222,3.446667,2.886667,1647.685791,1691.102661,0.37284,0.277797,0.349363
6,2425,1,West Ham,Aston Villa,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",0.176,0.565,0.0,0.0,...,0.0,0.0,2.425556,3.664444,2.733333,1726.226807,1770.394653,0.381206,0.276716,0.342078
7,2425,1,Brentford,Crystal Palace,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",0.319,0.089,0.0,0.0,...,0.0,0.0,2.425556,3.4,2.914444,1711.060181,1759.694092,0.425576,0.26914,0.305285
8,2425,1,Chelsea,Man City,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",0.543,0.772,0.0,0.0,...,0.0,0.0,3.944444,3.922222,1.857778,1810.097046,2050.554688,0.325115,0.237821,0.437063
9,2425,1,Leicester,Tottenham,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",0.0,0.528,0.0,0.0,...,0.0,0.0,5.216667,4.388889,1.587778,1643.654053,1790.5354,0.14362,0.212247,0.644132


## Comparing Odds ##

In [472]:
import pandas as pd

# Load the odds data from the CSV file
df_odds = pd.read_csv('epl_reshaped_odds.csv')

In [473]:
df_odds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   home_team                   10 non-null     object 
 1   away_team                   10 non-null     object 
 2   BetMGM_away_win_odds        8 non-null      float64
 3   BetMGM_draw_odds            8 non-null      float64
 4   BetMGM_home_win_odds        8 non-null      float64
 5   BetOnline.ag_away_win_odds  9 non-null      float64
 6   BetOnline.ag_draw_odds      9 non-null      float64
 7   BetOnline.ag_home_win_odds  9 non-null      float64
 8   BetRivers_away_win_odds     10 non-null     float64
 9   BetRivers_draw_odds         10 non-null     float64
 10  BetRivers_home_win_odds     10 non-null     float64
 11  BetUS_away_win_odds         10 non-null     float64
 12  BetUS_draw_odds             10 non-null     float64
 13  BetUS_home_win_odds         10 non-nul

In [474]:
df_odds

Unnamed: 0,home_team,away_team,BetMGM_away_win_odds,BetMGM_draw_odds,BetMGM_home_win_odds,BetOnline.ag_away_win_odds,BetOnline.ag_draw_odds,BetOnline.ag_home_win_odds,BetRivers_away_win_odds,BetRivers_draw_odds,...,Caesars_home_win_odds,DraftKings_away_win_odds,DraftKings_draw_odds,DraftKings_home_win_odds,FanDuel_away_win_odds,FanDuel_draw_odds,FanDuel_home_win_odds,LowVig.ag_away_win_odds,LowVig.ag_draw_odds,LowVig.ag_home_win_odds
0,Arsenal,Wolverhampton Wanderers,13.5,7.25,1.2,15.0,8.4,1.18,17.0,7.5,...,1.17,14.0,8.0,1.18,15.0,7.5,1.15,15.0,8.4,1.18
1,Brentford,Crystal Palace,2.85,3.4,2.4,2.9,3.45,2.5,2.88,3.4,...,2.35,2.95,3.4,2.45,2.95,3.4,2.4,2.9,3.45,2.5
2,Chelsea,Manchester City,1.83,4.0,3.9,1.88,3.9,4.05,1.85,3.8,...,4.0,1.87,3.9,3.95,1.87,3.9,3.9,1.88,3.9,4.05
3,Everton,Brighton and Hove Albion,,,,2.7,3.39,2.7,2.6,3.45,...,2.7,2.7,3.35,2.65,2.65,3.3,2.65,2.7,3.39,2.7
4,Ipswich Town,Liverpool,1.33,5.75,8.0,1.33,5.85,8.8,1.3,6.25,...,9.0,1.32,6.0,8.0,1.31,5.9,8.5,1.33,5.85,8.8
5,Leicester City,Tottenham Hotspur,1.62,4.2,4.8,1.62,4.35,5.25,1.56,4.5,...,5.25,1.57,4.4,5.25,1.57,4.5,5.5,1.62,4.35,5.25
6,Manchester United,Fulham,4.8,4.2,1.62,,,,5.5,4.1,...,1.59,4.9,4.1,1.67,5.2,4.0,1.67,,,
7,Newcastle United,Southampton,,,,8.38,5.67,1.35,8.5,5.8,...,1.31,8.0,5.75,1.32,8.0,5.7,1.32,8.38,5.67,1.35
8,Nottingham Forest,Bournemouth,2.8,3.4,2.45,2.95,3.46,2.46,2.88,3.45,...,2.35,2.95,3.35,2.45,2.9,3.4,2.4,2.95,3.46,2.46
9,West Ham United,Aston Villa,2.7,3.6,2.45,2.8,3.7,2.45,2.75,3.7,...,2.4,2.75,3.65,2.45,2.75,3.6,2.4,2.8,3.7,2.45


In [475]:
# List the unique team names from both DataFrames
teams_in_merged_df = set(merged_df['home_team'].unique()).union(set(merged_df['away_team'].unique()))
teams_in_odds_df = set(df_odds['home_team'].unique()).union(set(df_odds['away_team'].unique()))

# Identify teams in merged_df that are not in odds_df
teams_not_in_odds_df = teams_in_merged_df - teams_in_odds_df
# Identify teams in odds_df that are not in merged_df
teams_not_in_merged_df = teams_in_odds_df - teams_in_merged_df

print("Teams in merged_df not in odds_df:", teams_not_in_odds_df)
print("Teams in odds_df not in merged_df:", teams_not_in_merged_df)


Teams in merged_df not in odds_df: {'Man United', 'Man City', 'Tottenham', 'West Ham', 'Forest', 'Brighton', 'Wolves', 'Newcastle', 'Ipswich', 'Leicester'}
Teams in odds_df not in merged_df: {'Leicester City', 'Brighton and Hove Albion', 'Newcastle United', 'Manchester United', 'Ipswich Town', 'Nottingham Forest', 'West Ham United', 'Manchester City', 'Wolverhampton Wanderers', 'Tottenham Hotspur'}


In [476]:
# Define the mapping dictionary
team_name_mapping = {
    'Brighton': 'Brighton and Hove Albion',
    'Newcastle': 'Newcastle United',
    'Tottenham': 'Tottenham Hotspur',
    'Man City': 'Manchester City',
    'Forest': 'Nottingham Forest',
    'Leicester': 'Leicester City',
    'Ipswich': 'Ipswich Town',
    'Wolves': 'Wolverhampton Wanderers',
    'Man United': 'Manchester United',
    'West Ham': 'West Ham United'
}

# Apply the mapping to the merged_df
merged_df['home_team'] = merged_df['home_team'].replace(team_name_mapping)
merged_df['away_team'] = merged_df['away_team'].replace(team_name_mapping)

# Now you can proceed with merging the DataFrames or calculating the expected values.


In [477]:
print(merged_df.columns)


Index(['season', 'week', 'home_team', 'away_team', 'home_starters',
       'away_starters', 'home_team_strength', 'away_team_strength',
       'home_xG_to_date', 'away_xG_to_date', 'home_xG_against_to_date',
       'away_xG_against_to_date', 'home_goals_scored', 'away_goals_scored',
       'home_goals_scored_to_date', 'away_goals_scored_to_date',
       'home_goals_conceded_to_date', 'away_goals_conceded_to_date',
       'home_points_to_date', 'away_points_to_date', 'home_form', 'away_form',
       'Pinnacle Closing Home Win Odds', 'Pinnacle Closing Draw Odds',
       'Pinnacle Closing Away Win Odds', 'home_team_elo', 'away_team_elo',
       'Home Win Probability', 'Draw Probability', 'Away Win Probability'],
      dtype='object')


In [478]:
# Assuming 'odds_df' is the DataFrame with the odds data and 'merged_df' contains your current data

# Merge the DataFrames on home_team and away_team
merged_df = pd.merge(merged_df, df_odds, on=['home_team', 'away_team'], how='left')

# Check if the merge was successful by inspecting the first few rows
print(merged_df.head())


   season  week          home_team                 away_team  \
0    2425     1  Manchester United                    Fulham   
1    2425     1       Ipswich Town                 Liverpool   
2    2425     1            Arsenal   Wolverhampton Wanderers   
3    2425     1            Everton  Brighton and Hove Albion   
4    2425     1   Newcastle United               Southampton   

                                       home_starters  \
0  [Onana, Dalot, Maguire, Evans, Martínez, Casem...   
1  [Muric, Tuanzebe, Woolfenden, Greaves, Johnson...   
2  [Raya Martin, White, Saliba, Gabriel, Zinchenk...   
3  [Pickford, Young, Tarkowski, O'Brien, Mykolenk...   
4  [Pope, Livramento, Schär, Burn, Lewis Hall, Lo...   

                                       away_starters  home_team_strength  \
0  [Leno, Castagne, Diop, Bassey, Robinson, Lukic...               0.282   
1  [Alisson, Alexander-Arnold, Quansah, van Dijk,...               0.000   
2  [José Sá, Doherty, Mosquera, Toti, Ait Nouri, .

In [479]:
merged_df

Unnamed: 0,season,week,home_team,away_team,home_starters,away_starters,home_team_strength,away_team_strength,home_xG_to_date,away_xG_to_date,...,Caesars_home_win_odds,DraftKings_away_win_odds,DraftKings_draw_odds,DraftKings_home_win_odds,FanDuel_away_win_odds,FanDuel_draw_odds,FanDuel_home_win_odds,LowVig.ag_away_win_odds,LowVig.ag_draw_odds,LowVig.ag_home_win_odds
0,2425,1,Manchester United,Fulham,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",0.282,0.33,0.0,0.0,...,1.59,4.9,4.1,1.67,5.2,4.0,1.67,,,
1,2425,1,Ipswich Town,Liverpool,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",0.0,0.755,0.0,0.0,...,9.0,1.32,6.0,8.0,1.31,5.9,8.5,1.33,5.85,8.8
2,2425,1,Arsenal,Wolverhampton Wanderers,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",0.954,0.02,0.0,0.0,...,1.17,14.0,8.0,1.18,15.0,7.5,1.15,15.0,8.4,1.18
3,2425,1,Everton,Brighton and Hove Albion,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",0.302,0.329,0.0,0.0,...,2.7,2.7,3.35,2.65,2.65,3.3,2.65,2.7,3.39,2.7
4,2425,1,Newcastle United,Southampton,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",0.432,0.0,0.0,0.0,...,1.31,8.0,5.75,1.32,8.0,5.7,1.32,8.38,5.67,1.35
5,2425,1,Nottingham Forest,Bournemouth,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",0.188,0.37,0.0,0.0,...,2.35,2.95,3.35,2.45,2.9,3.4,2.4,2.95,3.46,2.46
6,2425,1,West Ham United,Aston Villa,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",0.176,0.565,0.0,0.0,...,2.4,2.75,3.65,2.45,2.75,3.6,2.4,2.8,3.7,2.45
7,2425,1,Brentford,Crystal Palace,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",0.319,0.089,0.0,0.0,...,2.35,2.95,3.4,2.45,2.95,3.4,2.4,2.9,3.45,2.5
8,2425,1,Chelsea,Manchester City,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",0.543,0.772,0.0,0.0,...,4.0,1.87,3.9,3.95,1.87,3.9,3.9,1.88,3.9,4.05
9,2425,1,Leicester City,Tottenham Hotspur,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",0.0,0.528,0.0,0.0,...,5.25,1.57,4.4,5.25,1.57,4.5,5.5,1.62,4.35,5.25


In [480]:
import numpy as np

In [481]:
# Define the list of bookmakers
bookmakers = ['BetMGM', 'BetRivers', 'BetUS', 'Bovada', 'Caesars','FanDuel']

# Initialize an empty list to store the implied probabilities
implied_probabilities = []

# Loop through each bookmaker and calculate the implied probabilities
for bookmaker in bookmakers:
    for outcome in ['home_win', 'draw', 'away_win']:
        odds_column = f'{bookmaker}_{outcome}_odds'
        implied_prob_column = f'{bookmaker}_{outcome}_implied_prob'
        
        # Calculate the implied probability
        if odds_column in merged_df.columns:
            merged_df[implied_prob_column] = merged_df.apply(
                lambda row: 1 / row[odds_column] if not pd.isnull(row[odds_column]) and row[odds_column] > 0 else np.nan,
                axis=1
            )
            implied_probabilities.append(implied_prob_column)

# Now, compare your predicted probabilities with the implied probabilities
comparison_columns = [
    'home_team', 
    'away_team', 
    'Home Win Probability', 
    'Draw Probability', 
    'Away Win Probability'
] + [col for col in implied_probabilities if col in merged_df.columns]

# Display the comparison
print(merged_df[comparison_columns].head())


           home_team                 away_team  Home Win Probability  \
0  Manchester United                    Fulham              0.600091   
1       Ipswich Town                 Liverpool              0.062090   
2            Arsenal   Wolverhampton Wanderers              0.810854   
3            Everton  Brighton and Hove Albion              0.384044   
4   Newcastle United               Southampton              0.672653   

   Draw Probability  Away Win Probability  BetMGM_home_win_implied_prob  \
0          0.225419              0.174490                      0.617284   
1          0.193727              0.744183                      0.125000   
2          0.145313              0.043833                      0.833333   
3          0.272188              0.343767                           NaN   
4          0.190404              0.136943                           NaN   

   BetMGM_draw_implied_prob  BetMGM_away_win_implied_prob  \
0                  0.238095                      0.2083

In [482]:
merged_df[comparison_columns]

Unnamed: 0,home_team,away_team,Home Win Probability,Draw Probability,Away Win Probability,BetMGM_home_win_implied_prob,BetMGM_draw_implied_prob,BetMGM_away_win_implied_prob,BetRivers_home_win_implied_prob,BetRivers_draw_implied_prob,...,BetUS_away_win_implied_prob,Bovada_home_win_implied_prob,Bovada_draw_implied_prob,Bovada_away_win_implied_prob,Caesars_home_win_implied_prob,Caesars_draw_implied_prob,Caesars_away_win_implied_prob,FanDuel_home_win_implied_prob,FanDuel_draw_implied_prob,FanDuel_away_win_implied_prob
0,Manchester United,Fulham,0.600091,0.225419,0.17449,0.617284,0.238095,0.208333,0.625,0.243902,...,0.20202,0.598802,0.243902,0.208333,0.628931,0.238095,0.190476,0.598802,0.25,0.192308
1,Ipswich Town,Liverpool,0.06209,0.193727,0.744183,0.125,0.173913,0.75188,0.117647,0.16,...,0.757576,0.117647,0.166667,0.763359,0.111111,0.181818,0.763359,0.117647,0.169492,0.763359
2,Arsenal,Wolverhampton Wanderers,0.810854,0.145313,0.043833,0.833333,0.137931,0.074074,0.862069,0.133333,...,0.066667,0.847458,0.142857,0.071429,0.854701,0.142857,0.058824,0.869565,0.133333,0.066667
3,Everton,Brighton and Hove Albion,0.384044,0.272188,0.343767,,,,0.377358,0.289855,...,0.37594,0.384615,0.294118,0.377358,0.37037,0.30303,0.384615,0.377358,0.30303,0.377358
4,Newcastle United,Southampton,0.672653,0.190404,0.136943,,,,0.757576,0.172414,...,0.125,0.75188,0.173913,0.125,0.763359,0.190476,0.111111,0.757576,0.175439,0.125
5,Nottingham Forest,Bournemouth,0.37284,0.277797,0.349363,0.408163,0.294118,0.357143,0.411523,0.289855,...,0.350877,0.416667,0.285714,0.350877,0.425532,0.285714,0.350877,0.416667,0.294118,0.344828
6,West Ham United,Aston Villa,0.381206,0.276716,0.342078,0.408163,0.277778,0.37037,0.420168,0.27027,...,0.37037,0.408163,0.273973,0.37037,0.416667,0.27027,0.377358,0.416667,0.277778,0.363636
7,Brentford,Crystal Palace,0.425576,0.26914,0.305285,0.416667,0.294118,0.350877,0.411523,0.294118,...,0.338983,0.416667,0.298507,0.338983,0.425532,0.294118,0.344828,0.416667,0.294118,0.338983
8,Chelsea,Manchester City,0.325115,0.237821,0.437063,0.25641,0.25,0.546448,0.253165,0.263158,...,0.534759,0.25974,0.253165,0.534759,0.25,0.25,0.555556,0.25641,0.25641,0.534759
9,Leicester City,Tottenham Hotspur,0.14362,0.212247,0.644132,0.208333,0.238095,0.617284,0.190476,0.222222,...,0.636943,0.192308,0.222222,0.636943,0.190476,0.238095,0.628931,0.181818,0.222222,0.636943


In [483]:
import numpy as np

# Define the stake
stake = 10

# Initialize an empty list to store the expected values
expected_values = []

# Mapping for outcome probabilities
outcome_prob_map = {
    'home_win': 'Home Win Probability',
    'draw': 'Draw Probability',
    'away_win': 'Away Win Probability'
}

# Loop through each bookmaker and calculate the expected values
for bookmaker in bookmakers:
    for outcome in ['home_win', 'draw', 'away_win']:
        prob_column = outcome_prob_map[outcome]
        odds_column = f'{bookmaker}_{outcome}_odds'
        
        # Debugging: Check if the odds column exists
        if odds_column not in merged_df.columns:
            print(f"Missing column: {odds_column}")
            continue
        
        # Calculate expected value with a stake of 10
        merged_df[f'{bookmaker}_{outcome}_EV'] = merged_df.apply(
            lambda row: stake * ((row[prob_column] * row[odds_column]) - (1 - row[prob_column])) if not pd.isnull(row[odds_column]) else np.nan,
            axis=1
        )
        
        expected_values.append(f'{bookmaker}_{outcome}_EV')

# Debugging: Check the final list of expected value columns created
print(f"Expected Value Columns Created: {expected_values}")

# Display the final DataFrame with the expected values
try:
    print(merged_df[['home_team', 'away_team'] + expected_values].head())
except KeyError as e:
    print(f"KeyError: {e}")
    missing_columns = [col for col in expected_values if col not in merged_df.columns]
    print(f"Missing Expected Value Columns: {missing_columns}")


Expected Value Columns Created: ['BetMGM_home_win_EV', 'BetMGM_draw_EV', 'BetMGM_away_win_EV', 'BetRivers_home_win_EV', 'BetRivers_draw_EV', 'BetRivers_away_win_EV', 'BetUS_home_win_EV', 'BetUS_draw_EV', 'BetUS_away_win_EV', 'Bovada_home_win_EV', 'Bovada_draw_EV', 'Bovada_away_win_EV', 'Caesars_home_win_EV', 'Caesars_draw_EV', 'Caesars_away_win_EV', 'FanDuel_home_win_EV', 'FanDuel_draw_EV', 'FanDuel_away_win_EV']
           home_team                 away_team  BetMGM_home_win_EV  \
0  Manchester United                    Fulham            5.722397   
1       Ipswich Town                 Liverpool           -4.411875   
2            Arsenal   Wolverhampton Wanderers            7.838786   
3            Everton  Brighton and Hove Albion                 NaN   
4   Newcastle United               Southampton                 NaN   

   BetMGM_draw_EV  BetMGM_away_win_EV  BetRivers_home_win_EV  \
0        1.721784            0.120396               5.602379   
1        3.076574            7.339

In [484]:
merged_df

Unnamed: 0,season,week,home_team,away_team,home_starters,away_starters,home_team_strength,away_team_strength,home_xG_to_date,away_xG_to_date,...,BetUS_away_win_EV,Bovada_home_win_EV,Bovada_draw_EV,Bovada_away_win_EV,Caesars_home_win_EV,Caesars_draw_EV,Caesars_away_win_EV,FanDuel_home_win_EV,FanDuel_draw_EV,FanDuel_away_win_EV
0,2425,1,Manchester United,Fulham,"[Onana, Dalot, Maguire, Evans, Martínez, Casem...","[Leno, Castagne, Diop, Bassey, Robinson, Lukic...",0.282,0.33,0.0,0.0,...,0.38213,6.022443,1.496365,0.120396,5.54237,1.721784,0.905599,6.022443,1.270946,0.818354
1,2425,1,Ipswich Town,Liverpool,"[Muric, Tuanzebe, Woolfenden, Greaves, Johnson...","[Alisson, Alexander-Arnold, Quansah, van Dijk,...",0.0,0.755,0.0,0.0,...,7.265038,-4.101424,3.560892,7.19062,-3.790972,2.592257,7.19062,-4.101424,3.367165,7.19062
2,2425,1,Arsenal,Wolverhampton Wanderers,"[Raya Martin, White, Saliba, Gabriel, Zinchenk...","[José Sá, Doherty, Mosquera, Toti, Ait Nouri, ...",0.954,0.02,0.0,0.0,...,-2.986711,7.676615,1.625044,-3.425042,7.59553,1.625044,-2.11005,7.433359,2.351609,-2.986711
3,2425,1,Everton,Brighton and Hove Albion,"[Pickford, Young, Tarkowski, O'Brien, Mykolenk...","[Steele, Veltman, van Hecke, Dunk, Barco, Wief...",0.302,0.329,0.0,0.0,...,2.581887,3.825601,1.976278,2.54751,4.209645,1.70409,2.375626,4.017623,1.70409,2.54751
4,2425,1,Newcastle United,Southampton,"[Pope, Livramento, Schär, Burn, Lewis Hall, Lo...","[McCarthy, Harwood-Bellis, Stephens, Bednarek,...",0.432,0.0,0.0,0.0,...,2.324896,5.672807,2.852272,2.324896,5.538277,1.900252,3.694329,5.605542,2.75707,2.324896
5,2425,1,Nottingham Forest,Bournemouth,"[Sels, Williams, Milenkovic, Murillo, Aina, Sa...","[Neto, Smith, Zabarnyi, Senesi, Kerkez, Christ...",0.188,0.37,0.0,0.0,...,3.450487,2.676545,2.500872,3.450487,2.490125,2.500872,3.450487,2.676545,2.223075,3.625168
6,2425,1,West Ham United,Aston Villa,"[Areola, Wan-Bissaka, Todibo, Kilman, Emerson,...","[Martinez, Cash, Konsa Ngoyo, Torres, Digne, B...",0.176,0.565,0.0,0.0,...,2.656886,3.151611,2.867289,2.656886,2.961008,3.005647,2.485847,2.961008,2.728931,2.827925
7,2425,1,Brentford,Crystal Palace,"[Flekken, Roerslev, Collins, Pinnock, Ajer, Je...","[Henderson, Richards, Andersen, Riad, Muñoz, H...",0.319,0.089,0.0,0.0,...,2.058739,4.469571,1.707583,2.058739,4.256784,1.842153,1.906097,4.469571,1.842153,2.058739
8,2425,1,Chelsea,Manchester City,"[Sanchez, Gusto, Fofana, Colwill, Cucurella, F...","[Ederson, Lewis, Akanji, Rúben Dias, Gvardiol,...",0.543,0.772,0.0,0.0,...,2.543718,5.768085,1.772164,2.543718,6.255758,1.891075,2.237774,5.930643,1.653253,2.543718
9,2425,1,Leicester City,Tottenham Hotspur,"[Hermansen, Justin, Faes, Okoli, Kristiansen, ...","[Vicario, Porro, Romero, van de Ven, Destiny U...",0.0,0.528,0.0,0.0,...,6.554204,-1.095551,1.673607,6.554204,-1.023741,1.036865,6.683031,-0.66469,1.673607,6.554204


In [485]:
import numpy as np

# Define the stake
stake = 10

# Initialize an empty list to store the expected values
expected_values = []

# Mapping for outcome probabilities
outcome_prob_map = {
    'home_win': 'Home Win Probability',
    'draw': 'Draw Probability',
    'away_win': 'Away Win Probability'
}

# Debugging: Print the columns in merged_df
print("Available columns in merged_df:")
print(merged_df.columns.tolist())

# Loop through each bookmaker and calculate the expected values
for bookmaker in bookmakers:
    print(f"Processing bookmaker: {bookmaker}")
    for outcome in ['home_win', 'draw', 'away_win']:
        prob_column = outcome_prob_map[outcome]
        odds_column = f'{bookmaker}_{outcome}_odds'
        
        # Debugging: Check if the odds column exists
        if odds_column not in merged_df.columns:
            print(f"Missing column: {odds_column}")
            continue
        
        print(f"Calculating EV for {bookmaker} - {outcome}")
        
        # Calculate expected value with a stake of 10
        merged_df[f'{bookmaker}_{outcome}_EV'] = merged_df.apply(
            lambda row: stake * ((row[prob_column] * row[odds_column]) - (1 - row[prob_column])) if not pd.isnull(row[odds_column]) else np.nan,
            axis=1
        )
        
        expected_values.append(f'{bookmaker}_{outcome}_EV')

# Debugging: Check the final list of expected value columns created
print(f"Expected Value Columns Created: {expected_values}")

# Display the final DataFrame with the expected values
try:
    print(merged_df[['home_team', 'away_team'] + expected_values].head())
except KeyError as e:
    print(f"KeyError: {e}")
    missing_columns = [col for col in expected_values if col not in merged_df.columns]
    print(f"Missing Expected Value Columns: {missing_columns}")


Available columns in merged_df:
['season', 'week', 'home_team', 'away_team', 'home_starters', 'away_starters', 'home_team_strength', 'away_team_strength', 'home_xG_to_date', 'away_xG_to_date', 'home_xG_against_to_date', 'away_xG_against_to_date', 'home_goals_scored', 'away_goals_scored', 'home_goals_scored_to_date', 'away_goals_scored_to_date', 'home_goals_conceded_to_date', 'away_goals_conceded_to_date', 'home_points_to_date', 'away_points_to_date', 'home_form', 'away_form', 'Pinnacle Closing Home Win Odds', 'Pinnacle Closing Draw Odds', 'Pinnacle Closing Away Win Odds', 'home_team_elo', 'away_team_elo', 'Home Win Probability', 'Draw Probability', 'Away Win Probability', 'BetMGM_away_win_odds', 'BetMGM_draw_odds', 'BetMGM_home_win_odds', 'BetOnline.ag_away_win_odds', 'BetOnline.ag_draw_odds', 'BetOnline.ag_home_win_odds', 'BetRivers_away_win_odds', 'BetRivers_draw_odds', 'BetRivers_home_win_odds', 'BetUS_away_win_odds', 'BetUS_draw_odds', 'BetUS_home_win_odds', 'Bovada_away_win_odds',

In [486]:
# Display the final DataFrame with the expected values
print(merged_df[['home_team', 'away_team'] + expected_values].head())

           home_team                 away_team  BetMGM_home_win_EV  \
0  Manchester United                    Fulham            5.722397   
1       Ipswich Town                 Liverpool           -4.411875   
2            Arsenal   Wolverhampton Wanderers            7.838786   
3            Everton  Brighton and Hove Albion                 NaN   
4   Newcastle United               Southampton                 NaN   

   BetMGM_draw_EV  BetMGM_away_win_EV  BetRivers_home_win_EV  \
0        1.721784            0.120396               5.602379   
1        3.076574            7.339457              -4.101424   
2        1.988326           -3.644207               7.514444   
3             NaN                 NaN               4.017623   
4             NaN                 NaN               5.605542   

   BetRivers_draw_EV  BetRivers_away_win_EV  BetUS_home_win_EV  BetUS_draw_EV  \
0           1.496365               1.341823           5.842415       1.609075   
1           4.045210            

In [487]:
for bookmaker in bookmakers:
    for outcome in ['home_win', 'draw', 'away_win']:
        prob_column = outcome_prob_map[outcome]
        odds_column = f'{bookmaker}_{outcome}_odds'
        ev_column = f'{bookmaker}_{outcome}_EV'
        
        # Calculate the Kelly stake
        if ev_column in merged_df.columns:
            merged_df[f'{bookmaker}_{outcome}_Kelly'] = merged_df.apply(
                lambda row: 0.5 * max(0, (row[prob_column] * (row[odds_column] - 1) - (1 - row[prob_column])) / (row[odds_column] - 1))
                            if row[ev_column] > 0 and not pd.isnull(row[odds_column]) else 0,
                axis=1
            )

import pandas as pd

# Define the stake
bankroll = 1000
kelly_factor = 0.5  # Half-Kelly criterion

# Initialize a list to store the final bets
final_bets = []

# Iterate over the rows of the DataFrame
for index, row in merged_df.iterrows():
    best_bets = {}

    # Find the best odds and corresponding bookmaker for each outcome
    for outcome in ['home_win', 'draw', 'away_win']:
        best_odds = 0
        best_bookmaker = None
        total_kelly_stake = 0

        # Find the bookmaker with the best odds for this outcome
        for bookmaker in bookmakers:
            odds_column = f'{bookmaker}_{outcome}_odds'
            kelly_column = f'{bookmaker}_{outcome}_Kelly'

            if odds_column in row.index and not pd.isnull(row[odds_column]):
                if row[odds_column] > best_odds:
                    best_odds = row[odds_column]
                    best_bookmaker = bookmaker
                    total_kelly_stake = row.get(kelly_column, 0)

        # Calculate the stake if there is a positive Kelly stake
        if best_bookmaker and total_kelly_stake > 0:
            stake = bankroll * total_kelly_stake * kelly_factor  # Apply 0.5 Kelly criterion

            best_bets[outcome] = {
                'odds': best_odds,
                'bookmaker': best_bookmaker,
                'kelly_stake': stake
            }

    # Add the bets to the final list
    for outcome, bet_info in best_bets.items():
        if bet_info['kelly_stake'] > 0:
            final_bets.append({
                'home_team': row['home_team'],
                'away_team': row['away_team'],
                'bookmaker': bet_info['bookmaker'],
                'outcome': outcome,
                'Stake': bet_info['kelly_stake']
            })

# Convert the final bets into a DataFrame
final_bets_df = pd.DataFrame(final_bets)

# Ensure total bets don't exceed the bankroll
if final_bets_df['Stake'].sum() > bankroll:
    scaling_factor = bankroll / final_bets_df['Stake'].sum()
    final_bets_df['Stake'] = final_bets_df['Stake'] * scaling_factor

# Display the final bets
print(final_bets_df)

# Calculate the total amount being bet with 0.5 Kelly and bankroll limit
total_stake = final_bets_df['Stake'].sum()
print(f"Total amount being bet with 0.5 Kelly and bankroll constraint: {total_stake:.2f}")


            home_team                 away_team  bookmaker   outcome  \
0   Manchester United                    Fulham     Bovada  home_win   
1        Ipswich Town                 Liverpool  BetRivers      draw   
2             Arsenal   Wolverhampton Wanderers      BetUS      draw   
3             Everton  Brighton and Hove Albion    Caesars  home_win   
4    Newcastle United               Southampton  BetRivers      draw   
5    Newcastle United               Southampton    Caesars  away_win   
6   Nottingham Forest               Bournemouth    FanDuel  away_win   
7     West Ham United               Aston Villa  BetRivers      draw   
8           Brentford            Crystal Palace  BetRivers  home_win   
9             Chelsea           Manchester City    Caesars  home_win   
10     Leicester City         Tottenham Hotspur     BetMGM  away_win   

        Stake  
0    0.803282  
1   10.037806  
2    7.453702  
3    5.429418  
4    5.434551  
5    7.265300  
6    1.730728  
7    2.

In [488]:
final_bets_df

Unnamed: 0,home_team,away_team,bookmaker,outcome,Stake
0,Manchester United,Fulham,Bovada,home_win,0.803282
1,Ipswich Town,Liverpool,BetRivers,draw,10.037806
2,Arsenal,Wolverhampton Wanderers,BetUS,draw,7.453702
3,Everton,Brighton and Hove Albion,Caesars,home_win,5.429418
4,Newcastle United,Southampton,BetRivers,draw,5.434551
5,Newcastle United,Southampton,Caesars,away_win,7.2653
6,Nottingham Forest,Bournemouth,FanDuel,away_win,1.730728
7,West Ham United,Aston Villa,BetRivers,draw,2.208219
8,Brentford,Crystal Palace,BetRivers,home_win,5.970066
9,Chelsea,Manchester City,Caesars,home_win,25.038386


In [490]:
# Initialize a new DataFrame to store the details for each bet
bet_details_df = pd.DataFrame()

# Loop through each row in the final_bets_df
for _, row in final_bets_df.iterrows():
    # Determine the corresponding columns for EV, implied probability, and model probability
    ev_column = f"{row['bookmaker']}_{row['outcome']}_EV"
    implied_prob_column = f"{row['bookmaker']}_{row['outcome']}_implied_prob"
    
    # Determine the appropriate probability column based on the outcome
    if row['outcome'] == 'home_win':
        model_prob_column = 'Home Win Probability'
    elif row['outcome'] == 'draw':
        model_prob_column = 'Draw Probability'
    elif row['outcome'] == 'away_win':
        model_prob_column = 'Away Win Probability'

    # Create a dictionary with the bet details
    bet_details = {
        'home_team': row['home_team'],
        'away_team': row['away_team'],
        'bookmaker': row['bookmaker'],
        'outcome': row['outcome'],
        'Expected Value': merged_df.loc[merged_df['home_team'] == row['home_team'], ev_column].values[0],
        'Implied Probability': merged_df.loc[merged_df['home_team'] == row['home_team'], implied_prob_column].values[0],
        'Model Probability': merged_df.loc[merged_df['home_team'] == row['home_team'], model_prob_column].values[0]
    }
    
    # Add the dictionary as a row to the DataFrame
    bet_details_df = pd.concat([bet_details_df, pd.DataFrame([bet_details])], ignore_index=True)

# Display the new DataFrame
bet_details_df


Unnamed: 0,home_team,away_team,bookmaker,outcome,Expected Value,Implied Probability,Model Probability
0,Manchester United,Fulham,Bovada,home_win,6.022443,0.598802,0.600091
1,Ipswich Town,Liverpool,BetRivers,draw,4.04521,0.16,0.193727
2,Arsenal,Wolverhampton Wanderers,BetUS,draw,3.659426,0.119048,0.145313
3,Everton,Brighton and Hove Albion,Caesars,home_win,4.209645,0.37037,0.384044
4,Newcastle United,Southampton,BetRivers,draw,2.947474,0.172414,0.190404
5,Newcastle United,Southampton,Caesars,away_win,3.694329,0.111111,0.136943
6,Nottingham Forest,Bournemouth,FanDuel,away_win,3.625168,0.344828,0.349363
7,West Ham United,Aston Villa,BetRivers,draw,3.005647,0.27027,0.276716
8,Brentford,Crystal Palace,BetRivers,home_win,4.597244,0.411523,0.425576
9,Chelsea,Manchester City,Caesars,home_win,6.255758,0.25,0.325115
