In [None]:
# TODO

# add an assertion that every team name found in the fixtures is present in team_names.xlsx

# 0. write a readme
# 1. add some lower rated teams to the regression
# 2. include international competitions results in elo calc
# 3. graph odds to win over each round
# 4. try to obtain elo from odds
# 5. get/scrape outrights odds [check the odds api]
# 6. scrape opta power rankings
# 7. regress Opta power rankings onto Elo and un-hardcode the resulting coefficients

# european leagues with seasons not ending around May: (Finland, Ireland), Norway, Sweden
# american leagues -||-: Brazil, Canada, Ecuador, USA, Uruguay
# asian leagues -||-: Japan, Korea et al.

In [None]:
import copy
from datetime import datetime
import json
import math
import os
from pathlib import Path
import random
from typing import Optional

from dotenv import load_dotenv
import matplotlib.pyplot as plt
import pandas as pd
import requests
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [None]:
load_dotenv()

# RapidAPI key used by the api-football download cells; read from the environment / .env file.
API_TOKEN = os.getenv('X-RapidAPI-Key')
if not API_TOKEN:
    # Warn instead of raising: cells that only read files already cached under data/ do not need the key.
    print("Warning: environment variable 'X-RapidAPI-Key' is not set; the api-football download cells will not be able to authenticate.")

HFA = 60  # home field advantage
K_FACTOR = 20 # Elo rating system K-factor

# Seed the stdlib RNG used by the season simulation so that Restart & Run All reproduces the same results.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [None]:
def download_elo_data(date=None) -> None:
    """Download one clubelo.com ratings snapshot and store it under data/elo.

    Note: apparently only European clubs are included.

    Args:
        date: Snapshot date as a 'YYYY-MM-DD' string; today's date is used when omitted.

    The file is written as data/elo/<date with dashes removed>.csv and keeps only the
    Rank, Club, Country, Level and Elo columns.
    """
    snapshot_date = date if date is not None else datetime.today().strftime('%Y-%m-%d')
    ratings = pd.read_csv(f"http://api.clubelo.com/{snapshot_date}")

    # The output directory is created only after the download succeeded.
    Path("data/elo").mkdir(parents=True, exist_ok=True)

    kept_columns = ['Rank', 'Club', 'Country', 'Level', 'Elo']
    ratings = ratings[kept_columns]
    file_stem = snapshot_date.replace('-', '')
    ratings.to_csv(f"data/elo/{file_stem}.csv", index=False)

# download_elo_data('2023-07-20') # ekstraklasa
# download_elo_data('2024-03-29') # allsvenskan
# download_elo_data('2024-03-30') # eliteserien

# No date given, so today's snapshot is fetched; this hits the network every time the cell runs.
download_elo_data()

In [None]:
def api_get_leagues() -> None:
    """Download the list of current leagues from api-football and cache it as JSON.

    The response is written to data/api/leagues.json.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        Exception: If the response reports more than one page, which this function
            does not handle.
    """
    url = "https://api-football-v1.p.rapidapi.com/v3/leagues"

    params = {"current": "true"}

    headers = {
        "X-RapidAPI-Key": API_TOKEN,
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }

    # requests has no default timeout; without one a stalled connection blocks the cell forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    print(response.status_code)
    # Fail on HTTP errors here instead of with an opaque KeyError/JSON error further down.
    response.raise_for_status()

    # Parse the body once and reuse it for both the check and the dump.
    payload = response.json()

    if payload['paging']['total'] != 1:
        raise Exception("Error: multiple pages of leagues")

    Path("data/api").mkdir(parents=True, exist_ok=True)
    with open("data/api/leagues.json", "w") as f:
        json.dump(payload, f)

# Refresh data/api/leagues.json from the API (network request on every run of this cell).
api_get_leagues()

In [None]:
def api_get_fixtures_for_league(league_id: str, season: str) -> None:
    """Download all fixtures of one league season from api-football and cache them as JSON.

    Args:
        league_id: api-football league id, sent as the `league` query parameter.
        season: Season identifier, sent as the `season` query parameter.

    The response is written to data/api/fixtures_<league_id>_<season>.json.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        Exception: If the API reports zero results, or more than one page of
            fixtures (pagination is not handled).
    """
    url = "https://api-football-v1.p.rapidapi.com/v3/fixtures"
    params = {"league": league_id, "season": season}

    headers = {
        "X-RapidAPI-Key": API_TOKEN,
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }

    # requests has no default timeout; without one a stalled connection blocks the cell forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    response.raise_for_status()

    # Parse the body once and reuse it for the checks and the dump.
    payload = response.json()

    if payload['results'] == 0:
        raise Exception("No results found.")

    if payload['paging']['total'] != 1:
        raise Exception("Error: multiple pages of fixtures")

    Path("data/api").mkdir(parents=True, exist_ok=True)
    with open(f"data/api/fixtures_{league_id}_{season}.json", "w") as f:
        json.dump(payload, f)

# Download the fixtures of league "71", season "2025" into data/api/fixtures_71_2025.json (network request).
api_get_fixtures_for_league("71", "2025")

In [None]:
def api_get_standings_for_league() -> None:
    """Placeholder for a standings download; currently does nothing."""
    return None

In [None]:
def find_league_id(country_code: str, league_name: str) -> Optional[int]:
    """Look up an api-football league id in the cached data/api/leagues.json dump.

    Only entries whose league type is 'League' are considered, so cups sharing a
    name with a league are ignored.

    Args:
        country_code: Value matched against entry['country']['code'].
        league_name: Value matched exactly against entry['league']['name'].

    Returns:
        The id of the first matching entry exactly as stored in the dump (an integer
        in the dumps this notebook asserts against), or None when nothing matches.
    """
    # Load the dump first so the file handle is closed before searching.
    with open("data/api/leagues.json", "r") as f:
        leagues = json.load(f)

    for league in leagues['response']:
        if (
            league['country']['code'] == country_code
            and league['league']['type'] == 'League'
            and league['league']['name'] == league_name
        ):
            return league['league']['id']

    # Explicit "not found" result (the same value the implicit fall-through used to produce).
    return None

# Sanity checks against the cached leagues dump: known (country code, league name)
# pairs must resolve to the league ids used elsewhere in this notebook.
assert find_league_id("BR", "Serie A") == 71
assert find_league_id("NO", "Eliteserien") == 103
assert find_league_id("PL", "Ekstraklasa") == 106
assert find_league_id("SE", "Allsvenskan") == 113

In [None]:
# Ad-hoc lookup; the bare expression is shown as the cell output (None if there is no match).
find_league_id("GB-ENG", "Premier League")

In [None]:
def find_latest_elo_file() -> str:
    """Return the path of the lexicographically last entry in data/elo.

    For file names of the form YYYYMMDD.csv this is the most recent snapshot.
    """
    snapshot_names = os.listdir("data/elo")
    snapshot_names.sort()
    newest = snapshot_names[-1]
    return 'data/elo/' + newest

In [None]:
def get_team_names_from_elo(elo_country_code: str) -> None:
    """Export the sorted club names of one country from the latest Elo snapshot.

    Only rows with Level == 1 for the given country are kept. The names are written
    as a single-column sheet to tmp_team_names_elo.xlsx in the working directory.

    Args:
        elo_country_code: Value matched against the snapshot's 'Country' column.
    """
    ratings = pd.read_csv(find_latest_elo_file())
    selected = (ratings['Country'] == elo_country_code) & (ratings['Level'] == 1)
    club_names = ratings[selected]['Club'].tolist()
    club_names.sort()
    pd.DataFrame(club_names).to_excel('tmp_team_names_elo.xlsx', index=False)

# get_team_names_from_elo('POL')

In [None]:
def get_team_names_from_api_dump(path: str) -> None:
    """Export the sorted, de-duplicated home-team names found in a fixtures dump.

    Only the home side of each fixture is read. The names are written as a
    single-column sheet to tmp_team_names_api.xlsx in the working directory.

    Args:
        path: Path to a fixtures JSON file with a top-level 'response' list.
    """
    with open(path, "r") as f:
        dump = json.load(f)

    home_names = {entry['teams']['home']['name'] for entry in dump['response']}
    names_frame = pd.DataFrame(sorted(home_names))
    names_frame.to_excel('tmp_team_names_api.xlsx', index=False)

# Write the team names of the cached league-71 / season-2025 fixtures to tmp_team_names_api.xlsx.
get_team_names_from_api_dump('data/api/fixtures_71_2025.json')

In [None]:
def get_api_teams_and_elo_from_clubelo(date: str, country_code: str) -> pd.DataFrame:
    """Load one country's Level-1 clubs and rounded Elo ratings from a cached snapshot.

    Club names are translated via team_names.xlsx (columns 'ELO_name' -> 'API_name').
    The translation is best effort: if the workbook cannot be read or any club has
    no entry, a warning is printed and the original snapshot names are kept.

    Args:
        date: Snapshot date; dashes are stripped to build data/elo/<date>.csv.
        country_code: Value matched against the snapshot's 'Country' column.

    Returns:
        DataFrame with columns ['Club', 'Elo'] and a fresh 0..n-1 index.
    """
    date = date.replace('-', '')
    elo_data = pd.read_csv(f"data/elo/{date}.csv")

    # .copy() gives an independent frame, so the column assignments below do not
    # operate on a slice of the full snapshot (avoids SettingWithCopyWarning).
    elo_data = elo_data[(elo_data['Country'] == country_code) & (elo_data['Level'] == 1)].copy()

    elo_data['Elo'] = elo_data['Elo'].apply(round)

    try:
        team_map_df = pd.read_excel('team_names.xlsx')
        team_map = dict(zip(team_map_df['ELO_name'], team_map_df['API_name']))

        # Direct indexing on purpose: a club missing from the map raises KeyError,
        # which is reported below and leaves all names untouched.
        elo_data['Club'] = elo_data['Club'].apply(lambda x: team_map[x])
    except Exception as e:
        print(f"Warning: could not map team names: {e}")

    return elo_data[['Club', 'Elo']].reset_index(drop=True)

# elo_df = get_api_teams_and_elo_from_clubelo('2023-09-20', 'POL')
# elo_df.head()

In [None]:
# Scratch load of the Opta CSV into `df`; no later top-level cell visible here reads this variable.
df = pd.read_csv('data/brasil_seriea_opta_20251030.csv')



In [None]:
def get_custom_api_teams_and_elo_from_opta(country_code: str, coef: float = 24.236167012174086, intercept: float = -368.30096053134594) -> pd.DataFrame:
    """Build an Elo-like rating table from Opta power ratings for one country.

    The Opta 'Rating' column is mapped linearly (Elo = Rating * coef + intercept),
    rounded to integers, and team names are translated to API names using
    team_names.xlsx (columns 'Country_code', 'API_name', 'Opta_name').

    Args:
        country_code: Country to load; only 'BR' is supported.
        coef: Slope of the linear Rating -> Elo map. The default is the value that was
            previously hard-coded here (presumably a regression fit — TODO confirm).
        intercept: Intercept of the same linear map; same provenance caveat.

    Returns:
        DataFrame with columns ['Club', 'Elo'] (Elo as int).

    Raises:
        Exception: If `country_code` is not supported.
    """
    if country_code == 'BR':
        power_df = pd.read_csv('data/brasil_seriea_opta_20251030.csv')
    else:
        raise Exception("Country not supported.")

    team_map_df = pd.read_excel('team_names.xlsx')
    team_map_df = team_map_df[team_map_df['Country_code'] == country_code]
    team_map_df = team_map_df[['API_name', 'Opta_name']]

    # The inner merge below silently drops Opta teams without a name mapping; surface
    # them here so a later lookup failure for a fixture team is easier to diagnose.
    unmapped = power_df.loc[~power_df['Team'].isin(team_map_df['Opta_name']), 'Team'].tolist()
    if unmapped:
        print(f"Warning: Opta teams without an API name mapping were dropped: {unmapped}")

    df = pd.merge(power_df, team_map_df, left_on='Team', right_on='Opta_name', how='inner')

    df['Elo'] = (df['Rating'] * coef + intercept).round().astype(int)

    return df[['API_name', 'Elo']].rename(columns={'API_name': 'Club'})

# Build the Opta-derived rating table for Brazil and display its first 20 rows.
elo_df = get_custom_api_teams_and_elo_from_opta('BR')
elo_df.head(20)

In [None]:
def build_historical_standings_table_after_n_rounds(league_id: str, season: str, country_code_elo: Optional[str], country_code_api: str, after_round: int, elo_date: Optional[str], modify_elo: bool) -> pd.DataFrame:
    """Replay finished fixtures up to a given round and return the resulting table.

    Starting ratings come from the clubelo snapshot when `country_code_elo` is given,
    otherwise from the Opta-derived table for `country_code_api`. Fixtures are read
    from data/api/fixtures_<league_id>_<season>.json.

    Args:
        league_id: League id used in the fixtures file name.
        season: Season used in the fixtures file name.
        country_code_elo: clubelo country code, or None to use the Opta table.
        country_code_api: Country code passed to the Opta loader.
        after_round: Last round (inclusive) whose finished matches are counted.
        elo_date: clubelo snapshot date; only used when `country_code_elo` is given.
        modify_elo: If True, update both teams' ratings after every counted match.

    Returns:
        A single DataFrame with columns ['Club', 'Elo', 'Points', 'Games played'],
        sorted by points (descending) with a 1-based index.

    Raises:
        KeyError: If a fixture names a team that is missing from the rating table.
    """
    if country_code_elo is not None:
        elo_df = get_api_teams_and_elo_from_clubelo(elo_date, country_code_elo)
    else:
        elo_df = get_custom_api_teams_and_elo_from_opta(country_code_api)

    with open(f"data/api/fixtures_{league_id}_{season}.json", "r") as f:
        fixtures = json.load(f)['response']

    elo_dict = {row['Club']: row['Elo'] for _, row in elo_df.iterrows()}
    points_dict = dict.fromkeys(elo_dict, 0)
    games_played_dict = dict.fromkeys(elo_dict, 0)

    for fixture in fixtures:
        # The round number is the last space-separated token of the round label.
        round_number = int(fixture['league']['round'].split(' ')[-1])
        if (round_number > after_round) or (fixture['fixture']['status']['long'] != 'Match Finished'):
            continue

        home_team = fixture['teams']['home']['name']
        away_team = fixture['teams']['away']['name']

        home_goals = fixture['goals']['home']
        away_goals = fixture['goals']['away']

        # home_score is the home side's result on the Elo scale (1 win, 1/2 draw, 0 loss).
        if home_goals > away_goals:
            points_dict[home_team] += 3
            home_score = 1
        elif home_goals < away_goals:
            points_dict[away_team] += 3
            home_score = 0
        else:
            points_dict[home_team] += 1
            points_dict[away_team] += 1
            home_score = 1 / 2

        if modify_elo:
            elo_difference = elo_dict[home_team] - elo_dict[away_team] + HFA
            # Expected home score from the logistic Elo curve, including home advantage.
            expected_home_score = 1 / (1 + math.pow(10, -elo_difference/400))
            elo_delta = (home_score - expected_home_score) * K_FACTOR

            # Zero-sum update: whatever the home team gains, the away team loses.
            elo_dict[home_team] += elo_delta
            elo_dict[away_team] -= elo_delta

        games_played_dict[home_team] += 1
        games_played_dict[away_team] += 1

    points_df = pd.DataFrame(points_dict.items(), columns=['Club', 'Points'])
    elo_df = pd.DataFrame(elo_dict.items(), columns=['Club', 'Elo'])
    games_played_df = pd.DataFrame(games_played_dict.items(), columns=['Club', 'Games played'])

    standings_df = pd.merge(elo_df, points_df, on='Club', how='inner')
    standings_df = pd.merge(standings_df, games_played_df, on='Club', how='inner')

    standings_df = standings_df.sort_values(by=['Points'], ascending=False).reset_index(drop=True)
    standings_df.index += 1

    return standings_df

# standings_df = build_historical_standings_table_after_n_rounds('106', '2023', 'POL', 'PL', 8, '2023-09-20', True)
# print(standings_df[['Club', 'Elo']].head())
# print(standings_df.head())

In [None]:
def simulate_season(league_id: str, season: str, after_round: int, standings_df: pd.DataFrame, reverse: bool, modify_elo_in_sim: bool) -> pd.DataFrame:
    """Randomly play out the remaining fixtures of a season once.

    A fixture is skipped only if it belongs to a round <= `after_round` AND is marked
    'Match Finished'; every other fixture in the file is simulated.

    Args:
        league_id: League id used in the fixtures file name.
        season: Season used in the fixtures file name.
        after_round: Last round whose finished matches are treated as already played.
        standings_df: Starting table with 'Club', 'Elo', 'Points', 'Games played'.
        reverse: Passed as `ascending` to the final sort (True puts the fewest points first).
        modify_elo_in_sim: If True, ratings are updated after every simulated match.

    Returns:
        DataFrame with columns ['Club', 'Points', 'Elo', 'Games played'] sorted by points.
    """
    with open(f"data/api/fixtures_{league_id}_{season}.json", "r") as f:
        fixtures = json.load(f)['response']

    # Per-club running state, seeded from the starting table in a single pass.
    ratings, points, played = {}, {}, {}
    for _, row in standings_df.iterrows():
        club = row['Club']
        ratings[club] = row['Elo']
        points[club] = row['Points']
        played[club] = row['Games played']

    outcomes = ['home_win', 'away_win', 'draw']

    for fixture in fixtures:
        round_number = int(fixture['league']['round'].split(' ')[-1])
        already_played = (round_number <= after_round) and (fixture['fixture']['status']['long'] == 'Match Finished')
        if already_played:
            continue

        home = fixture['teams']['home']['name']
        away = fixture['teams']['away']['name']

        elo_difference = ratings[home] - ratings[away] + HFA
        # Logistic Elo expectation for the home side (home advantage included).
        expected_home = 1 / (1 + math.pow(10, -elo_difference/400))

        # Draws get less likely as the rating gap grows; half of the draw mass is taken from the home win.
        draw_prob = max(0, 0.3 - abs(elo_difference) / 2000)
        home_win_prob = max(0, expected_home - draw_prob / 2)
        away_win_prob = max(0, 1 - home_win_prob - draw_prob)

        result = random.choices(outcomes, [home_win_prob, away_win_prob, draw_prob])[0]

        if result == 'home_win':
            points[home] += 3
            home_score = 1
        elif result == 'away_win':
            points[away] += 3
            home_score = 0
        else:
            # random.choices can only return one of the three outcomes, so this is the draw.
            points[home] += 1
            points[away] += 1
            home_score = 1 / 2

        if modify_elo_in_sim:
            elo_delta = (home_score - expected_home) * K_FACTOR
            ratings[home] += elo_delta
            ratings[away] -= elo_delta

        played[home] += 1
        played[away] += 1

    points_frame = pd.DataFrame(points.items(), columns=['Club', 'Points'])
    ratings_frame = pd.DataFrame(ratings.items(), columns=['Club', 'Elo'])
    played_frame = pd.DataFrame(played.items(), columns=['Club', 'Games played'])

    season_standings_df = (
        points_frame
        .merge(ratings_frame, on='Club', how='inner')
        .merge(played_frame, on='Club', how='inner')
    )

    return season_standings_df.sort_values(by=['Points'], ascending=reverse)

# elo_df = get_api_teams_and_elo_from_clubelo('2023-09-20', 'POL')
# standings_df = build_historical_standings_table_after_n_rounds('106', '2023', 'POL', 'PL', 8, '2023-09-20', True)
# season_df = simulate_season('106', '2023', 8, standings_df, False, True)
# season_df.head()

In [None]:
def run_multiple_sims(league_id: str, season: str, elo_country_code: Optional[str], api_country_code: str, after_round: int, elo_date: Optional[str], number_of_sims: int, number_of_winning_places: int, reverse: bool, modify_elo_in_sim: bool, modify_elo_retro: bool) -> pd.DataFrame:
    """Simulate the rest of the season many times and count finishes in the winning places.

    The starting table is built once and printed. A simulation is discarded when the
    last winning place and the first non-winning place have equal points, because the
    cut-off cannot be decided from points alone.

    Args:
        league_id, season: Identify the cached fixtures file.
        elo_country_code, api_country_code, elo_date: Select the rating source.
        after_round: Last round treated as already played.
        number_of_sims: Number of seasons to simulate.
        number_of_winning_places: How many leading rows of each simulated table count as a win.
        reverse: Forwarded to simulate_season (True ranks from the fewest points).
        modify_elo_in_sim: Update ratings during the simulation.
        modify_elo_retro: Update ratings while replaying the already played rounds.

    Returns:
        DataFrame with columns ['Club', 'Wins', '% winrate', 'Expected odds'], sorted by
        wins with a 1-based index. Clubs that never reached a winning place are absent.
    """
    standings_df = build_historical_standings_table_after_n_rounds(league_id, season, elo_country_code, api_country_code, after_round, elo_date, modify_elo_retro)
    print(standings_df)

    winners = {}
    number_of_successful_sims = 0

    for _ in tqdm(range(number_of_sims)):
        sim_table = simulate_season(league_id, season, after_round, standings_df, reverse, modify_elo_in_sim)

        last_in_points = sim_table.iloc[number_of_winning_places - 1]['Points']
        first_out_points = sim_table.iloc[number_of_winning_places]['Points']
        if last_in_points == first_out_points:
            # Tie across the cut-off line: skip this simulation.
            continue

        number_of_successful_sims += 1
        for place in range(number_of_winning_places):
            club = sim_table.iloc[place]['Club']
            winners[club] = winners.get(club, 0) + 1

    df = pd.DataFrame(list(winners.items()), columns=['Club', 'Wins'])
    df['% winrate'] = (df['Wins'] / number_of_successful_sims * 100).round()
    df['Expected odds'] = (number_of_successful_sims / df['Wins']).round(2)
    df = df.sort_values(by=['Wins'], ascending=False).reset_index(drop=True)
    df.index += 1
    print(f'{number_of_successful_sims} simulations')
    print(f'{number_of_winning_places} winning places')
    return df

In [None]:
# Select which league preset to simulate: 'NO', 'SE', 'BR' or 'PL'.
config = 'BR'

# Each preset sets the arguments of run_multiple_sims.
# reverse=True ranks simulated tables from the fewest points, so the "winning places"
# are the bottom of the table; reverse=False ranks from the most points.
if config == 'NO':
    league_id = '103'
    season = '2024'
    elo_country_code = 'NOR'
    api_country_code = 'NO'
    after_round = 9
    elo_date = '2024-03-30'
    number_of_sims = 1000
    number_of_winning_places = 1
    reverse = False
    modify_elo_in_sim = True
    modify_elo_retro = True

elif config == 'SE':
    league_id = '113'
    season = '2024'
    elo_country_code = 'SWE'
    api_country_code = 'SE'
    after_round = 10
    elo_date = '2024-03-29'
    number_of_sims = 1000
    number_of_winning_places = 1
    reverse = True
    modify_elo_in_sim = True
    modify_elo_retro = True

elif config == 'BR':
    league_id = '71'
    # 2025 is the season whose fixtures this notebook downloads (fixtures_71_2025.json)
    # and the one the Opta ratings file (dated 20251030) belongs to.
    season = '2025'
    elo_country_code = None  # no clubelo data is used for this preset; ratings come from the Opta table
    api_country_code = 'BR'
    after_round = 6
    elo_date = None
    number_of_sims = 10000
    number_of_winning_places = 4
    reverse = True
    modify_elo_in_sim = True
    modify_elo_retro = False

elif config == 'PL':
    league_id = '106'
    season = '2023'
    elo_country_code = 'POL'
    api_country_code = 'PL'
    after_round = 12
    elo_date = '2023-09-20'
    number_of_sims = 10000
    number_of_winning_places = 3
    reverse = True
    modify_elo_in_sim = True
    modify_elo_retro = True

else:
    # Fail here with a clear message instead of a NameError (or stale values from an
    # earlier run of this cell) further down.
    raise ValueError(f"Unknown config: {config!r}")

results = run_multiple_sims(league_id, season, elo_country_code, api_country_code, after_round, elo_date, number_of_sims, number_of_winning_places, reverse, modify_elo_in_sim, modify_elo_retro)
Path('data/results').mkdir(parents=True, exist_ok=True)
if reverse:
    results.to_excel(f'data/results/results_{api_country_code}_{number_of_sims}sims_bottom{number_of_winning_places}.xlsx', index=False)
else:
    results.to_excel(f'data/results/results_{api_country_code}_{number_of_sims}sims_top_{number_of_winning_places}.xlsx', index=False)
results