In [15]:
# TODO
# get elo from the start of the season
# update elo after each game with k = 40
# get/scrape outrights odds

# European leagues with seasons not ending around May: Finland, Ireland, Norway, Sweden
# American leagues with seasons not ending around May: Brazil, Canada, Ecuador, USA, Uruguay
# Asian leagues with seasons not ending around May: Japan, Korea et al.

In [52]:
import copy
from datetime import datetime
import json
import math
import os
from pathlib import Path
import random
from typing import Optional

from dotenv import load_dotenv
import pandas as pd
import requests
from tqdm import tqdm

In [45]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# RapidAPI key, sent as the "X-RapidAPI-Key" header by the api_* functions below.
# os.getenv returns None when the variable is not set.
API_TOKEN = os.getenv('X-RapidAPI-Key')
# Home-field advantage: Elo points added to (home Elo - away Elo) in simulate_season.
HFA = 50
# K-factor for the Elo update mentioned in the TODO cell; not referenced by the code visible in this notebook.
K_FACTOR = 40

In [79]:
def download_elo_data(date=None) -> None:
    """Fetch one day's ClubElo ranking and store a trimmed copy as CSV.

    Args:
        date: Day to request, formatted ``YYYY-MM-DD``. When omitted, today's
            date is used.

    The result is written to ``data/elo/<YYYYMMDD>.csv`` (directory created on
    demand) and keeps only the Rank, Club, Country, Level and Elo columns.
    """
    day = datetime.today().strftime('%Y-%m-%d') if date is None else date
    ranking = pd.read_csv(f"http://api.clubelo.com/{day}")

    Path("data/elo").mkdir(parents=True, exist_ok=True)

    kept_columns = ['Rank', 'Club', 'Country', 'Level', 'Elo']
    trimmed = ranking[kept_columns]
    # File names drop the dashes so they sort chronologically as plain strings.
    compact_day = day.replace('-', '')
    trimmed.to_csv(f"data/elo/{compact_day}.csv", index=False)

# download_elo_data('2023-07-20')

In [19]:
def api_get_leagues() -> None:
    """Download the league list from API-Football (RapidAPI) and cache it on disk.

    The request is sent with ``current=true`` and the raw JSON answer is stored
    in ``data/api/leagues.json`` (directory created on demand). The HTTP status
    code is printed as a quick visual check.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        Exception: if the answer is split across more than one page, because
            only the first page would be saved.
    """
    url = "https://api-football-v1.p.rapidapi.com/v3/leagues"

    params = {"current": "true"}

    headers = {
        "X-RapidAPI-Key": API_TOKEN,
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    print(response.status_code)
    # Surface HTTP failures (bad key, quota exceeded, ...) here instead of as a
    # confusing KeyError while reading the body below.
    response.raise_for_status()

    # Parse the body once and reuse it for both the check and the dump.
    payload = response.json()

    if payload['paging']['total'] != 1:
        raise Exception("Error: multiple pages of leagues")

    Path("data/api").mkdir(parents=True, exist_ok=True)
    with open("data/api/leagues.json", "w") as f:
        json.dump(payload, f)

# api_get_leagues()

In [80]:
def api_get_fixtures_for_league(league_id: str, season: str) -> None:
    """Download all fixtures of one league season from API-Football and cache them.

    Args:
        league_id: API-Football league id (sent as the ``league`` parameter).
        season: Season identifier (sent as the ``season`` parameter).

    The raw JSON answer is stored in
    ``data/api/fixtures_<league_id>_<season>.json`` (directory created on demand).

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        Exception: if the API reports zero results, or if the answer is split
            across more than one page (only the first page would be saved).
    """
    url = "https://api-football-v1.p.rapidapi.com/v3/fixtures"
    params = {"league": league_id, "season": season}

    headers = {
        "X-RapidAPI-Key": API_TOKEN,
        "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    # Surface HTTP failures here instead of as a KeyError on the body below.
    response.raise_for_status()

    # Parse the body once and reuse it for the checks and the dump.
    payload = response.json()

    if payload['results'] == 0:
        raise Exception("No results found.")

    if payload['paging']['total'] != 1:
        # The message previously said "leagues" (copy-paste from api_get_leagues).
        raise Exception("Error: multiple pages of fixtures")

    Path("data/api").mkdir(parents=True, exist_ok=True)
    with open(f"data/api/fixtures_{league_id}_{season}.json", "w") as f:
        json.dump(payload, f)

# api_get_fixtures_for_league("103", "2024")

In [21]:
def api_get_standings_for_league() -> None:
    """Placeholder, not implemented yet: currently does nothing and returns None.

    Judging by the name it is meant to fetch league standings from the API.
    """
    return None

In [22]:
def find_league_id(country_code: str, league_name: str) -> Optional[int]:
    """Look up a league's API id in the cached ``data/api/leagues.json`` dump.

    Only entries whose ``league.type`` is ``'League'`` are considered, so an
    entry of another type that shares the name is skipped.

    Args:
        country_code: Value compared against ``country.code`` of each entry.
        league_name: Value compared against ``league.name`` of each entry.

    Returns:
        The ``league.id`` of the first matching entry exactly as stored in the
        dump (an integer in the API dump, as the checks in this notebook
        confirm), or None when nothing matches.
    """
    with open("data/api/leagues.json", "r") as f:
        entries = json.load(f)['response']

    for entry in entries:
        league_info = entry['league']
        if (entry['country']['code'] == country_code
                and league_info['type'] == 'League'
                and league_info['name'] == league_name):
            return league_info['id']

    # Make the "not found" outcome explicit instead of falling off the end.
    return None

# Sanity checks for find_league_id. They read data/api/leagues.json, so that file
# must already exist; note that the ids compare equal to integers, not strings.
assert find_league_id("NO", "Eliteserien") == 103
assert find_league_id("PL", "Ekstraklasa") == 106
assert find_league_id("SE", "Allsvenskan") == 113

In [23]:
def find_latest_elo_file() -> str:
    """Return the path of the most recent Elo snapshot stored in ``data/elo``.

    Snapshots are written as ``<YYYYMMDD>.csv`` by download_elo_data, so the
    lexicographically greatest CSV name is the newest one. Entries that are not
    ``.csv`` files (editor leftovers, notes, ...) are ignored so they can never
    be mistaken for the latest snapshot.

    Returns:
        A path of the form ``data/elo/<file name>``.

    Raises:
        FileNotFoundError: if ``data/elo`` does not exist or holds no CSV file.
    """
    elo_files = sorted(
        name for name in os.listdir("data/elo") if name.endswith(".csv")
    )
    if not elo_files:
        raise FileNotFoundError("No Elo snapshot (*.csv) found in data/elo")
    return f'data/elo/{elo_files[-1]}'

In [24]:
def get_team_names_from_elo(elo_country_code: str) -> None:
    """Export one country's level-1 club names from the latest Elo snapshot.

    Args:
        elo_country_code: Value matched against the snapshot's ``Country`` column.

    The names are sorted and written, one per row, to
    ``tmp_team_names_elo.xlsx`` in the working directory.
    """
    snapshot = pd.read_csv(find_latest_elo_file())

    in_country = snapshot['Country'] == elo_country_code
    in_top_level = snapshot['Level'] == 1
    club_names = snapshot[in_country & in_top_level]['Club'].tolist()
    club_names.sort()

    pd.DataFrame(club_names).to_excel('tmp_team_names_elo.xlsx', index=False)

# get_team_names_from_elo('POL')

In [25]:
def simulate_results() -> None:
    """Placeholder, not implemented yet: currently does nothing and returns None."""
    return None

In [26]:
def get_team_names_from_api_dump(path: str) -> None:
    """Export the distinct home-team names found in a cached fixtures dump.

    Args:
        path: Location of a fixtures JSON file whose ``response`` list holds
            the fixtures.

    The names are de-duplicated, sorted and written, one per row, to
    ``tmp_team_names_api.xlsx`` in the working directory. Only the home side of
    each fixture is read.
    """
    with open(path, "r") as f:
        payload = json.load(f)

    home_names = {match['teams']['home']['name'] for match in payload['response']}
    ordered_names = sorted(home_names)

    pd.DataFrame(ordered_names).to_excel('tmp_team_names_api.xlsx', index=False)

# get_team_names_from_api_dump('data/api/fixtures_106_2023.json')

In [82]:
def get_api_teams_and_elo(date: str, country_code: str) -> pd.DataFrame:
    """Return one country's level-1 clubs with their Elo, using API team names.

    Args:
        date: Snapshot day; dashes are stripped, so ``'2023-09-20'`` reads
            ``data/elo/20230920.csv``.
        country_code: Value matched against the snapshot's ``Country`` column.

    Club names are translated through ``team_names.xlsx`` (columns
    ``ELO_name`` -> ``API_name``) so they match the names used in the fixtures
    dumps.

    Returns:
        A DataFrame with the columns ``Club`` (API name) and ``Elo``; rows keep
        the index they had in the snapshot.

    Raises:
        KeyError: if a selected club has no ``ELO_name`` row in
            ``team_names.xlsx``; the message lists every missing club.
    """
    date = date.replace('-', '')
    elo_data = pd.read_csv(f"data/elo/{date}.csv")

    # .copy() gives an independent frame, so rewriting 'Club' below does not
    # assign into a slice of the full snapshot (SettingWithCopyWarning).
    is_selected = (elo_data['Country'] == country_code) & (elo_data['Level'] == 1)
    selected = elo_data[is_selected].copy()

    team_map_df = pd.read_excel('team_names.xlsx')
    team_map = dict(zip(team_map_df['ELO_name'], team_map_df['API_name']))

    # Check the mapping up front so the error names all offending clubs at once.
    unmapped = sorted(str(club) for club in set(selected['Club']) - set(team_map))
    if unmapped:
        raise KeyError(
            "No API_name in team_names.xlsx for ClubElo club(s): " + ", ".join(unmapped)
        )

    selected['Club'] = selected['Club'].map(team_map)

    return selected[['Club', 'Elo']]

# elo_df = get_api_teams_and_elo('2023-09-20', 'POL')
# elo_df.head()

In [42]:
def build_historical_standings_table_after_n_rounds(league_id: str, season: str, country_code_elo: str, country_code_api: str, after_round: int, elo_date: str) -> pd.DataFrame:
    """Rebuild the points table from real results up to and including a round.

    Args:
        league_id: League id used in the cached fixtures file name.
        season: Season used in the cached fixtures file name.
        country_code_elo: Country code passed to get_api_teams_and_elo to pick
            the clubs that appear in the table.
        country_code_api: Currently unused; kept so existing calls keep working.
        after_round: Last round whose results are counted. The round number is
            the last space-separated token of ``fixture['league']['round']``.
        elo_date: Snapshot date passed to get_api_teams_and_elo.

    Scoring is 3 points for a win and 1 point each for a draw. Fixtures inside
    the counted rounds that have no recorded score (null goals) award nothing.
    Fixtures involving a club that is not in the Elo frame leave the table
    unchanged for that club.

    Returns:
        A DataFrame with the columns ``Club`` and ``Points``, sorted by points
        in descending order with a fresh 0..n-1 index.
    """

    elo_df = get_api_teams_and_elo(elo_date, country_code_elo)

    with open(f"data/api/fixtures_{league_id}_{season}.json", "r") as f:
        fixtures = json.load(f)['response']

    standings_df = elo_df.copy()[['Club']].reset_index(drop=True)
    standings_df['Points'] = 0

    for fixture in fixtures:
        round_number = int(fixture['league']['round'].split(' ')[-1])
        if round_number > after_round:
            continue

        home_goals = fixture['goals']['home']
        away_goals = fixture['goals']['away']

        # A fixture without a score has null goals; comparing None with `>`
        # raises TypeError, and such a game has awarded no points anyway.
        if home_goals is None or away_goals is None:
            continue

        home_team = fixture['teams']['home']['name']
        away_team = fixture['teams']['away']['name']

        if home_goals > away_goals:
            standings_df.loc[standings_df['Club'] == home_team, 'Points'] += 3
        elif home_goals < away_goals:
            standings_df.loc[standings_df['Club'] == away_team, 'Points'] += 3
        else:
            standings_df.loc[standings_df['Club'] == home_team, 'Points'] += 1
            standings_df.loc[standings_df['Club'] == away_team, 'Points'] += 1

    standings_df = standings_df.sort_values(by=['Points'], ascending=False).reset_index(drop=True)

    return standings_df

# standings_df = build_historical_standings_table_after_n_rounds('106', '2023', 'POL', 'PL', 8, '2023-09-20')
# standings_df.head()

In [74]:
def simulate_season(league_id: str, season: str, after_round: int, elo_df: pd.DataFrame, standings_df: pd.DataFrame, draw_prob: float = 0.25) -> pd.DataFrame:
    """Simulate every fixture after a given round and return the final table.

    Args:
        league_id: League id used in the cached fixtures file name.
        season: Season used in the cached fixtures file name.
        after_round: Fixtures whose round number (last space-separated token of
            ``fixture['league']['round']``) is greater than this are simulated;
            earlier ones are skipped.
        elo_df: Frame with ``Club`` and ``Elo`` columns; ratings stay fixed for
            the whole simulation.
        standings_df: Starting table with ``Club`` and ``Points`` columns. It is
            not modified.
        draw_prob: Fixed probability of a draw for every fixture (default
            0.25, the value that used to be hard-coded).

    For each fixture the Elo expectation of the home side, with HFA added to
    the rating difference, is scaled by ``1 - draw_prob`` to get the home-win
    probability; the away side gets the remainder. One outcome is drawn with
    ``random.choices`` and awards 3 points for a win or 1 point each for a
    draw. Clubs absent from ``standings_df`` receive no row.

    Returns:
        A copy of ``standings_df`` with the simulated points added, sorted by
        points in descending order with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``draw_prob`` is outside [0, 1].
        KeyError: if a simulated fixture involves a club with no row in
            ``elo_df``.
    """
    if not 0 <= draw_prob <= 1:
        raise ValueError(f"draw_prob must be between 0 and 1, got {draw_prob}")

    with open(f"data/api/fixtures_{league_id}_{season}.json", "r") as f:
        fixtures = json.load(f)['response']

    # Plain dict lookups replace a boolean-mask scan of the frame per fixture.
    # keep='first' mirrors taking the first matching row of elo_df.
    unique_clubs = elo_df.drop_duplicates(subset='Club', keep='first')
    elo_by_club = dict(zip(unique_clubs['Club'], unique_clubs['Elo']))

    decisive_prob = 1 - draw_prob  # probability mass shared by the two wins
    points_gained = {}

    for fixture in fixtures:
        round_number = int(fixture['league']['round'].split(' ')[-1])
        if round_number <= after_round:
            continue

        home_team = fixture['teams']['home']['name']
        away_team = fixture['teams']['away']['name']

        try:
            home_elo = elo_by_club[home_team]
            away_elo = elo_by_club[away_team]
        except KeyError as missing:
            raise KeyError(f"No Elo rating available for club {missing.args[0]!r}") from None

        elo_difference = home_elo - away_elo + HFA

        home_win_prob = 1 / (1 + math.pow(10, -elo_difference/400)) * decisive_prob
        away_win_prob = decisive_prob - home_win_prob

        result = random.choices(['home_win', 'away_win', 'draw'], [home_win_prob, away_win_prob, draw_prob])[0]

        if result == 'home_win':
            points_gained[home_team] = points_gained.get(home_team, 0) + 3
        elif result == 'away_win':
            points_gained[away_team] = points_gained.get(away_team, 0) + 3
        elif result == 'draw':
            points_gained[home_team] = points_gained.get(home_team, 0) + 1
            points_gained[away_team] = points_gained.get(away_team, 0) + 1

    # Apply all simulated points in one pass on a copy of the starting table.
    season_standings_df = standings_df.copy()
    season_standings_df['Points'] = season_standings_df['Points'] + season_standings_df['Club'].map(
        lambda club: points_gained.get(club, 0)
    )

    season_standings_df = season_standings_df.sort_values(by=['Points'], ascending=False).reset_index(drop=True)

    return season_standings_df

# elo_df = get_api_teams_and_elo('2023-09-20', 'POL')
# standings_df = build_historical_standings_table_after_n_rounds('106', '2023', 'POL', 'PL', 8, '2023-09-20')
# season_df = simulate_season('106', '2023', 8, elo_df, standings_df)
# season_df.head()

In [77]:
def run_multiple_sims(league_id: str, season: str, after_round: int, elo_df: pd.DataFrame, standings_df: pd.DataFrame, number_of_sims: int, number_of_winning_places: int, seed: Optional[int] = None) -> pd.DataFrame:
    """Run many season simulations and summarise how often each club "wins".

    Args:
        league_id, season, after_round, elo_df, standings_df: Passed unchanged
            to simulate_season for every run.
        number_of_sims: How many seasons to simulate.
        number_of_winning_places: A club counts as a winner of a run when it
            finishes in the first this-many rows of the simulated table.
        seed: Optional seed for the global ``random`` generator, applied once
            before the first run so results can be reproduced. With the default
            None the generator state is left untouched.

    Runs in which the last winning place and the first non-winning place are
    level on points are discarded, because points alone cannot separate them.
    When every club is inside the winning places there is no such cutoff and
    the run always counts.

    Returns:
        A DataFrame with the columns ``Club``, ``Wins``, ``% winrate`` (rounded
        to whole percent) and ``Expected odds`` (counted runs / wins, rounded
        to 2 decimals), sorted by wins in descending order. Clubs that never
        won do not appear.
    """
    if seed is not None:
        random.seed(seed)

    winners = dict()

    number_of_successful_sims = 0

    for _ in tqdm(range(number_of_sims)):
        winners_df = simulate_season(league_id, season, after_round, elo_df, standings_df)

        # Only look at the row below the cutoff when that row exists; indexing
        # it unconditionally raised IndexError when all clubs were winners.
        has_club_below_cutoff = number_of_winning_places < len(winners_df)
        if has_club_below_cutoff and winners_df.iloc[number_of_winning_places - 1]['Points'] == winners_df.iloc[number_of_winning_places]['Points']:
            continue

        number_of_successful_sims += 1
        for club in winners_df['Club'].iloc[:number_of_winning_places]:
            winners[club] = winners.get(club, 0) + 1

    df = pd.DataFrame(list(winners.items()), columns=['Club', 'Wins'])
    # Keep 'Wins' numeric even when no run counted and the frame is empty.
    df['Wins'] = df['Wins'].astype('int64')
    df['% winrate'] = (df['Wins'] / number_of_successful_sims * 100).round()
    df['Expected odds'] = (number_of_successful_sims / df['Wins']).round(2)
    df = df.sort_values(by=['Wins'], ascending=False).reset_index(drop=True)
    return df

# Example run for league id 106 (Ekstraklasa, see the find_league_id checks), season 2023.
# Needs data/elo/20230920.csv, team_names.xlsx and data/api/fixtures_106_2023.json on disk.
elo_df = get_api_teams_and_elo('2023-09-20', 'POL')
# Real results are counted through round 8; everything after that is simulated.
standings_df = build_historical_standings_table_after_n_rounds('106', '2023', 'POL', 'PL', 8, '2023-09-20')
# 100 simulated seasons; a club "wins" a run by finishing in the top 3.
# As the last expression of the cell, the returned frame is the cell's output.
run_multiple_sims('106', '2023', 8, elo_df, standings_df, 100, 3)