In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import poisson, skellam
from scipy.optimize import minimize
from bettools import (
    get_data,
    generate_seasons,
    calculate_poisson_match_outcomes,
    calculate_ev_from_odds,
)
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
from dixon_coles import (
    rho_correction,
    dc_log_like,
    solve_parameters,
    solve_parameters_decay,
)

# Hide every RuntimeWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# Display settings: no cap on the number of columns shown, width left unset.
pd.options.display.max_columns = None
pd.options.display.width = None
# Turn off the chained-assignment (SettingWithCopy) check.
pd.set_option("mode.chained_assignment", None)

In [2]:
def get_1x2_probs(match_score_matrix):
    """Collapse a score matrix into home-win / away-win / draw totals.

    Entry ``[i, j]`` is treated as the weight of the score "row side i,
    column side j". The strictly lower triangle (row > column) is summed
    into ``"H"``, the strictly upper triangle into ``"A"`` and the main
    diagonal into ``"D"``.

    Returns:
        dict with keys ``"H"``, ``"A"`` and ``"D"`` (in that order).
    """
    home_win = np.tril(match_score_matrix, k=-1).sum()
    away_win = np.triu(match_score_matrix, k=1).sum()
    draw = np.diag(match_score_matrix).sum()
    return {"H": home_win, "A": away_win, "D": draw}


def build_temp_model(dataset, time_diff, xi=0.000, init_params=None):
    """Fit on older matches and score a three-unit hold-out window.

    Rows whose ``time_diff`` lies in ``[time_diff - 2, time_diff]`` form the
    hold-out window; rows with a strictly larger ``time_diff`` form the
    training set. The training ``time_diff`` values are shifted down by
    ``time_diff`` before fitting, so they are measured from the cut-off.
    (Presumably ``time_diff`` counts days back from the latest match -
    confirm against the code that builds the column.)

    Args:
        dataset: DataFrame with at least ``time_diff``, ``HomeTeam``,
            ``AwayTeam`` and ``FTR`` columns. It is not modified.
        time_diff: Cut-off separating hold-out rows from training rows.
        xi: Forwarded to ``solve_parameters_decay`` as ``xi``.
        init_params: Forwarded to ``solve_parameters_decay`` as ``init_vals``.

    Returns:
        The sum over hold-out rows of the log of the 1X2 probability the
        fitted model gives to the recorded result (``FTR``), or ``0`` when
        the hold-out window contains no rows.
    """
    in_window = (dataset["time_diff"] <= time_diff) & (
        dataset["time_diff"] >= (time_diff - 2)
    )
    test_dataset = dataset[in_window]
    if len(test_dataset) == 0:
        return 0

    # Build the re-based training frame with ``assign`` instead of writing a
    # column into a filtered slice: no SettingWithCopy warning to suppress
    # and no ambiguity about whether ``dataset`` could be touched.
    older_matches = dataset[dataset["time_diff"] > time_diff]
    train_dataset = older_matches.assign(
        time_diff=older_matches["time_diff"] - time_diff
    )

    params = solve_parameters_decay(train_dataset, xi=xi, init_vals=init_params)
    predictive_score = sum(
        np.log(
            get_1x2_probs(
                dixon_coles_simulate_match(params, row.HomeTeam, row.AwayTeam)
            )[row.FTR]
        )
        for row in test_dataset.itertuples()
    )
    return predictive_score


def get_total_score_xi(xi):
    """Score one ``xi`` value over a grid of cut-offs and pickle the scores.

    Calls ``build_temp_model`` on the notebook-level frame ``dc_df`` for the
    cut-offs 99, 96, ..., 0 and writes the resulting list to
    ``find_xi_1season_<suffix>.txt`` in the working directory, where
    ``<suffix>`` is ``str(xi)`` with its first two characters removed
    (``0.001`` -> ``001``).

    NOTE(review): ``dc_df`` is not defined in the visible cells; it must
    exist in the kernel before this is called. The suffix rule assumes
    ``str(xi)`` starts with ``"0."``; values printed in scientific notation
    (e.g. ``5e-05``) yield an odd file name.

    Args:
        xi: Value forwarded to ``build_temp_model`` as ``xi``.

    Returns:
        None. The scores are only written to disk.
    """
    # ``pickle`` is not imported in the notebook's import cell, so the
    # original raised NameError here; import it locally to be self-contained.
    import pickle

    xi_result = [build_temp_model(dc_df, day, xi=xi) for day in range(99, -1, -3)]
    with open("find_xi_1season_{}.txt".format(str(xi)[2:]), "wb") as thefile:
        pickle.dump(xi_result, thefile)


def calc_means(param_dict, homeTeam, awayTeam):
    """Return the two exponentiated rate parameters for a fixture.

    The first element is ``exp(attack_<home> + defence_<away> + home_adv)``
    and the second is ``exp(defence_<home> + attack_<away>)``, with every
    term looked up in ``param_dict``.

    Returns:
        A two-element list ``[home_value, away_value]``.
    """
    home_log_rate = (
        param_dict["attack_" + homeTeam]
        + param_dict["defence_" + awayTeam]
        + param_dict["home_adv"]
    )
    away_log_rate = param_dict["defence_" + homeTeam] + param_dict["attack_" + awayTeam]
    return [np.exp(home_log_rate), np.exp(away_log_rate)]


def dixon_coles_simulate_match(params_dict, homeTeam, awayTeam, max_goals=10):
    """Build the score-probability matrix for one fixture.

    The two means come from ``calc_means``. Each side's goal count
    ``0..max_goals`` gets an independent Poisson pmf, the outer product gives
    the joint matrix (rows = home goals, columns = away goals), and the
    low-score corner (0 or 1 goals each) is multiplied element-wise by
    ``rho_correction``.

    Args:
        params_dict: Fitted parameters; must contain ``"rho"`` plus the keys
            read by ``calc_means``.
        homeTeam: Home team name used to build the parameter keys.
        awayTeam: Away team name used to build the parameter keys.
        max_goals: Largest goal count represented per side.

    Returns:
        ``numpy.ndarray`` of shape ``(max_goals + 1, max_goals + 1)``. The
        grid is truncated, so the entries need not sum to exactly one.
    """
    home_mean, away_mean = calc_means(params_dict, homeTeam, awayTeam)

    # One vectorised pmf call per side instead of one call per goal count.
    goal_counts = np.arange(max_goals + 1)
    output_matrix = np.outer(
        poisson.pmf(goal_counts, home_mean), poisson.pmf(goal_counts, away_mean)
    )

    rho = params_dict["rho"]
    correction_matrix = np.array(
        [
            [
                rho_correction(home_goals, away_goals, home_mean, away_mean, rho)
                for away_goals in range(2)
            ]
            for home_goals in range(2)
        ]
    )

    # With max_goals < 1 the matrix is smaller than 2x2; crop the correction
    # to the corner that exists instead of failing on a shape mismatch.
    corner = min(2, output_matrix.shape[0])
    output_matrix[:corner, :corner] *= correction_matrix[:corner, :corner]
    return output_matrix

In [3]:
def _training_frame(data, train_start, train_end):
    """Slice the training rows and attach their age in days as ``time_diff``."""
    window = data.iloc[train_start:train_end]
    # Age is measured from the newest row of *this* window, so it is
    # recomputed every time the window grows.
    age_in_days = (window.index.max() - window.index).days
    return window[["HomeTeam", "AwayTeam", "FTHG", "FTAG"]].assign(
        time_diff=age_in_days
    )


def _predict_1x2(params, test_rows):
    """Return a re-indexed copy of ``test_rows`` with 1X2 probabilities added."""
    scored = test_rows.reset_index()
    outcome_probs = [
        get_1x2_probs(dixon_coles_simulate_match(params, home, away, max_goals=10))
        for home, away in zip(scored["HomeTeam"], scored["AwayTeam"])
    ]
    scored["home_win_prob"] = [probs["H"] for probs in outcome_probs]
    scored["away_win_prob"] = [probs["A"] for probs in outcome_probs]
    scored["draw_win_prob"] = [probs["D"] for probs in outcome_probs]
    return scored


def walk_forward_validation(data, initial_train_size, test_size, num_iterations):
    """Expanding-window walk-forward evaluation of the decayed model.

    Each step fits ``solve_parameters_decay`` (``xi=0.00325``) on rows
    ``[0, train_end)`` and predicts the next ``test_size`` rows. The training
    window then grows to include the rows just predicted.

    If fitting or prediction raises, both windows move forward by
    ``test_size`` rows and the step is retried, up to five attempts. Moving
    the test window with the training window keeps already-trained rows out
    of the test set. Validation stops early when the data runs out or when
    five consecutive attempts fail.

    Args:
        data: DataFrame with ``HomeTeam``, ``AwayTeam``, ``FTHG`` and ``FTAG``
            columns. Its index must support ``(index.max() - index).days``
            (e.g. a ``DatetimeIndex``). Rows are taken in positional order -
            presumably already sorted chronologically; confirm at the caller.
        initial_train_size: Number of rows in the first training window.
        test_size: Number of rows predicted per step.
        num_iterations: Maximum number of successful steps.

    Returns:
        A list with one DataFrame per successful step: the test rows (index
        reset to a column) plus ``home_win_prob``, ``away_win_prob`` and
        ``draw_win_prob``.
    """
    total_rows = len(data)
    max_attempts = 5  # consecutive failed fits tolerated per step

    train_start = 0
    train_end = initial_train_size
    test_end = train_end + test_size

    results = []
    iteration = 0
    while test_end <= total_rows and iteration < num_iterations:
        successful_fit = False
        for attempt in range(1, max_attempts + 1):
            # Rebuild both frames on every attempt so a retry never reuses a
            # half-processed test frame or stale ``time_diff`` values.
            train_data = _training_frame(data, train_start, train_end)
            try:
                params = solve_parameters_decay(train_data, xi=0.00325)
                scored = _predict_1x2(params, data.iloc[train_end:test_end])
            except Exception as e:
                # Deliberately broad: any failure means "retry with more data".
                print(
                    f"Parameter fitting failed on attempt {attempt}: {e}. Trying with a larger training set."
                )
                train_end += test_size
                test_end += test_size
                if test_end > total_rows:
                    print(
                        "Not enough data to expand the training set and perform another test. Stopping."
                    )
                    return results
            else:
                results.append(scored)
                successful_fit = True
                break

        if not successful_fit:
            print(
                f"Parameter fitting failed {max_attempts} times in a row. Stopping."
            )
            break

        # Move the window forward: the rows just predicted join the training set.
        train_end = test_end
        test_end += test_size
        iteration += 1
    return results

In [4]:
from concurrent.futures import ProcessPoolExecutor
from process_chunk import process_chunk
from tqdm import tqdm  # For progress reporting


def walk_forward_validation_parallel(
    data, initial_train_size, test_size, num_iterations, max_workers=7
):
    """Run the walk-forward chunks in a process pool with a progress bar.

    Chunk ``i`` is the tuple
    ``(data, i * test_size, initial_train_size + i * test_size, test_size)``
    and is handed to ``process_chunk`` as a single argument. A chunk is only
    scheduled when ``initial_train_size + (i + 1) * test_size`` does not
    exceed ``len(data)``.

    Args:
        data: Dataset forwarded unchanged to every ``process_chunk`` call.
        initial_train_size: Offset added to ``i * test_size`` to form the
            second index of each chunk.
        test_size: Step between consecutive chunks; also passed on as the
            last tuple element.
        num_iterations: Upper bound on the number of chunks.
        max_workers: Size of the process pool (default 7, the value that was
            previously hard-coded).

    Returns:
        List of ``process_chunk`` results in chunk order; ``[]`` when no
        chunk fits in the data.
    """
    total_rows = len(data)

    tasks = [
        (data, i * test_size, initial_train_size + i * test_size, test_size)
        for i in range(num_iterations)
        if initial_train_size + (i + 1) * test_size <= total_rows
    ]
    if not tasks:
        # Nothing to schedule: skip starting worker processes altogether.
        return []

    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # The context manager closes the bar even if a worker raises.
        with tqdm(total=len(tasks), desc="Processing Chunks") as progress_bar:
            # ``executor.map`` yields results in submission order.
            for chunk_result in executor.map(process_chunk, tasks):
                results.append(chunk_result)
                progress_bar.update(1)

    return results


# Each entry is a one-element list of league codes, passed to get_data as is.
leagues = [["E0"], ["E1"], ["E2"], ["E3"]]
for league in leagues:
    season_list = generate_seasons(2014, 2024)
    df_ls = get_data(season_list, league)

    # Stack the per-season frames, drop rows with any missing value, then
    # parse, sort by and index on the match date.
    main_df = (
        pd.concat(df_ls)
        .dropna()
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
        .sort_values("Date")
        .set_index("Date")
    )

    # 500 rows seed the first window; each chunk then advances by 10 rows.
    num_chunks = int((len(main_df) - 500) / 10)
    res = walk_forward_validation_parallel(main_df, 500, 10, num_chunks)
    results = pd.concat(res)

    # NOTE(review): `league` is a list, so the file name embeds its repr,
    # e.g. "results_dc_['E0'].csv". Left unchanged in case later cells read
    # these exact paths.
    results.to_csv(f"results_dc_{league}.csv")

Processing Chunks: 100%|████████████████████████████████| 321/321 [3:08:19<00:00, 35.20s/it]


Optimization terminated successfully    (Exit mode 0)
            Current function value: 667.6007251543847
            Iterations: 49
            Function evaluations: 2461
            Gradient evaluations: 49
Optimization terminated successfully    (Exit mode 0)
            Current function value: 719.3346773587812
            Iterations: 49
            Function evaluations: 2465
            Gradient evaluations: 49
Optimization terminated successfully    (Exit mode 0)
            Current function value: 763.3214258595086
            Iterations: 54
            Function evaluations: 2719
            Gradient evaluations: 54
Optimization terminated successfully    (Exit mode 0)
            Current function value: 771.6443895772172
            Iterations: 49
            Function evaluations: 2468
            Gradient evaluations: 49
Optimization terminated successfully    (Exit mode 0)
            Current function value: 603.2382949079465
            Iterations: 68
            Function 

Processing Chunks: 100%|██████████| 397/397 [4:45:31<00:00, 43.15s/it]


Optimization terminated successfully    (Exit mode 0)
            Current function value: 699.5284187654115
            Iterations: 74
            Function evaluations: 4596
            Gradient evaluations: 74
Optimization terminated successfully    (Exit mode 0)
            Current function value: 739.1841482677721
            Iterations: 74
            Function evaluations: 4601
            Gradient evaluations: 74
Optimization terminated successfully    (Exit mode 0)
            Current function value: 722.6926877220812
            Iterations: 76
            Function evaluations: 4720
            Gradient evaluations: 76
Optimization terminated successfully    (Exit mode 0)
            Current function value: 746.716661696279
            Iterations: 79
            Function evaluations: 4905
            Gradient evaluations: 79
Optimization terminated successfully    (Exit mode 0)
            Current function value: 782.5886851478682
            Iterations: 74
            Function e