In [None]:
import collections
import warnings
import ScraperFC as sfc  # LEAVE THIS!
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import poisson


In [None]:
def calculate_rps(probs, outcome):
    """
    Calculate the Ranked Probability Score

    Parameters
    ----------
    probs : list
        A list of the predicted probabilities of each outcome occurring

    outcome : int
        An integer designating which index in `probs` was the observed outcome

    Returns
    -------
    float
        The Ranked Probability Score as floating point number
    """
    cum_probs = np.cumsum(probs)
    cum_outcomes = np.zeros(len(probs))
    cum_outcomes[outcome] = 1
    cum_outcomes = np.cumsum(cum_outcomes)

    sum_rps = 0
    for i in range(len(probs)):
        sum_rps += (cum_probs[i] - cum_outcomes[i]) ** 2

    return sum_rps / (len(probs) - 1)


In [None]:
def rho_correction_vec(df):
    dc_adj = np.select(
        [
            (df["goals_home"] == 0) & (df["goals_away"] == 0),
            (df["goals_home"] == 0) & (df["goals_away"] == 1),
            (df["goals_home"] == 1) & (df["goals_away"] == 0),
            (df["goals_home"] == 1) & (df["goals_away"] == 1),
        ],
        [
            1 - (df["home_exp"] * df["away_exp"] * df["rho"]),
            1 + (df["home_exp"] * df["rho"]),
            1 + (df["away_exp"] * df["rho"]),
            1 - df["rho"],
        ],
        default=1,
    )
    return dc_adj

In [None]:
def dc_decay(xi, t):
    return np.exp(-xi * t)

In [None]:
class FootballProbabilityGrid(list):
    def __init__(
        self,
        goal_matrix: list,
        home_goal_expectation: float,
        away_goal_expectation: float,
    ):
        list.__init__(self, goal_matrix)
        self.home_goal_expectation = home_goal_expectation
        self.away_goal_expectation = away_goal_expectation

        # Compute marginal probabilities for home and away shots on target
        self.home_sot_probabilities = np.sum(self, axis=1)
        self.away_sot_probabilities = np.sum(self, axis=0)


    def __repr__(self):
        repr_str = ""
        repr_str += "Module: Penaltyblog"
        repr_str += "\n"
        repr_str += "\n"

        repr_str += "Class: FootballProbabilityGrid"
        repr_str += "\n"
        repr_str += "\n"

        repr_str += "Home Goal Expectation: {0}".format(self.home_goal_expectation)
        repr_str += "\n"
        repr_str += "Away Goal Expectation: {0}".format(self.away_goal_expectation)
        repr_str += "\n"
        repr_str += "\n"

        repr_str += "Home Win: {0}".format(self.home_win)
        repr_str += "\n"
        repr_str += "Draw: {0}".format(self.draw)
        repr_str += "\n"
        repr_str += "Away Win: {0}".format(self.away_win)
        repr_str += "\n"

        repr_str += "BTTS: {0}".format(self.both_teams_to_score)
        repr_str += "\n"
        return repr_str

    def __str__(self):
        return self.__repr__()

    def _sum(self, func):
        return sum(
            [
                self[a][b]
                for a in range(len(self))
                for b in range(len(self))
                if func(a, b)
            ]
        )

    @property
    def home_win(self) -> float:
        """
        Probability of home win

        Returns
        ------
        float
            Probability of home win
        """
        return self._sum(lambda a, b: a > b)

    @property
    def draw(self) -> float:
        """
        Probability of draw

        Returns
        ------
        float
            Probability of draw
        """
        return self._sum(lambda a, b: a == b)

    @property
    def away_win(self) -> float:
        """
        Probability of away win

        Returns
        ------
        float
            Probability of away win
        """
        return self._sum(lambda a, b: a < b)

    @property
    def both_teams_to_score(self) -> float:
        """
        Probability of both teams scoring

        Returns
        ------
        float
            Probability of both teams scoring
        """
        return self._sum(lambda a, b: a > 0 and b > 0)

    @property
    def home_draw_away(self) -> list:
        """
        1x2 Probabilities

        Returns
        ------
        list
            Probability of home win
        """
        return [self.home_win, self.draw, self.away_win]

    def total_goals(self, over_under: str, strike: float) -> float:
        """
        Predicts the probabilities of `total goals` market

        Parameters
        ----------
        over_under : str
            Whether probabilities are for over / under the
            total goals value - must be one of ['over', 'under']

        strike : float
            The total goals value for the market

        Returns
        ------
        float
            Probability of over / under the strike occurring
        """
        if over_under == "over":
            func = lambda a, b: a + b > strike
        elif over_under == "under":
            func = lambda a, b: a + b < strike
        else:
            raise ValueError("over_under must be one of ['over', 'under']")
        return self._sum(func)

    def asian_handicap(self, home_away: str, strike: float) -> float:
        """
        Predicts the probabilities of `asian handicap` market

        Parameters
        ----------
        home_away : str
            Whether probabilities are for home / away team -
            must be one of ['home', 'away']

        goals : float
            The total goals value

        Returns
        ------
        float
            Probability of home / away team outscoring the strike
        """
        if home_away == "home":
            func = lambda a, b: a - b > strike
        elif home_away == "away":
            func = lambda a, b: b - a > strike
        else:
            raise ValueError("home_away must be one of ['home', 'away']")
        return self._sum(func)

In [None]:
class DixonColesGoalModel:
    """Dixon and Coles adjusted Poisson model for predicting outcomes of football
    (soccer) matches

    Methods
    -------
    fit()
        fits a Dixon and Coles adjusted Poisson model to the data to calculate the team strengths.
        Must be called before the model can be used to predict game outcomes

    predict(home_team, away_team, max_goals=15)
        predict the outcome of a football (soccer) game between the home_team and away_team

    get_params()
        Returns the fitted parameters from the model
    """

    def __init__(self, goals_home, goals_away, teams_home, teams_away, weights=1):
        """
        Parameters
        ----------
        goals_home : list
            A list or pd.Series of goals scored by the home_team
        goals_away : list
            A list or pd.Series of goals scored by the away_team
        teams_home : list
            A list or pd.Series of team_names for the home_team
        teams_away : list
            A list or pd.Series of team_names for the away_team
        weights : list
            A list or pd.Series of weights for the data,
            the lower the weight the less the match has on the output
        """

        self.fixtures = pd.DataFrame([goals_home, goals_away, teams_home, teams_away]).T
        self.fixtures.columns = ["goals_home", "goals_away", "team_home", "team_away"]
        self.fixtures["goals_home"] = self.fixtures["goals_home"].astype(int)
        self.fixtures["goals_away"] = self.fixtures["goals_away"].astype(int)
        self.fixtures["weights"] = weights

        self.teams = np.sort(np.unique(np.concatenate([teams_home, teams_away])))
        self.n_teams = len(self.teams)

        self._params = np.concatenate(
            (
                [1] * self.n_teams,
                [-1] * self.n_teams,
                [0.25],  # home advantage
                [-0.1],  # rho
            )
        )

        self._res = None
        self.loglikelihood = None
        self.aic = None
        self.n_params = None
        self.fitted = False

    def __repr__(self):
        repr_str = ""
        repr_str += "Module: Penaltyblog"
        repr_str += "\n"
        repr_str += "\n"

        repr_str += "Model: Dixon and Coles"
        repr_str += "\n"
        repr_str += "\n"

        if not self.fitted:
            repr_str += "Status: Model not fitted"
            return repr_str

        repr_str += "Number of parameters: {0}".format(self.n_params)
        repr_str += "\n"
        repr_str += "Log Likelihood: {0}".format(round(self.loglikelihood, 3))
        repr_str += "\n"
        repr_str += "AIC: {0}".format(round(self.aic, 3))
        repr_str += "\n"
        repr_str += "\n"

        repr_str += "{0: <20} {1:<20} {2:<20}".format("Team", "Attack", "Defence")
        repr_str += "\n"
        repr_str += "-" * 60
        repr_str += "\n"

        for idx, team in enumerate(self.teams):
            repr_str += "{0: <20} {1:<20} {2:<20}".format(
                self.teams[idx],
                round(self._params[idx], 3),
                round(self._params[idx + self.n_teams], 3),
            )
            repr_str += "\n"

        repr_str += "-" * 60
        repr_str += "\n"

        repr_str += "Home Advantage: {0}".format(round(self._params[-2], 3))
        repr_str += "\n"
        repr_str += "Rho: {0}".format(round(self._params[-1], 3))
        repr_str += "\n"

        return repr_str

    def __str__(self):
        return self.__repr__()

    @staticmethod
    def _fit(params, fixtures, teams):
        """
        Internal method, not to called directly by the user
        """
        n_teams = len(teams)

        params_df = (
            pd.DataFrame(params[:n_teams], columns=["attack"])
            .assign(defence=params[n_teams : n_teams * 2])
            .assign(team=teams)
        )

        df2 = (
            fixtures.merge(params_df, left_on="team_home", right_on="team")
            .rename(columns={"attack": "home_attack", "defence": "home_defence"})
            .drop("team", axis=1)
            .merge(params_df, left_on="team_away", right_on="team")
            .rename(columns={"attack": "away_attack", "defence": "away_defence"})
            .assign(hfa=params[-2])
            .assign(rho=params[-1])
        )

        df2["home_exp"] = np.exp(df2["hfa"] + df2["home_attack"] + df2["away_defence"])
        df2["away_exp"] = np.exp(df2["away_attack"] + df2["home_defence"])
        df2["home_llk"] = poisson.logpmf(df2["goals_home"], df2["home_exp"])
        df2["away_llk"] = poisson.logpmf(df2["goals_away"], df2["away_exp"])
        df2["dc_adj"] = rho_correction_vec(df2)

        df2["llk"] = (df2["home_llk"] + df2["away_llk"] + np.log(df2["dc_adj"])) * df2[
            "weights"
        ]

        return -df2["llk"].sum()

    def fit(self):
        """
        Fits the model to the data and calculates the team strengths,
        home advantage and intercept. Should be called before `predict` can be used
        """
        options = {
            "maxiter": 100,
            "disp": False,
        }

        constraints = [
            {
                "type": "eq",
                "fun": lambda x: sum(x[: self.n_teams]) - self.n_teams,
            }
        ]

        bounds = [(-3, 3)] * self.n_teams
        bounds += [(-3, 3)] * self.n_teams
        bounds += [(0, 2)]
        bounds += [(-2, 2)]

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            self._res = minimize(
                self._fit,
                self._params,
                args=(self.fixtures, self.teams),
                constraints=constraints,
                bounds=bounds,
                options=options,
            )

        self._params = self._res["x"]
        self.n_params = len(self._params)
        self.loglikelihood = self._res["fun"] * -1
        self.aic = -2 * (self.loglikelihood) + 2 * self.n_params
        self.fitted = True

    def predict(self, home_team, away_team, max_goals=15):
        """
        Predicts the probabilities of the different possible match outcomes

        Parameters
        ----------
        home_team : str
            The name of the home_team, must have been in the data the model was fitted on

        away_team : str
            The name of the away_team, must have been in the data the model was fitted on

        max_goals : int
            The maximum number of goals to calculate the probabilities over.
            Reducing this will improve performance slightly at the expensive of acuuracy

        Returns
        -------
        FootballProbabilityGrid
            A class providing access to a range of probabilites,
            such as 1x2, asian handicaps, over unders etc
        """
        # check the model has been fit
        if not self.fitted:
            raise ValueError(
                (
                    "Model's parameters have not been fit yet, please call the `fit()` "
                    "function before making any predictions"
                )
            )

        if isinstance(home_team, str) and isinstance(away_team, str):
            return self._predict(home_team, away_team, max_goals)

        elif isinstance(home_team, collections.abc.Sequence) and isinstance(
            away_team, collections.abc.Sequence
        ):
            results = [
                self._predict(x[0], x[1], max_goals) for x in zip(home_team, away_team)
            ]
            return results

        else:
            raise ValueError("Team data types not recognised")

    def _predict(self, home_team, away_team, max_goals=15):
        """
        Predicts the probabilities of the different possible match outcomes

        Parameters
        ----------
        home_team : str
            The name of the home_team, must have been in the data the model was fitted on

        away_team : str
            The name of the away_team, must have been in the data the model was fitted on

        max_goals : int
            The maximum number of goals to calculate the probabilities over.
            Reducing this will improve performance slightly at the expensive of acuuracy

        Returns
        -------
        FootballProbabilityGrid
            A class providing access to a range of probabilites,
            such as 1x2, asian handicaps, over unders etc
        """

        # check we have parameters for teams
        if home_team not in self.teams:
            raise ValueError(
                (
                    "No parameters for home team - "
                    "please ensure the team was included in the training data"
                )
            )

        if away_team not in self.teams:
            raise ValueError(
                (
                    "No parameters for away team - "
                    "please ensure the team was included in the training data"
                )
            )

        # get the relevant model parameters
        home_idx = np.where(self.teams == home_team)[0][0]
        away_idx = np.where(self.teams == away_team)[0][0]

        home_attack = self._params[home_idx]
        away_attack = self._params[away_idx]

        home_defence = self._params[home_idx + self.n_teams]
        away_defence = self._params[away_idx + self.n_teams]

        home_advantage = self._params[-2]
        rho = self._params[-1]

        # calculate the goal expectation
        home_goals = np.exp(home_advantage + home_attack + away_defence)
        away_goals = np.exp(away_attack + home_defence)
        home_goals_vector = poisson(home_goals).pmf(np.arange(0, max_goals))
        away_goals_vector = poisson(away_goals).pmf(np.arange(0, max_goals))

        # get the probabilities for each possible score
        m = np.outer(home_goals_vector, away_goals_vector)

        # apply Dixon and Coles adjustment
        m[0, 0] *= 1 - home_goals * away_goals * rho
        m[0, 1] *= 1 + home_goals * rho
        m[1, 0] *= 1 + away_goals * rho
        m[1, 1] *= 1 - rho

        # and return the FootballProbabilityGrid
        probability_grid = FootballProbabilityGrid(m, home_goals, away_goals)

        return probability_grid

    def get_params(self):
        """
        Provides access to the model's fitted parameters

        Returns
        -------
        dict
            A dict containing the model's parameters
        """
        if not self.fitted:
            raise ValueError(
                "Model's parameters have not been fit yet, please call the `fit()` function first"
            )

        params = dict(
            zip(
                ["attack_" + team for team in self.teams]
                + ["defence_" + team for team in self.teams]
                + ["home_advantage", "rho"],
                self._res["x"],
            )
        )
        return params

In [None]:
class DixonColesGoalModel_sot:
    """Dixon and Coles adjusted Poisson model for predicting outcomes of football
    (soccer) matches

    Methods
    -------
    fit()
        fits a Dixon and Coles adjusted Poisson model to the data to calculate the team strengths.
        Must be called before the model can be used to predict game outcomes

    predict(home_team, away_team, max_goals=15)
        predict the outcome of a football (soccer) game between the home_team and away_team

    get_params()
        Returns the fitted parameters from the model

    fit_shots_on_target()
        Fits the model for shots on target using the provided data

    predict_shots_on_target(home_team, away_team, max_shots=15)
        Predicts the shots on target between the home_team and away_team
    """

    def __init__(
        self,
        goals_home,
        goals_away,
        teams_home,
        teams_away,
        shots_on_target_home=None,
        shots_on_target_away=None,
        weights=1,
    ):
        """
        Parameters
        ----------
        goals_home : list
            A list or pd.Series of goals scored by the home_team
        goals_away : list
            A list or pd.Series of goals scored by the away_team
        teams_home : list
            A list or pd.Series of team_names for the home_team
        teams_away : list
            A list or pd.Series of team_names for the away_team
        shots_on_target_home : list, optional
            A list or pd.Series of shots on target by the home_team
        shots_on_target_away : list, optional
            A list or pd.Series of shots on target by the away_team
        weights : list
            A list or pd.Series of weights for the data,
            the lower the weight the less the match has on the output
        """

        self.fixtures = pd.DataFrame([goals_home, goals_away, teams_home, teams_away]).T
        self.fixtures.columns = ["goals_home", "goals_away", "team_home", "team_away"]
        self.fixtures["goals_home"] = self.fixtures["goals_home"].astype(int)
        self.fixtures["goals_away"] = self.fixtures["goals_away"].astype(int)
        self.fixtures["weights"] = weights

        if shots_on_target_home is not None and shots_on_target_away is not None:
            self.fixtures["shots_on_target_home"] = shots_on_target_home
            self.fixtures["shots_on_target_away"] = shots_on_target_away
        else:
            self.fixtures["shots_on_target_home"] = np.nan
            self.fixtures["shots_on_target_away"] = np.nan

        self.teams = np.sort(np.unique(np.concatenate([teams_home, teams_away])))
        self.n_teams = len(self.teams)

        self._params = np.concatenate(
            (
                [1] * self.n_teams,
                [-1] * self.n_teams,
                [0.25],  # home advantage
                [-0.1],  # rho
            )
        )

        self._res = None
        self.loglikelihood = None
        self.aic = None
        self.n_params = None
        self.fitted = False

        # Initialize variables for shots on target model
        self._params_sot = None
        self._res_sot = None
        self.loglikelihood_sot = None
        self.aic_sot = None
        self.n_params_sot = None
        self.fitted_sot = False

    # ... [existing methods: __repr__, __str__, _fit, fit, predict, _predict, get_params] ...

    def fit_shots_on_target(self):
        """
        Fits the model to the data for shots on target and calculates the team strengths,
        home advantage, and rho for shots on target.
        Should be called before `predict_shots_on_target` can be used.
        """
        if self.fixtures["shots_on_target_home"].isnull().any():
            raise ValueError(
                "Shots on target data is missing. Please provide shots on target data in the __init__ method."
            )

        # Initialize parameters for shots on target model
        self._params_sot = np.concatenate(
            (
                [1] * self.n_teams,   # attack parameters
                [-1] * self.n_teams,  # defence parameters
                [0.25],               # home advantage
                [-0.1],               # rho
            )
        )

        options = {
            "maxiter": 100,
            "disp": False,
        }

        constraints = [
            {
                "type": "eq",
                "fun": lambda x: sum(x[: self.n_teams]) - self.n_teams,
            }
        ]

        bounds = [(-3, 3)] * self.n_teams
        bounds += [(-3, 3)] * self.n_teams
        bounds += [(0, 2)]
        bounds += [(-2, 2)]

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            self._res_sot = minimize(
                self._fit_shots_on_target,
                self._params_sot,
                args=(self.fixtures, self.teams),
                constraints=constraints,
                bounds=bounds,
                options=options,
            )

        self._params_sot = self._res_sot["x"]
        self.n_params_sot = len(self._params_sot)
        self.loglikelihood_sot = self._res_sot["fun"] * -1
        self.aic_sot = -2 * (self.loglikelihood_sot) + 2 * self.n_params_sot
        self.fitted_sot = True

    def _fit_shots_on_target(self, params, fixtures, teams):
        """
        Internal method for fitting the shots on target model.
        Not to be called directly by the user.
        """
        n_teams = len(teams)

        params_df = (
            pd.DataFrame(params[:n_teams], columns=["attack"])
            .assign(defence=params[n_teams : n_teams * 2])
            .assign(team=teams)
        )

        df2 = (
            fixtures.merge(params_df, left_on="team_home", right_on="team")
            .rename(columns={"attack": "home_attack", "defence": "home_defence"})
            .drop("team", axis=1)
            .merge(params_df, left_on="team_away", right_on="team")
            .rename(columns={"attack": "away_attack", "defence": "away_defence"})
            .assign(hfa=params[-2])
            .assign(rho=params[-1])
        )

        df2["home_exp"] = np.exp(df2["hfa"] + df2["home_attack"] + df2["away_defence"])
        df2["away_exp"] = np.exp(df2["away_attack"] + df2["home_defence"])

        df2["home_llk"] = poisson.logpmf(df2["shots_on_target_home"], df2["home_exp"])
        df2["away_llk"] = poisson.logpmf(df2["shots_on_target_away"], df2["away_exp"])
        df2["dc_adj"] = rho_correction_vec(df2)

        df2["llk"] = (df2["home_llk"] + df2["away_llk"] + np.log(df2["dc_adj"])) * df2["weights"]

        return -df2["llk"].sum()

    def predict_shots_on_target(self, home_team, away_team, max_shots=15):
        """
        Predicts the probabilities of the different possible shots on target outcomes

        Parameters
        ----------
        home_team : str
            The name of the home_team, must have been in the data the model was fitted on

        away_team : str
            The name of the away_team, must have been in the data the model was fitted on

        max_shots : int
            The maximum number of shots on target to calculate the probabilities over.
            Reducing this will improve performance slightly at the expensive of accuracy

        Returns
        -------
        FootballProbabilityGrid
            A class providing access to a range of probabilities,
            such as over/unders for shots on target
        """
        # Check if the model has been fitted
        if not self.fitted_sot:
            raise ValueError(
                (
                    "Shots on target model's parameters have not been fit yet, "
                    "please call the `fit_shots_on_target()` function before making any predictions"
                )
            )

        if isinstance(home_team, str) and isinstance(away_team, str):
            return self._predict_shots_on_target(home_team, away_team, max_shots)

        elif isinstance(home_team, collections.abc.Sequence) and isinstance(
            away_team, collections.abc.Sequence
        ):
            results = [
                self._predict_shots_on_target(x[0], x[1], max_shots)
                for x in zip(home_team, away_team)
            ]
            return results

        else:
            raise ValueError("Team data types not recognised")

    def _predict_shots_on_target(self, home_team, away_team, max_shots=15):
        """
        Predicts the probabilities of the different possible shots on target outcomes

        Parameters
        ----------
        home_team : str
            The name of the home_team, must have been in the data the model was fitted on

        away_team : str
            The name of the away_team, must have been in the data the model was fitted on

        max_shots : int
            The maximum number of shots on target to calculate the probabilities over.
            Reducing this will improve performance slightly at the expensive of accuracy

        Returns
        -------
        FootballProbabilityGrid
            A class providing access to a range of probabilities,
            such as over/unders for shots on target
        """

        # Check we have parameters for teams
        if home_team not in self.teams:
            raise ValueError(
                (
                    "No parameters for home team - "
                    "please ensure the team was included in the training data"
                )
            )

        if away_team not in self.teams:
            raise ValueError(
                (
                    "No parameters for away team - "
                    "please ensure the team was included in the training data"
                )
            )

        # Get the relevant model parameters
        home_idx = np.where(self.teams == home_team)[0][0]
        away_idx = np.where(self.teams == away_team)[0][0]

        home_attack = self._params_sot[home_idx]
        away_attack = self._params_sot[away_idx]

        home_defence = self._params_sot[home_idx + self.n_teams]
        away_defence = self._params_sot[away_idx + self.n_teams]

        home_advantage = self._params_sot[-2]
        rho = self._params_sot[-1]

        # Calculate the shots on target expectation
        home_sot = np.exp(home_advantage + home_attack + away_defence)
        away_sot = np.exp(away_attack + home_defence)
        home_sot_vector = poisson(home_sot).pmf(np.arange(0, max_shots))
        away_sot_vector = poisson(away_sot).pmf(np.arange(0, max_shots))

        # Get the probabilities for each possible shots on target score
        m = np.outer(home_sot_vector, away_sot_vector)

        # Apply Dixon and Coles adjustment
        m[0, 0] *= 1 - home_sot * away_sot * rho
        m[0, 1] *= 1 + home_sot * rho
        m[1, 0] *= 1 + away_sot * rho
        m[1, 1] *= 1 - rho

        # Return the FootballProbabilityGrid
        probability_grid = FootballProbabilityGrid(m, home_sot, away_sot)

        return probability_grid

    def get_shots_on_target_params(self):
        """
        Provides access to the model's fitted parameters for shots on target

        Returns
        -------
        dict
            A dict containing the model's parameters for shots on target
        """
        if not self.fitted_sot:
            raise ValueError(
                "Shots on target model's parameters have not been fit yet, please call the `fit_shots_on_target()` function first"
            )

        params = dict(
            zip(
                ["attack_" + team for team in self.teams]
                + ["defence_" + team for team in self.teams]
                + ["home_advantage", "rho"],
                self._params_sot,
            )
        )
        return params


In [None]:
def calculate_mean_rps(dc, df):
    rps_values = list()
    for idx, row in df.iterrows():
        # Get the predicted probabilities for home, draw, and away outcomes
        predictions = dc.predict(row["HomeTeam"], row["AwayTeam"]).home_draw_away

        # Append the RPS for the current prediction and outcome
        rps_values.append(calculate_rps(predictions, outcome))

    return np.mean(rps_values)

In [None]:
# df1 = pd.read_csv('data/prem/23-24.csv')
# df2 = pd.read_csv('data/prem/24-25.csv')

# df3 = pd.read_csv('data/france/23-24.csv')
# df4 = pd.read_csv('data/france/24-25.csv')

# df5 = pd.read_csv('data/laliga/23-24.csv')
# df6 = pd.read_csv('data/laliga/24-25.csv')

# df7 = pd.read_csv('data/portugal/23-24.csv')
# df8 = pd.read_csv('data/portugal/24-25.csv')

# df9 = pd.read_csv('data/seriea/23-24.csv')
# df10 = pd.read_csv('data/seriea/24-25.csv')

# df11 = pd.read_csv('data/bundesliga/23-24.csv')
# df12 = pd.read_csv('data/bundesliga/24-25.csv')

# df13 = pd.read_csv('data/netherlands/23-24.csv')
# df14 = pd.read_csv('data/netherlands/24-25.csv')

# df15 = pd.read_csv('data/scotland/23-24.csv')
# df16 = pd.read_csv('data/scotland/24-25.csv')

# df17 = pd.read_csv('data/belgium/23-24.csv')
# df18 = pd.read_csv('data/belgium/24-25.csv')
# df18.dropna(inplace=True)

# df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18], ignore_index=True)

df = pd.read_csv('europa_data_final.csv')

In [None]:
# # Sort the data by date in ascending order
# df["Date"] = pd.to_datetime(df["Date"], format='%d/%m/%Y', errors='coerce')
# df = df.sort_values(by="Date")

# rps_values = list()


In [None]:
from sklearn.model_selection import train_test_split
# Ensure the 'date' column is in datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d') 

train = df.copy()  # Excludes the last 466 entries
test = df.iloc[-600:].copy() 

# Calculate days since the latest date in the training set
train["days_since"] = (train["date"].max() - train["date"]).dt.days

# Check for missing teams in test compared to train
train_teams = set(train['homeTeam']).union(set(train['awayTeam']))
test_teams = set(test['homeTeam']).union(set(test['awayTeam']))
missing_teams = test_teams - train_teams

if missing_teams:
    print(f"Warning: The following teams are missing in the training data: {missing_teams}")

# Calculate weight based on dc_decay
train["weight"] = dc_decay(0.02, train["days_since"])

# Fit the model using the training data
dc = DixonColesGoalModel(
    train["homeScore"],
    train["awayScore"],
    train["homeTeam"],
    train["awayTeam"],
    train["weight"],
)
dc.fit()



In [None]:
# Drop rows with null values in either 'hometotalShotsOnGoal' or 'awaytotalShotsOnGoal'
df = df.dropna(subset=['hometotalShotsOnGoal', 'awaytotalShotsOnGoal'])




In [None]:
len(df.iloc[:-500])

In [None]:
train = df.iloc[:-600].copy()  # Excludes the last 466 entries
test = df.iloc[-600:].copy()   # Includes only the last 466 entries
# Calculate days since the latest date in the training set
train["days_since"] = (train["date"].max() - train["date"]).dt.days

# Check for missing teams in test compared to train
train_teams = set(train['homeTeam']).union(set(train['awayTeam']))
test_teams = set(test['homeTeam']).union(set(test['awayTeam']))
missing_teams = test_teams - train_teams

if missing_teams:
    print(f"Warning: The following teams are missing in the training data: {missing_teams}")

# Calculate weight based on dc_decay
train["weight"] = dc_decay(0.0001, train["days_since"])

# Fit the model using the training data
dc_sot = DixonColesGoalModel_sot(
    goals_home=train["homeScore"],
    goals_away=train["awayScore"],
    teams_home=train["homeTeam"],
    teams_away=train["awayTeam"],
    shots_on_target_home=train["hometotalShotsOnGoal"],
    shots_on_target_away=train["awaytotalShotsOnGoal"],
    weights=train["weight"],
)
dc_sot.fit_shots_on_target()


In [None]:
sot_params = dc_sot.get_shots_on_target_params()
sot_params

In [None]:
params = dc.get_params()
dc.get_params

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson

def predict_ST(params, home_team, away_team):
    """
    Predicts the number of shots for home and away teams using a Poisson model,
    calculates probabilities for over/under shot thresholds, and determines
    which team is more likely to have more shots.

    Parameters:
    - params (dict): Model parameters including attack and defence strengths,
                     home advantage, and rho for correlation.
    - home_team (str): Name of the home team.
    - away_team (str): Name of the away team.

    Returns:
    - home_shots_prob (float): Probability of home team winning in shots.
    - draw_shots_prob (float): Probability of a draw in shots.
    - away_shots_prob (float): Probability of away team winning in shots.
    - over_under_shots_team (dict): Over/Under probabilities for specified thresholds per team.
    - over_under_shots_total (dict): Over/Under probabilities for specified thresholds total shots.
    """
    # Extract parameters
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_advantage"]
    rho = params["rho"]  # Correlation parameter for shots

    # Calculate expected shots using exponential to ensure positivity
    home_shot_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_shot_expectation = np.exp(away_attack + home_defence)

    # Poisson probabilities for shots from 0 to 25 (adjust as needed)
    max_shots = 20
    home_probs = poisson.pmf(range(max_shots + 1), home_shot_expectation)
    away_probs = poisson.pmf(range(max_shots + 1), away_shot_expectation)

    # Shot probability matrix (Home Shots x Away Shots)
    m = np.outer(home_probs, away_probs)

    # Adjust for correlation (rho)
    # This simplistic adjustment modifies the (0,0), (0,1), (1,0), and (1,1) cells.
    # For a more accurate correlation adjustment, consider more sophisticated methods.
    if rho != 0:
        m[0, 0] *= 1 - home_shot_expectation * away_shot_expectation * rho
        m[0, 1] *= 1 + home_shot_expectation * rho
        m[1, 0] *= 1 + away_shot_expectation * rho
        m[1, 1] *= 1 - rho

    # Ensure all probabilities are non-negative and normalize
    m = np.maximum(m, 0)
    m /= m.sum()

    # Calculate Dominant Shot Probabilities
    home_shots_prob = np.sum(np.tril(m, -1))  # Home shots > Away shots
    draw_shots_prob = np.sum(np.diag(m))      # Home shots == Away shots
    away_shots_prob = np.sum(np.triu(m, 1))   # Away shots > Home shots

    # Print Cheat Sheet
    print(f'{home_team} VS {away_team} - Shots Prediction Cheat Sheet')
    print('============================================================')
    print(f'Probability {home_team} has more shots: {home_shots_prob:.4f}')
    print(f'Probability of a draw in shots: {draw_shots_prob:.4f}')
    print(f'Probability {away_team} has more shots: {away_shots_prob:.4f}')

    # Calculate Over/Under Shot Probabilities for Each Team
    def over_under_team_prob(probs, threshold):
        # Under probability: P(shots <= threshold)
        under_prob = np.sum(probs[:int(threshold)+1])
        over_prob = 1 - under_prob
        return under_prob, over_prob

    # Individual team shot thresholds
    team_shot_thresholds = [3, 4, 5, 7, 9]  # Adjust thresholds as needed
    over_under_shots_team = {home_team: {}, away_team: {}}

    print("\nOver/Under Shots Per Team:")
    for threshold in team_shot_thresholds:
        # Home team probabilities
        under_prob_home, over_prob_home = over_under_team_prob(home_probs, threshold)
        over_under_shots_team[home_team][threshold] = {
            'Under': under_prob_home,
            'Over': over_prob_home
        }

        # Away team probabilities
        under_prob_away, over_prob_away = over_under_team_prob(away_probs, threshold)
        over_under_shots_team[away_team][threshold] = {
            'Under': under_prob_away,
            'Over': over_prob_away
        }

        print(f"\nThreshold {threshold} Shots:")
        print(f"{home_team} - Under {threshold} shots: {under_prob_home:.4f}, Over {threshold} shots: {over_prob_home:.4f}")
        print(f"{away_team} - Under {threshold} shots: {under_prob_away:.4f}, Over {threshold} shots: {over_prob_away:.4f}")

    # Calculate Over/Under Shot Probabilities for Total Shots
    def over_under_total_prob(m, threshold):
        total_probs = np.zeros((max_shots * 2) + 1)
        for i in range(max_shots + 1):
            for j in range(max_shots + 1):
                total_probs[i + j] += m[i, j]
        # Under probability: P(total shots <= threshold)
        under_prob = np.sum(total_probs[:int(threshold)+1])
        over_prob = 1 - under_prob
        return under_prob, over_prob

    # Total shots thresholds
    total_shot_thresholds = [5, 7, 9, 11]  # Adjust thresholds as needed
    over_under_shots_total = {}
    print("\nOver/Under Total Shots:")
    for threshold in total_shot_thresholds:
        under_prob_total, over_prob_total = over_under_total_prob(m, threshold)
        over_under_shots_total[threshold] = {
            'Under': under_prob_total,
            'Over': over_prob_total
        }
        print(f"Under {threshold} total shots: {under_prob_total:.4f}, Over {threshold} total shots: {over_prob_total:.4f}")

    # Calculate Betting Odds
    def calculate_odds(prob):
        epsilon = 1e-8  # To prevent division by zero
        return 1 / (prob + epsilon)

    # Match outcome odds based on shots
    home_shots_odds = calculate_odds(home_shots_prob)
    draw_shots_odds = calculate_odds(draw_shots_prob)
    away_shots_odds = calculate_odds(away_shots_prob)

    print("\nShot Outcome Odds (Decimal):")
    print(f"{home_team} More Shots: {home_shots_odds:.2f}")
    print(f"Draw in Shots: {draw_shots_odds:.2f}")
    print(f"{away_team} More Shots: {away_shots_odds:.2f}")

    # Over/Under odds for shots per team
    print("\nOver/Under Odds Per Team:")
    for threshold in team_shot_thresholds:
        # Home team odds
        under_odds_home = calculate_odds(over_under_shots_team[home_team][threshold]['Under'])
        over_odds_home = calculate_odds(over_under_shots_team[home_team][threshold]['Over'])

        # Away team odds
        under_odds_away = calculate_odds(over_under_shots_team[away_team][threshold]['Under'])
        over_odds_away = calculate_odds(over_under_shots_team[away_team][threshold]['Over'])

        print(f"\nThreshold {threshold} Shots:")
        print(f"{home_team} - Under {threshold} shots: {under_odds_home:.2f}, Over {threshold} shots: {over_odds_home:.2f}")
        print(f"{away_team} - Under {threshold} shots: {under_odds_away:.2f}, Over {threshold} shots: {over_odds_away:.2f}")

    # Over/Under odds for total shots
    print("\nOver/Under Odds for Total Shots:")
    for threshold in total_shot_thresholds:
        under_odds_total = calculate_odds(over_under_shots_total[threshold]['Under'])
        over_odds_total = calculate_odds(over_under_shots_total[threshold]['Over'])

        print(f"Under {threshold} total shots: {under_odds_total:.2f}, Over {threshold} total shots: {over_odds_total:.2f}")

    # Optional: Plotting the Shot Probability Matrix
    # Uncomment the following lines if you wish to visualize the shot probability matrix
    plt.figure(figsize=(10, 8))
    plt.imshow(m, cmap='Greens', interpolation='nearest')
    plt.colorbar(label='Probability')
    plt.xlabel(f"{away_team} Shots")
    plt.ylabel(f"{home_team} Shots")
    plt.title(f"Shot Probability Matrix: {home_team} vs {away_team}")

    # Display the values in each cell (optional for large matrices, may clutter)
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            if m[i, j] > 0.001:  # Only display significant probabilities
                plt.text(j, i, f"{m[i, j]:.3f}", ha='center', va='center', color="black", fontsize=6)

    plt.show()


    return home_shots_prob, draw_shots_prob, away_shots_prob, over_under_shots_team, over_under_shots_total


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson

def predict(params, home_team, away_team):
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_advantage"]
    rho = params["rho"]

    # Calculate expected goals using exponential to ensure positivity
    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    # Poisson probabilities for goals from 0 to 5
    max_goals = 5
    goal_range = np.arange(0, max_goals + 1)
    home_probs = poisson.pmf(goal_range, home_goal_expectation)
    away_probs = poisson.pmf(goal_range, away_goal_expectation)

    # Goal probability matrix (Home Goals x Away Goals)
    m = np.outer(home_probs, away_probs)
    
    # Adjust for correlation (if rho is used in your model)
    if rho != 0:
        m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
        m[0, 1] *= 1 + home_goal_expectation * rho
        m[1, 0] *= 1 + away_goal_expectation * rho
        m[1, 1] *= 1 - rho    
    
    # Ensure the matrix sums to 1 after adjustments
    m = m / m.sum()

    # Calculate both teams to score (BTTS) probability
    p_home_zero = home_probs[0]       # P(Home = 0)
    p_away_zero = away_probs[0]       # P(Away = 0)
    p_both_score = 1 - (p_home_zero + p_away_zero - m[0, 0])

    # print(f'{home_team} VS {away_team} Cheat Sheet')
    # print('============================================================')
    # print(f'Both Teams To Score (BTTS) Probability: {p_both_score:.4f}')
    # print(f'BTTS Odds: {1/p_both_score:.4f}')

    # Calculate match outcome probabilities
    home_win_prob = np.sum(np.tril(m, -1))
    draw_prob = np.sum(np.diag(m))
    away_win_prob = np.sum(np.triu(m, 1))

    # Calculate probabilities for 'Home Win or Draw' and 'Away Win or Draw'
    home_or_draw_prob = home_win_prob + draw_prob
    away_or_draw_prob = away_win_prob + draw_prob

    # Function to calculate odds
    def calculate_odds(prob):
        epsilon = 1e-8  # To prevent division by zero
        return 1 / (prob + epsilon)

    # Calculate odds for match outcomes
    home_win_odds = calculate_odds(home_win_prob)
    draw_odds = calculate_odds(draw_prob)
    away_win_odds = calculate_odds(away_win_prob)

    # Calculate odds for 'Home Win or Draw' and 'Away Win or Draw'
    home_or_draw_odds = calculate_odds(home_or_draw_prob)
    away_or_draw_odds = calculate_odds(away_or_draw_prob)

    # Display probabilities for match outcomes
    # print("\nMatch Outcome Probabilities:")
    # print(f"Home Win Probability: {home_win_prob:.4f}")
    # print(f"Draw Probability: {draw_prob:.4f}")
    # print(f"Away Win Probability: {away_win_prob:.4f}")
    # print(f"Home Win or Draw Probability: {home_or_draw_prob:.4f}")
    # print(f"Away Win or Draw Probability: {away_or_draw_prob:.4f}")

    # # Display odds for match outcomes
    # print("\nMatch Outcome Odds (Decimal):")
    # print(f"Home Win Odds: {home_win_odds:.2f}")
    # print(f"Draw Odds: {draw_odds:.2f}")
    # print(f"Away Win Odds: {away_win_odds:.2f}")
    # print(f"Home Win or Draw Odds: {home_or_draw_odds:.2f}")
    # print(f"Away Win or Draw Odds: {away_or_draw_odds:.2f}")

    # Calculate total goals probabilities
    max_total_goals = 2 * max_goals  # Maximum possible total goals
    total_goals_probs = np.zeros(max_total_goals + 1)
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            total_goals = i + j
            total_goals_probs[total_goals] += m[i, j]

    # Calculate over/under probabilities for total goals
    def over_under_total_prob(threshold):
        under_prob = np.sum(total_goals_probs[:int(np.floor(threshold)) + 1])
        over_prob = 1 - under_prob
        return under_prob, over_prob

    total_thresholds = [1.5, 2.5, 3.5]
    over_under_total_results = {}
    for threshold in total_thresholds:
        under_prob, over_prob = over_under_total_prob(threshold)
        over_under_total_results[threshold] = {
            'Under': under_prob,
            'Over': over_prob
        }
        #print(f"Under {threshold} total goals: {under_prob:.4f}, Over {threshold} total goals: {over_prob:.4f}")

    # Over/Under odds for total goals
    #print("\nOver/Under Total Goals Odds (Decimal):")
    for threshold in total_thresholds:
        under_odds = calculate_odds(over_under_total_results[threshold]['Under'])
        over_odds = calculate_odds(over_under_total_results[threshold]['Over'])
        #print(f"Under {threshold} total goals: {under_odds:.2f}, Over {threshold} total goals: {over_odds:.2f}")

    # Calculate over/under probabilities for individual team goals
    def over_under_team_prob(probs, threshold):
        # Under probability: P(goals <= floor(threshold))
        under_prob = np.sum(probs[:int(np.floor(threshold)) + 1])
        over_prob = 1 - under_prob
        return under_prob, over_prob

    team_thresholds = [0.5, 1.5, 2.5, 3.5]
    over_under_team_results = {home_team: {}, away_team: {}}

    for threshold in team_thresholds:
        # Home team over/under
        under_prob_home, over_prob_home = over_under_team_prob(home_probs, threshold)
        over_under_team_results[home_team][threshold] = {
            'Under': under_prob_home,
            'Over': over_prob_home
        }

        # Away team over/under
        under_prob_away, over_prob_away = over_under_team_prob(away_probs, threshold)
        over_under_team_results[away_team][threshold] = {
            'Under': under_prob_away,
            'Over': over_prob_away
        }

    # Over/Under odds for individual team goals
    #print("\nOver/Under Goals Odds per Team (Decimal):")
    for team in [home_team, away_team]:
        #print(f"\n{team}:")
        for threshold in team_thresholds:
            under_prob = over_under_team_results[team][threshold]['Under']
            over_prob = over_under_team_results[team][threshold]['Over']
            under_odds = calculate_odds(under_prob)
            over_odds = calculate_odds(over_prob)
            #print(f"  Under {threshold} goals: {under_odds:.2f}, Over {threshold} goals: {over_odds:.2f}")

    # Calculate probabilities for goal ranges
    goal_ranges = {
        '0-2 Goals': (0, 2),
        '1-3 Goals': (1, 3),
        '1-4 Goals': (1, 4),
        '2-3 Goals': (2, 3),
        '2-4 Goals': (2, 4),
        '4+ Goals': (4, max_total_goals)
    }

    goal_range_probs = {}
    for range_name, (start, end) in goal_ranges.items():
        # For '4+ Goals', include all goals from 4 to max_total_goals
        if end == max_total_goals:
            prob = np.sum(total_goals_probs[start:])
        else:
            prob = np.sum(total_goals_probs[start:end + 1])
        goal_range_probs[range_name] = prob

    # Display goal range probabilities and odds
    #print("\nGoal Range Probabilities and Odds:")
    for range_name, prob in goal_range_probs.items():
        odds = calculate_odds(prob)
        #print(f"{range_name}: Probability = {prob:.4f}, Odds = {odds:.2f}")

    # Return the calculated probabilities and odds as needed
    return {
        'home_win_prob': home_win_prob,
        'draw_prob': draw_prob,
        'away_win_prob': away_win_prob,
        'home_or_draw_prob': home_or_draw_prob,
        'away_or_draw_prob': away_or_draw_prob,
        'home_win_odds': home_win_odds,
        'draw_odds': draw_odds,
        'away_win_odds': away_win_odds,
        'home_or_draw_odds': home_or_draw_odds,
        'away_or_draw_odds': away_or_draw_odds,
        'over_under_total_results': over_under_total_results,
        'over_under_team_results': over_under_team_results,
        'goal_range_probs': goal_range_probs,
        'matrix':m,
    }


In [511]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from collections import defaultdict

# Assume DixonColesGoalModel and predict function are already defined and accept weights

def dc_decay(x, t):
    return np.exp(-x * t)

# Read your dataframe
df = pd.read_csv('europa_data_final.csv')  # Replace with your actual data source

# Convert 'date' column to datetime using the correct format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Sort the dataframe by date
df = df.sort_values('date')

# Group the dataframe by week and year
df['year_week'] = df['date'].dt.isocalendar().year.astype(str) + '-' + df['date'].dt.isocalendar().week.astype(str).str.zfill(2)

# Create directories to store matrices and next_week_games if they don't exist
os.makedirs('matrices', exist_ok=True)
os.makedirs('next_week_games', exist_ok=True)  # New directory for next_week_games CSV files

# Get the list of year_week periods
weeks = sorted(df['year_week'].unique())

# Iterate over the weeks
for i in range(len(weeks) - 1):
    current_week = weeks[i]
    next_week = weeks[i + 1]
    
    print(f"\nProcessing Week: {current_week}")
    
    # Get data up to the current week (inclusive)
    cumulative_data = df[df['year_week'] <= current_week].copy()
    
    # Calculate 'days_since' and 'weight'
    max_date = cumulative_data['date'].max()
    cumulative_data['days_since'] = (max_date - cumulative_data['date']).dt.days
    cumulative_data['weight'] = dc_decay(0.02, cumulative_data['days_since'])
    
    # Prepare data for the model
    goals_home = cumulative_data['homeScore']
    goals_away = cumulative_data['awayScore']
    teams_home = cumulative_data['homeTeam']
    teams_away = cumulative_data['awayTeam']
    weights = cumulative_data['weight']
    
    # Initialize and fit the model with weights
    model = DixonColesGoalModel(goals_home, goals_away, teams_home, teams_away, weights=weights)
    model.fit()
    
    # Get the parameters
    params = model.get_params()
    
    # Get games for the next week
    next_week_games = df[df['year_week'] == next_week].copy()
    
    # Check if there are games in the next week
    if next_week_games.empty:
        print(f"No games found for week: {next_week}")
        continue
    
    # Add attack and defense parameters to the teams in the next week's games
    def get_attack_param(team):
        return params.get(f"attack_{team}", np.nan)
    
    def get_defense_param(team):
        return params.get(f"defence_{team}", np.nan)
    
    next_week_games['home_attack_param'] = next_week_games['homeTeam'].apply(get_attack_param)
    next_week_games['home_defense_param'] = next_week_games['homeTeam'].apply(get_defense_param)
    next_week_games['away_attack_param'] = next_week_games['awayTeam'].apply(get_attack_param)
    next_week_games['away_defense_param'] = next_week_games['awayTeam'].apply(get_defense_param)
    
    # Identify games with uninitialized teams
    skipped_games = next_week_games[next_week_games.isna().any(axis=1)]
    for idx, game in skipped_games.iterrows():
        print(f"Skipping Event ID {game['eventId']} due to uninitialized team(s)")
    
    # Drop games where any attack or defense parameter is NaN
    valid_games = next_week_games.dropna(subset=[
        'home_attack_param', 'home_defense_param',
        'away_attack_param', 'away_defense_param'
    ])
    
    # Iterate over the valid games
    for idx, game in valid_games.iterrows():
        home_team = game['homeTeam']
        away_team = game['awayTeam']
        event_id = game['eventId']
        
        # Use the predict function
        prediction = predict(params, home_team, away_team)
        
        # Store the matrix in a file named after the eventId
        matrix = prediction['matrix']
        matrix_file = f"matrices/{event_id}.csv"
        np.savetxt(matrix_file, matrix, delimiter=',')
        
        print(f"Stored prediction matrix for Event ID {event_id}")
    
    # Save the updated next_week_games with parameters in the 'next_week_games' directory
    next_week_games_file = f"next_week_games/next_week_games_{next_week}.csv"
    next_week_games.to_csv(next_week_games_file, index=False)



Processing Week: 2023-08
Skipping Event ID 10952495 due to uninitialized team(s)
Skipping Event ID 10952498 due to uninitialized team(s)
Skipping Event ID 10952494 due to uninitialized team(s)
Skipping Event ID 10952486 due to uninitialized team(s)
Skipping Event ID 10952481 due to uninitialized team(s)
Skipping Event ID 10952480 due to uninitialized team(s)
Skipping Event ID 10952483 due to uninitialized team(s)
Skipping Event ID 10952485 due to uninitialized team(s)
Skipping Event ID 10952488 due to uninitialized team(s)
Skipping Event ID 10952484 due to uninitialized team(s)
Skipping Event ID 10952492 due to uninitialized team(s)
Skipping Event ID 10952489 due to uninitialized team(s)
Skipping Event ID 10952490 due to uninitialized team(s)
Skipping Event ID 10952493 due to uninitialized team(s)
Skipping Event ID 10952491 due to uninitialized team(s)
Skipping Event ID 10952487 due to uninitialized team(s)
Stored prediction matrix for Event ID 10952481
Stored prediction matrix for Ev

In [None]:
params

In [None]:
predict(params, "Chelsea", "Arsenal")
# predict_ST(sot_params, "Bayern Munich", "Benfica")
#predict(params, "PSV Eindhoven", "Girona")
# predict(params, "Lille", "Juventus")
# predict(params, "Lille", "Juventus")
# # predict(params, "Lille", "Juventus")
# predict_(params, "Celtic", "RB Leipzig")
# dc_sot.predict_shots_on_target("Celtic", "RB Leipzig")

In [None]:
def calculate_o25_probability_independent(row, model_params):
    home_team = row['homeTeam']
    away_team = row['awayTeam']
    
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_advantage"]
    rho = params["rho"]

    # Calculate expected goals using exponential to ensure positivity
    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    # Poisson probabilities for goals from 0 to 5
    max_goals = 5
    goal_range = np.arange(0, max_goals + 1)
    home_probs = poisson.pmf(goal_range, home_goal_expectation)
    away_probs = poisson.pmf(goal_range, away_goal_expectation)

    # Goal probability matrix (Home Goals x Away Goals)
    m = np.outer(home_probs, away_probs)
    
    # Adjust for correlation (if rho is used in your model)
    if rho != 0:
        m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
        m[0, 1] *= 1 + home_goal_expectation * rho
        m[1, 0] *= 1 + away_goal_expectation * rho
        m[1, 1] *= 1 - rho    
        
    max_total_goals = 2 * max_goals  # Maximum possible total goals
    total_goals_probs = np.zeros(max_total_goals + 1)
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            total_goals = i + j
            total_goals_probs[total_goals] += m[i, j]
            
    under_prob = np.sum(total_goals_probs[:3]) 
    over_prob = 1 - under_prob 
    return over_prob


In [None]:
test['O25_Prob'] = test.apply(lambda row: calculate_o25_probability_independent(row, params), axis=1)
test['O25_Prob']

In [None]:
o25_threshold = 0.6
# Independent Poisson Backtesting
high_o25_matches = test[test['O25_Prob'] > o25_threshold]
high_o25_matches['Actual_O25'] = (high_o25_matches['homeScore'] + high_o25_matches['awayScore']) > 2

In [None]:
num_predictions = len(high_o25_matches)
num_correct = high_o25_matches['Actual_O25'].sum()
success_rate = num_correct / num_predictions if num_predictions > 0 else 0

print(f"\nTotal Predictions (O2.5 Prob > {o25_threshold}): {num_predictions}")
print(f"Correct Predictions (Over 2.5 Goals): {num_correct}")
print(f"Success Rate: {success_rate:.2%}")
high_o25_matches[['O25_Prob','Actual_O25','homeTeam','homeScore','awayTeam','awayScore']] 

In [None]:
def calculate_u25_probability_independent(row, model_params):
    home_team = row['homeTeam']
    away_team = row['awayTeam']
    
    # Extract model parameters
    attack_home = model_params["attack_" + home_team]
    defence_home = model_params["defence_" + home_team]
    attack_away = model_params["attack_" + away_team]
    defence_away = model_params["defence_" + away_team]
    home_advantage = model_params["home_advantage"]
    
    # Calculate expected goals
    lambda_home = np.exp(attack_home + defence_away + home_advantage)
    lambda_away = np.exp(attack_away + defence_home)
    
    # Sum Poisson parameters
    lambda_total = lambda_home + lambda_away
    
    # Probability of total goals <=2
    p_total_leq_2 = poisson.cdf(2, lambda_total)
    
    # Return probability of under 2.5 goals
    return p_total_leq_2


In [None]:
test['U25_Prob'] = test.apply(lambda row: calculate_u25_probability_independent(row, params), axis=1)
test['U25_Prob']

In [None]:
o25_threshold = 0.65
# Independent Poisson Backtesting
high_o25_matches = test[test['U25_Prob'] > o25_threshold]
high_o25_matches['Actual_U25'] = (high_o25_matches['homeScore'] + high_o25_matches['awayScore']) <= 2

In [None]:
num_predictions = len(high_o25_matches)
num_correct = high_o25_matches['Actual_U25'].sum()
success_rate = num_correct / num_predictions if num_predictions > 0 else 0

print(f"\nTotal Predictions (U2.5 Prob > {o25_threshold}): {num_predictions}")
print(f"Correct Predictions (Under 2.5 Goals): {num_correct}")
print(f"Success Rate: {success_rate:.2%}")
high_o25_matches[['U25_Prob','Actual_U25','homeScore','awayScore']]


In [None]:

def evaluate_model(matches_df, params, confidence_threshold=0.5):
    correct_predictions = 0
    total_matches = 0  # Count of matches considered with confidence above the threshold
    threshold_hits = 0  # Count of times the confidence threshold was met
    
    # For Log Loss
    log_loss = 0
    epsilon = 1e-15  # To prevent log(0)
    
    for index, row in matches_df.iterrows():
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        actual_home_shots = row['homeScore']
        actual_away_shots = row['awayScore']
        
        # Get predicted probabilities
        home_prob, draw_prob, away_prob, _, _, _ ,_= predict(params, home_team, away_team)
        
        # Get the maximum probability for the predicted outcome
        max_prob = max(home_prob, draw_prob, away_prob)
        
        
        # Only consider predictions with confidence above the threshold
        if max_prob >= confidence_threshold:
            threshold_hits += 1  # Increment the threshold hit count
            total_matches += 1
            
            # Determine predicted outcome based on highest probability
            if home_prob == max_prob:
                predicted_outcome = 'Home'
            elif away_prob == max_prob:
                predicted_outcome = 'Away'
            else:
                predicted_outcome = 'Draw'
            
            # Determine actual outcome
            if actual_home_shots > actual_away_shots:
                actual_outcome = 'Home'
            elif actual_away_shots > actual_home_shots:
                actual_outcome = 'Away'
            else:
                actual_outcome = 'Draw'
            
            # Check if prediction is correct
            if predicted_outcome == actual_outcome:
                correct_predictions += 1
            
            # Update Log Loss based on the actual outcome
            if actual_outcome == 'Home':
                log_loss += -np.log(home_prob + epsilon)
            elif actual_outcome == 'Draw':
                log_loss += -np.log(draw_prob + epsilon)
            else:
                log_loss += -np.log(away_prob + epsilon)
    
    # Calculate Metrics
    if total_matches > 0:
        accuracy = correct_predictions / total_matches
        average_log_loss = log_loss / total_matches
        print(f"Accuracy (for confidence >= {confidence_threshold}): {accuracy:.2%}")
        print(f"Average Log Loss (for confidence >= {confidence_threshold}): {average_log_loss:.4f}")
    else:
        print(f"No predictions met the confidence threshold of {confidence_threshold}")
        accuracy = None
        average_log_loss = None
    
    print(f"Threshold hits (confidence >= {confidence_threshold}): {threshold_hits}")
    
    return accuracy, average_log_loss, threshold_hits

# Example usage:
# matches_df = pd.read_csv('historical_matches.csv')  # Ensure your CSV has the required columns
# evaluate_model(matches_df, params, confidence_threshold=0.7)


In [None]:
test = df.iloc[-466:]


In [None]:
evaluate_model(test,params,0.70)

In [None]:
def evaluate_over_under(matches_df, params, thresholds):
    # Initialize a dictionary to store results for each threshold
    results = {threshold: {'correct_predictions': 0, 'total_predictions': 0} for threshold in thresholds}
    
    for index, row in matches_df.iterrows():
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']
        actual_total_goals = row['FTHG'] + row['FTAG']
        
        # Get predicted probabilities
        _, _, _, over_under_total_results, _, _ = predict(params, home_team, away_team)
        
        # Get the probability for Over 2.5 Goals
        over_2_5_prob = over_under_total_results[2.5]['Over']
        
        # Iterate over each confidence threshold
        for threshold in thresholds:
            if over_2_5_prob >= threshold:
                results[threshold]['total_predictions'] += 1
                predicted_outcome = 'Over'
                
                # Determine actual outcome
                actual_outcome = 'Over' if actual_total_goals > 2.5 else 'Under'
                
                # Check if prediction is correct
                if predicted_outcome == actual_outcome:
                    results[threshold]['correct_predictions'] += 1
    
    # Calculate and print results for each threshold
    for threshold in thresholds:
        total = results[threshold]['total_predictions']
        correct = results[threshold]['correct_predictions']
        if total > 0:
            accuracy = correct / total
            print(f"\nConfidence Threshold >= {threshold}:")
            print(f"Total Predictions Made: {total}")
            print(f"Correct Predictions: {correct}")
            print(f"Accuracy: {accuracy:.2%}")
        else:
            print(f"\nConfidence Threshold >= {threshold}: No predictions made.")
    
    return results

# Example usage:
# Define your confidence thresholds
confidence_thresholds = [0.5,0.6,0.7]

# Assuming you have a DataFrame 'matches_df' with the necessary columns
# matches_df = pd.read_csv('historical_matches.csv')  # Ensure your CSV has 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG'

# Call the evaluation function
results = evaluate_over_under(test, params, confidence_thresholds)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson

def predict(params, home_team, away_team):
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_advantage"]
    rho = params["rho"]

    # Calculate expected goals using exponential to ensure positivity
    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    # Poisson probabilities for goals from 0 to 9
    max_goals = 9
    goal_range = np.arange(0, max_goals + 1)
    home_probs = poisson.pmf(goal_range, home_goal_expectation)
    away_probs = poisson.pmf(goal_range, away_goal_expectation)

    # Goal probability matrix (Home Goals x Away Goals)
    m = np.outer(home_probs, away_probs)
    
    # Adjust for correlation (if rho is used in your model)
    if rho != 0:
        m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
        m[0, 1] *= 1 + home_goal_expectation * rho
        m[1, 0] *= 1 + away_goal_expectation * rho
        m[1, 1] *= 1 - rho    
    
    # Ensure the matrix sums to 1 after adjustments
    m = m / m.sum()
    # print(home_probs)
    # # Calculate both teams to score (BTTS) probability
    p_home_zero = home_probs[0]       # P(Home = 0)
    p_away_zero = away_probs[0]       # P(Away = 0)
    p_both_score = 1 - (p_home_zero + p_away_zero - m[0, 0])
    # print(f'{home_team} VS {away_team} Cheat Sheet')
    # print('============================================================')
    # print(f'Both Teams To Score (BTTS) Probability: {p_both_score:.4f}')
    # print(f'BTTS Odds: {1/p_both_score:.4f}')

    # # Optional: Plotting the goal probability matrix
    
    # plt.figure(figsize=(8, 6))
    # plt.imshow(m, cmap='Blues', interpolation='nearest')
    # plt.colorbar(label='Probability')
    # plt.xlabel("Away Goals")
    # plt.ylabel("Home Goals")
    # plt.title("Goal Probability Matrix (m)")

    # Display the values in each cell
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            plt.text(j, i, f"{m[i, j]:.2f}", ha='center', va='center', color="black")

    # plt.show()
    

    # Calculate match outcome probabilities
    home_win_prob = np.sum(np.tril(m, -1))
    draw_prob = np.sum(np.diag(m))
    away_win_prob = np.sum(np.triu(m, 1))

    # Display probabilities for match outcomes
    # print("\nMatch Outcome Probabilities:")
    # print(f"Home Win Probability: {home_win_prob:.4f}")
    # print(f"Draw Probability: {draw_prob:.4f}")
    # print(f"Away Win Probability: {away_win_prob:.4f}")

    # Calculate total goals probabilities
    max_total_goals = 2 * max_goals  # Maximum possible total goals
    total_goals_probs = np.zeros(max_total_goals + 1)
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            total_goals = i + j
            total_goals_probs[total_goals] += m[i, j]

    # Calculate over/under probabilities for total goals
    def over_under_total_prob(threshold):
        under_prob = np.sum(total_goals_probs[:int(np.floor(threshold)) + 1])
        over_prob = 1 - under_prob
        return under_prob, over_prob

    total_thresholds = [1.5, 2.5, 3.5]
    over_under_total_results = {}
    for threshold in total_thresholds:
        under_prob, over_prob = over_under_total_prob(threshold)
        over_under_total_results[threshold] = {
            'Under': under_prob,
            'Over': over_prob
        }
        # print(f"Under {threshold} total goals: {under_prob:.4f}, Over {threshold} total goals: {over_prob:.4f}")

    # Calculate over/under probabilities for individual team goals
    def over_under_team_prob(probs, threshold):
        # Under probability: P(goals <= floor(threshold))
        under_prob = np.sum(probs[:int(np.floor(threshold)) + 1])
        over_prob = 1 - under_prob
        return under_prob, over_prob

    team_thresholds = [0.5, 1.5, 2.5, 3.5]
    over_under_team_results = {home_team: {}, away_team: {}}

    for threshold in team_thresholds:
        # Home team over/under
        under_prob_home, over_prob_home = over_under_team_prob(home_probs, threshold)
        over_under_team_results[home_team][threshold] = {
            'Under': under_prob_home,
            'Over': over_prob_home
        }

        # Away team over/under
        under_prob_away, over_prob_away = over_under_team_prob(away_probs, threshold)
        over_under_team_results[away_team][threshold] = {
            'Under': under_prob_away,
            'Over': over_prob_away
        }

    # Calculate probabilities for goal ranges
    goal_ranges = {
        '0-2 Goals': (0, 2),
        '1-3 Goals': (1, 3),
        '1-4 Goals': (1, 4),
        '2-3 Goals': (2, 3),
        '2-4 Goals': (2, 4),
        '4+ Goals': (4, max_total_goals)
    }

    goal_range_probs = {}
    for range_name, (start, end) in goal_ranges.items():
        # For '4+ Goals', we need to include all goals from 4 to max_total_goals
        if end == max_total_goals:
            prob = np.sum(total_goals_probs[start:])
        else:
            prob = np.sum(total_goals_probs[start:end + 1])
        goal_range_probs[range_name] = prob

    # Display goal range probabilities and odds
    #print("\nGoal Range Probabilities and Odds:")
    for range_name, prob in goal_range_probs.items():
        odds = 1 / (prob + 1e-8)  # To prevent division by zero
        #print(f"{range_name}: Probability = {prob:.4f}, Odds = {odds:.2f}")

    # Calculate Odds
    def calculate_odds(prob):
        epsilon = 1e-8  # To prevent division by zero
        return 1 / (prob + epsilon)

    # Match outcome odds
    home_win_odds = calculate_odds(home_win_prob)
    draw_odds = calculate_odds(draw_prob)
    away_win_odds = calculate_odds(away_win_prob)

    # print("\nMatch Outcome Odds (Decimal):")
    # print(f"Home Win: {home_win_odds:.2f}")
    # print(f"Draw: {draw_odds:.2f}")
    # print(f"Away Win: {away_win_odds:.2f}")

    # Over/Under odds for total goals
    #print("\nOver/Under Total Goals Odds (Decimal):")
    for threshold in total_thresholds:
        under_odds = calculate_odds(over_under_total_results[threshold]['Under'])
        over_odds = calculate_odds(over_under_total_results[threshold]['Over'])
        #print(f"Under {threshold} total goals: {under_odds:.2f}, Over {threshold} total goals: {over_odds:.2f}")

    # Over/Under odds for individual team goals
    #print("\nOver/Under Goals Odds per Team (Decimal):")
    for team in [home_team, away_team]:
        #print(f"\n{team}:")
        for threshold in team_thresholds:
            under_prob = over_under_team_results[team][threshold]['Under']
            over_prob = over_under_team_results[team][threshold]['Over']
            under_odds = calculate_odds(under_prob)
            over_odds = calculate_odds(over_prob)
            #print(f"  Under {threshold} goals: {under_odds:.2f}, Over {threshold} goals: {over_odds:.2f}")

    # Add p_both_score to the return statement
    
    
    
    return home_win_prob, draw_prob, away_win_prob, over_under_total_results, over_under_team_results, goal_range_probs, p_both_score


In [None]:

def evaluate_btts(matches_df, params, thresholds):
    # Initialize a dictionary to store results for each threshold
    results = {threshold: {'correct_predictions': 0, 'total_predictions': 0} for threshold in thresholds}
    
    for index, row in matches_df.iterrows():
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        actual_home_goals = row['homeScore']
        actual_away_goals = row['awayScore']
        
        # Get predicted probabilities
        # Use the existing predict function and extract p_both_score
        try:
            _, _, _, _, _, _, p_both_score = predict(params, home_team, away_team)
        except KeyError as e:
            print(f"Team {e} not found in parameters. Skipping this match.")
            continue

        # Iterate over each confidence threshold
        for threshold in thresholds:
            if p_both_score <= threshold:
                results[threshold]['total_predictions'] += 1
                predicted_outcome = 'Yes'  # Predicted BTTS to happen

                # Determine actual outcome
                actual_outcome = 'Yes' if (actual_home_goals > 0 and actual_away_goals > 0) else 'No'

                # Check if prediction is correct
                if predicted_outcome == actual_outcome:
                    results[threshold]['correct_predictions'] += 1

    # Calculate and print results for each threshold
    for threshold in thresholds:
        total = results[threshold]['total_predictions']
        correct = results[threshold]['correct_predictions']
        if total > 0:
            accuracy = correct / total
            print(f"\nConfidence Threshold >= {threshold}:")
            print(f"Total Predictions Made: {total}")
            print(f"Correct Predictions: {correct}")
            print(f"Accuracy: {accuracy:.2%}")
        else:
            print(f"\nConfidence Threshold >= {threshold}: No predictions made.")

    return results

In [None]:
confidence_thresholds = [0.9]
results = evaluate_btts(test, params, confidence_thresholds)

In [None]:
import numpy as np
import pandas as pd

def evaluate_model_with_combined_draw(matches_df, params, confidence_threshold=0.5):
    correct_predictions = 0
    total_matches = 0  # Count of matches considered with confidence above the threshold
    threshold_hits = 0  # Count of times the confidence threshold was met
    
    # For Log Loss
    log_loss = 0
    epsilon = 1e-15  # To prevent log(0)
    
    home_hits = 0
    away_hits = 0
    draw_hits = 0
    
    for index, row in matches_df.iterrows():
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        actual_home_shots = row['homeScore']
        actual_away_shots = row['awayScore']
        
        # Get predicted probabilities
        home_prob, draw_prob, away_prob, _, _, _,_ = predict(params, home_team, away_team)
        
        # Determine max probability and combine it with draw if it's not already draw
        if home_prob > away_prob and home_prob > draw_prob:
            combined_prob = home_prob + draw_prob
            max_prob = combined_prob
            predicted_outcome = 'Home'
        elif away_prob > home_prob and away_prob > draw_prob:
            combined_prob = away_prob + draw_prob
            max_prob = combined_prob
            predicted_outcome = 'Away'
        else:
            max_prob = draw_prob
            predicted_outcome = 'Draw'
        
        # Only consider predictions with confidence above the threshold
        if max_prob >= confidence_threshold:
            threshold_hits += 1  # Increment the threshold hit count
            total_matches += 1
            
            # Determine actual outcome
            if actual_home_shots > actual_away_shots:
                actual_outcome = 'Home'
            elif actual_away_shots > actual_home_shots:
                actual_outcome = 'Away'
            else:
                actual_outcome = 'Draw'
            
            # Check if prediction is correct
            if predicted_outcome == actual_outcome:
                correct_predictions += 1
            
            # Update Log Loss based on the actual outcome
            if actual_outcome == 'Home':
                log_loss += -np.log(home_prob + epsilon)
                home_hits += 1
            elif actual_outcome == 'Draw':
                log_loss += -np.log(draw_prob + epsilon)
                draw_hits += 1
            else:
                log_loss += -np.log(away_prob + epsilon)
                away_hits += 1
    
    # Calculate Metrics
    if total_matches > 0:
        accuracy = correct_predictions / total_matches
        average_log_loss = log_loss / total_matches
        print(f"Accuracy (for confidence >= {confidence_threshold}): {accuracy:.2%}")
        print(f"Average Log Loss (for confidence >= {confidence_threshold}): {average_log_loss:.4f}")
    else:
        print(f"No predictions met the confidence threshold of {confidence_threshold}")
        accuracy = None
        average_log_loss = None
    
    print(f"Threshold hits (confidence >= {confidence_threshold}): {threshold_hits}")
    print(f"Home hits: {home_hits}, Away hits: {away_hits}, Draw hits: {draw_hits}")
    
    return accuracy, average_log_loss, threshold_hits, home_hits, away_hits, draw_hits

# Example usage:
# matches_df = pd.read_csv('historical_matches.csv')  # Ensure your CSV has the required columns
# evaluate_model_with_combined_draw(matches_df, params, confidence_threshold=0.7)


In [None]:
evaluate_model_with_combined_draw(test, params, confidence_threshold=0.92)

In [None]:

# Assuming you have a DataFrame 'matches_df' with historical match data
# Columns: 'HomeTeam', 'AwayTeam', 'HomeShots', 'AwayShots'

def evaluate_model(matches_df, params):
    correct_predictions = 0
    total_matches = len(matches_df)
    
    # For Brier Score
    brier_home = []
    brier_draw = []
    brier_away = []
    
    # For Log Loss
    log_loss = 0
    epsilon = 1e-15  # To prevent log(0)
    
    for index, row in matches_df.iterrows():
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        actual_home_shots = row['homeScore']
        actual_away_shots = row['awayScore']
        
        # Get predicted probabilities
        home_prob, draw_prob, away_prob ,_,_,_,_= predict(params, home_team, away_team)
        
        # Determine predicted outcome
        if home_prob > away_prob and home_prob > draw_prob:
            predicted_outcome = 'Home'
            print(home_prob)
        elif away_prob > home_prob and away_prob > draw_prob:
            predicted_outcome = 'Away'
            print(away_prob)
        else:
            
            predicted_outcome = 'Draw'
        
        # Determine actual outcome
        if actual_home_shots > actual_away_shots:
            actual_outcome = 'Home'
        elif actual_away_shots > actual_home_shots:
            actual_outcome = 'Away'
        else:
            actual_outcome = 'Draw'
        
        # Check if prediction is correct
        if predicted_outcome == actual_outcome:
            correct_predictions += 1
        
        
        # Update Log Loss
        if actual_outcome == 'Home':
            log_loss += -np.log(home_prob + epsilon)
        elif actual_outcome == 'Draw':
            log_loss += -np.log(draw_prob + epsilon)
        else:
            log_loss += -np.log(away_prob + epsilon)
    
    # Calculate Metrics
    accuracy = correct_predictions / total_matches
    average_log_loss = log_loss / total_matches
    
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Average Log Loss: {average_log_loss:.4f}")
    
    return accuracy, average_log_loss

# Example usage:
# matches_df = pd.read_csv('historical_matches.csv')  # Ensure your CSV has the required columns
# evaluate_model(matches_df, params)


In [None]:
evaluate_model(test,params)

In [None]:
import json
from botasaurus.request import request, Request
from botasaurus_requests import response
DATE = '2024-11-10'  # You can change this as needed


# Initialize the Botasaurus session
@request(output=None, create_error_logs=False)
def _botasaurus_get(request: Request, url: str) -> response.Response:
    """SofaScore introduced some anti-scraping measures. Using Botasaurus gets around them."""
    if not isinstance(url, str):
        raise TypeError('`url` must be a string.')
    response = request.get(url)
    return response



In [None]:
def fetch_scheduled_events(date):
    url = f'https://www.sofascore.com/api/v1/sport/football/scheduled-events/{date}'
    response = _botasaurus_get(url)
    if response.status_code == 200:
        print(response.json())
        return response.json()
    else:
        print(f"Failed to fetch events for {date}")
        return None

scheduled_events = fetch_scheduled_events(DATE)


In [None]:
from datetime import datetime  # For working with dates and times
# Prepare a list to store event details
teams_list = []

if scheduled_events:
    events = scheduled_events.get('events', [])
    for event in events:
        unique_tournament = event.get('tournament', {}).get('uniqueTournament', {})
        unique_tournament_id = unique_tournament.get('id')
        unique_tournament_name = unique_tournament.get('name')
        if unique_tournament_id in LEAGUES.values():
            event_id = event.get('id')
            tournament_name = unique_tournament.get('name')
            home_team_name = event.get('homeTeam', {}).get('name')
            away_team_name = event.get('awayTeam', {}).get('name')
            start_timestamp = event.get("startTimestamp")
            home_score = event.get('homeScore', {}).get('display', 0)
            away_score = event.get('awayScore', {}).get('display', 0)
            
            if start_timestamp:
                match_datetime = datetime.fromtimestamp(start_timestamp)
                match_date = match_datetime.strftime("%Y-%m-%d")
                match_time = match_datetime.strftime("%H:%M:%S")
            else:
                match_date = None
                match_time = None

            if match_date == DATE:

                teams_list.append({
                    'eventId':event_id,
                    'home_team_name': home_team_name,
                    'away_team_name': away_team_name,
                    
                })
else:
    print("No events found.")

In [None]:
prediction_df = pd.DataFrame(teams_list)

In [None]:
predict(params, 'Real Sociedad','Barcelona')

In [None]:
prediction_df

In [None]:
import pandas as pd

# Assuming 'prediction_df' is your DataFrame with 'home_team_name' and 'away_team_name' columns
# Also, 'params' is your parameters dictionary required by the 'predict' function

# Create an empty list to store the results
results = []

# Iterate over each game in the prediction DataFrame
for idx, row in prediction_df.iterrows():
    event_id = row['eventId']
    home_team = row['home_team_name']
    away_team = row['away_team_name']
    
    # Call your predict function
    (home_win_prob, draw_prob, away_win_prob,
     over_under_total_results, over_under_team_results,
     goal_range_probs, p_both_score) = predict(params, home_team, away_team)
    
    # Prepare a dictionary to store the results
    result = {
        'eventId': event_id,
        'home_team_name': home_team,
        'away_team_name': away_team,
        'home_win_prob': home_win_prob,
        'draw_prob': draw_prob,
        'away_win_prob': away_win_prob,
        'p_both_score': p_both_score
    }
    
    # Add Over/Under probabilities for the thresholds
    thresholds = [1.5, 2.5, 3.5]
    for threshold in thresholds:
        under_prob = over_under_total_results[threshold]['Under']
        over_prob = over_under_total_results[threshold]['Over']
        result[f'U{threshold}'] = under_prob
        result[f'O{threshold}'] = over_prob
    
    # Append the result to the list
    results.append(result)

# Create a new DataFrame from the results
new_prediction_df = pd.DataFrame(results)

# Display the new DataFrame
print(new_prediction_df)


In [None]:
new_prediction_df

In [None]:
# Calculate the maximum win probability and corresponding outcome for each match
def get_winner(row):
    probs = {
        'Home Win': row['home_win_prob'],
        'Draw': row['draw_prob'],
        'Away Win': row['away_win_prob']
    }
    # Find the outcome with the highest probability
    winner_outcome = max(probs, key=probs.get)
    winner_prob = probs[winner_outcome]
    return pd.Series([winner_outcome, winner_prob], index=['winner_outcome', 'winner_prob'])

# Apply the function to each row
new_prediction_df[['winner_outcome', 'winner_prob']] = new_prediction_df.apply(get_winner, axis=1)

# Create DataFrame sorted by winner probability
df_sorted_by_winner = new_prediction_df.sort_values(by='winner_prob', ascending=False)

# Create DataFrame sorted by Over 2.5 probability
df_sorted_by_over25 = new_prediction_df.sort_values(by='O2.5', ascending=False)

# Create DataFrame sorted by Under 2.5 probability
df_sorted_by_under25 = new_prediction_df.sort_values(by='U2.5', ascending=False)

df_sorted_btts = new_prediction_df.sort_values(by='p_both_score', ascending=False)



In [None]:
predict(params,"Liverpool","Aston Villa")

In [None]:
df_sorted_by_winner

In [None]:
df_sorted_by_over25

In [None]:
df_sorted_by_under25

In [None]:
df_sorted_btts

In [None]:
import pandas as pd
import time
import logging
from datetime import datetime
from tqdm import tqdm
import os
import json
from botasaurus.request import request, Request
from botasaurus_requests import response
import ScraperFC as sfc


ss = sfc.Sofascore()

DATE = '2024-11-09'  # You can change this as needed


# Initialize the Botasaurus session
@request(output=None, create_error_logs=False)
def _botasaurus_get(request: Request, url: str) -> response.Response:
    """SofaScore introduced some anti-scraping measures. Using Botasaurus gets around them."""
    if not isinstance(url, str):
        raise TypeError('`url` must be a string.')
    response = request.get(url)
    return response


def fetch_scheduled_events(date):
    url = f'https://www.sofascore.com/api/v1/sport/football/scheduled-events/{date}'
    response = _botasaurus_get(url)
    if response.status_code == 200:
        print(response.json())
        return response.json()
    else:
        print(f"Failed to fetch events for {date}")
        return None

scheduled_events = fetch_scheduled_events(DATE)


def filter_events_by_leagues(scheduled_events, leagues):
    """
    Filters events to only include those whose uniqueTournament ID is in the LEAGUES dictionary.

    Parameters:
    ----------
    scheduled_events : dict
        The dictionary containing all scheduled events.
    
    leagues : dict
        A dictionary of league names and their corresponding uniqueTournament IDs.
    
    Returns:
    -------
    list
        A filtered list of events.
    """
    all_events = scheduled_events.get('events', [])
    filtered_events = []

    for event in all_events:
        unique_tournament = event.get('tournament', {}).get('uniqueTournament', {})
        unique_tournament_id = unique_tournament.get('id')
        
        if unique_tournament_id in leagues.values():
            filtered_events.append(event)

    return filtered_events



def process_and_append_events(scheduled_events, csv_filename='europa_data_final.csv'):
    matches_data = []
    todays_games = []  # List to store today's games for comparison
    
    for event in tqdm(scheduled_events, desc="Processing events", unit="event"):
        try:
            event_id = event.get("id", None)
            home_team = event.get('homeTeam', {}).get('name', 'Unknown')
            home_team_id = event.get('homeTeam', {}).get('id', 'Unknown')
            away_team = event.get('awayTeam', {}).get('name', 'Unknown')
            away_team_id = event.get('awayTeam', {}).get('id', 'Unknown')
            home_score = event.get('homeScore', {}).get('display', 0)
            away_score = event.get('awayScore', {}).get('display', 0)

            start_timestamp = event.get("startTimestamp")
            if start_timestamp:
                match_datetime = datetime.fromtimestamp(start_timestamp)
                match_date = match_datetime.strftime("%Y-%m-%d")
                match_time = match_datetime.strftime("%H:%M:%S")
            else:
                match_date = None
                match_time = None

            gd = home_score - away_score
            if gd == 0:
                FTR = 1
            elif gd > 0:
                FTR = 0
            else:
                FTR = 2

            # Simulating match data scraping (replace this with the actual function call)
            match_data = ss.scrape_team_match_stats(event_id)
            match_data_filtered = match_data[['key', 'home', 'away']]

            new_row = {
                "eventId": event_id,
                "homeId": home_team_id,
                "homeTeam": home_team,
                "awayId": away_team_id,
                "awayTeam": away_team,
                "homeScore": home_score,
                "awayScore": away_score,
                "FTR": FTR,
                "date": match_date,
                "time": match_time
            }

            for index, row in match_data_filtered.iterrows():
                key = row['key']
                new_row[f'home{key}'] = row['home']
                new_row[f'away{key}'] = row['away']

            # Store today's games for comparison
            if match_date == DATE:
                todays_games.append(new_row)

            # Append data to CSV
            df = pd.DataFrame([new_row])
            if os.path.exists(csv_filename):
                df.to_csv(csv_filename, mode='a', index=False, header=False)
            else:
                df.to_csv(csv_filename, mode='w', index=False, header=True)

        except Exception as e:
            logging.error(f"Error processing event {event.get('id', 'Unknown')}: {e}", exc_info=True)
            continue  # Skip to the next event in case of an error

        time.sleep(1)

    # Create DataFrame of today's games
    todays_df = pd.DataFrame(todays_games)
    return todays_df


# Usage example
if scheduled_events:
    filtered_events = filter_events_by_leagues(scheduled_events, LEAGUES)
    todays_games_df = process_and_append_events(filtered_events)  # Collect today's games
    print("Today's games for comparison:")
else:
    print("No events available to process.")

    
# Fetch scheduled events
scheduled_events = fetch_scheduled_events(DATE)

if scheduled_events:
    filtered_events = filter_events_by_leagues(scheduled_events, LEAGUES)
    process_and_append_events(filtered_events)  # Pass the filtered events to your function
else:
    print("No events available to process.")

In [None]:
todays_games_df

In [None]:
def evaluate_predictions(predictions_df, actuals_df):
    """
    Evaluates the prediction accuracy for full-time results, thresholds (O/U), and BTTS.
    
    Parameters:
    ----------
    predictions_df : pd.DataFrame
        DataFrame containing prediction probabilities for each match, including `eventId`.
    
    actuals_df : pd.DataFrame
        DataFrame containing actual match outcomes, including `eventId`.
    
    Returns:
    -------
    dict
        Evaluation results for full-time result accuracy, thresholds (O/U), and BTTS.
    """
    correct_results = 0
    total_predictions = len(predictions_df)
    
    over_hits = {1.5: 0, 2.5: 0, 3.5: 0}
    over_count = {1.5: 0, 2.5: 0, 3.5: 0}
    
    under_hits = {1.5: 0, 2.5: 0, 3.5: 0}
    under_count = {1.5: 0, 2.5: 0, 3.5: 0}
    
    btts_hits = 0
    btts_count = 0
    unmatched_predictions = 0

    for idx, row in predictions_df.iterrows():
        event_id = row['eventId']
        
        # Find corresponding actual match
        actual_match = actuals_df[actuals_df['eventId'] == event_id]

        if actual_match.empty:
            unmatched_predictions += 1
            logging.warning(f"No actual data found for eventId {event_id}. Skipping.")
            continue
        
        actual_match = actual_match.iloc[0]
        
        actual_home_score = actual_match['homeScore']
        actual_away_score = actual_match['awayScore']
        
        # Determine actual full-time result (0: Home Win, 1: Draw, 2: Away Win)
        actual_result = 1  # Default to Draw
        if actual_home_score > actual_away_score:
            actual_result = 0  # Home Win
        elif actual_home_score < actual_away_score:
            actual_result = 2  # Away Win
        
        # Determine predicted full-time result
        predicted_result = max(
            ['home_win_prob', 'draw_prob', 'away_win_prob'],
            key=lambda x: row[x]
        )
        result_mapping = {'home_win_prob': 0, 'draw_prob': 1, 'away_win_prob': 2}
        predicted_result = result_mapping[predicted_result]
        
        # Correct if predicted result matches actual result or it's a draw
        if predicted_result == actual_result or actual_result == 1:
            correct_results += 1
        
        # Check BTTS
        actual_btts = (actual_home_score > 0) and (actual_away_score > 0)
        if row['p_both_score'] > 0.5:
            btts_count += 1
            if actual_btts:
                btts_hits += 1
        
        # Check thresholds (O/U)
        total_goals = actual_home_score + actual_away_score
        thresholds = [1.5, 2.5, 3.5]
        for threshold in thresholds:
            if row[f'O{threshold}'] > 0.5:
                over_count[threshold] += 1
                if total_goals > threshold:
                    over_hits[threshold] += 1
            if row[f'U{threshold}'] > 0.5:
                under_count[threshold] += 1
                if total_goals <= threshold:
                    under_hits[threshold] += 1

    # Calculate percentages and counts
    result_accuracy = {
        'correct': correct_results,
        'total': total_predictions - unmatched_predictions,
        'accuracy': correct_results / (total_predictions - unmatched_predictions) * 100
    }
    
    btts_accuracy = {
        'correct': btts_hits,
        'total': btts_count,
        'accuracy': btts_hits / btts_count * 100 if btts_count > 0 else 0
    }
    
    over_accuracy = {
        threshold: {
            'correct': over_hits[threshold],
            'total': over_count[threshold],
            'accuracy': (over_hits[threshold] / over_count[threshold] * 100 
                         if over_count[threshold] > 0 else 0)
        }
        for threshold in thresholds
    }
    
    under_accuracy = {
        threshold: {
            'correct': under_hits[threshold],
            'total': under_count[threshold],
            'accuracy': (under_hits[threshold] / under_count[threshold] * 100 
                         if under_count[threshold] > 0 else 0)
        }
        for threshold in thresholds
    }

    return {
        'result_accuracy': result_accuracy,
        'btts_accuracy': btts_accuracy,
        'over_accuracy': over_accuracy,
        'under_accuracy': under_accuracy,
        'unmatched_predictions': unmatched_predictions
    }


In [None]:
evaluation_results = evaluate_predictions(new_prediction_df, todays_games_df)

print("Full-Time Result Accuracy:", evaluation_results['result_accuracy'], "%")
print("BTTS Accuracy:", evaluation_results['btts_accuracy'], "%")
print("Over Threshold Accuracy:", evaluation_results['over_accuracy'])
print("Under Threshold Accuracy:", evaluation_results['under_accuracy'])
print("Unmatched Predictions:", evaluation_results['unmatched_predictions'])


In [None]:
def evaluate_predictions(predictions_df, actuals_df, confidence_range=(0.6, 0.7)):
    """
    Evaluates the prediction accuracy for full-time results, thresholds (O/U), and BTTS
    within a specified confidence range.
    
    Parameters:
    ----------
    predictions_df : pd.DataFrame
        DataFrame containing prediction probabilities for each match, including `eventId`.
    
    actuals_df : pd.DataFrame
        DataFrame containing actual match outcomes, including `eventId`.
    
    confidence_range : tuple, optional
        Range of confidence levels to filter predictions for accuracy evaluation.
    
    Returns:
    -------
    dict
        Evaluation results for full-time result accuracy, thresholds (O/U), and BTTS within confidence interval.
    """
    lower_bound, upper_bound = confidence_range
    
    correct_results = 0
    total_predictions = len(predictions_df)
    
    over_hits = {1.5: 0, 2.5: 0, 3.5: 0}
    over_count = {1.5: 0, 2.5: 0, 3.5: 0}
    
    under_hits = {1.5: 0, 2.5: 0, 3.5: 0}
    under_count = {1.5: 0, 2.5: 0, 3.5: 0}
    
    btts_hits = 0
    btts_count = 0
    unmatched_predictions = 0

    # Confidence-range specific counters for all metrics
    confidence_result_hits = 0
    confidence_result_total = 0

    confidence_over_hits = {1.5: 0, 2.5: 0, 3.5: 0}
    confidence_over_count = {1.5: 0, 2.5: 0, 3.5: 0}
    
    confidence_under_hits = {1.5: 0, 2.5: 0, 3.5: 0}
    confidence_under_count = {1.5: 0, 2.5: 0, 3.5: 0}

    confidence_btts_hits = 0
    confidence_btts_total = 0

    for idx, row in predictions_df.iterrows():
        event_id = row['eventId']
        
        # Find corresponding actual match
        actual_match = actuals_df[actuals_df['eventId'] == event_id]

        if actual_match.empty:
            unmatched_predictions += 1
            logging.warning(f"No actual data found for eventId {event_id}. Skipping.")
            continue
        
        actual_match = actual_match.iloc[0]
        
        actual_home_score = actual_match['homeScore']
        actual_away_score = actual_match['awayScore']
        
        # Determine actual full-time result (0: Home Win, 1: Draw, 2: Away Win)
        actual_result = 1  # Default to Draw
        if actual_home_score > actual_away_score:
            actual_result = 0  # Home Win
        elif actual_home_score < actual_away_score:
            actual_result = 2  # Away Win
        
        result_mapping = {
            'home_win_prob': 0,
            'draw_prob': 1,
            'away_win_prob': 2
        }
        predicted_result = max(
            ['home_win_prob', 'draw_prob', 'away_win_prob'],
            key=lambda x: row[x]
        )

        confidence = row[predicted_result]  # Use the correct string key to get the confidence


        # Evaluate full-time result within confidence range
        if lower_bound <= confidence <= upper_bound:
            confidence_result_total += 1
            if result_mapping[predicted_result] == actual_result or actual_result == 1:
                confidence_result_hits += 1
        
        if predicted_result == actual_result or actual_result == 1:
            correct_results += 1
        
        # Check BTTS
        actual_btts = (actual_home_score > 0) and (actual_away_score > 0)
        if row['p_both_score'] > 0.5:
            btts_count += 1
            if actual_btts:
                btts_hits += 1
            # Evaluate BTTS within confidence range
            if lower_bound <= row['p_both_score'] <= upper_bound:
                confidence_btts_total += 1
                if actual_btts:
                    confidence_btts_hits += 1
        
        # Check thresholds (O/U)
        total_goals = actual_home_score + actual_away_score
        thresholds = [1.5, 2.5, 3.5]
        for threshold in thresholds:
            if row[f'O{threshold}'] > 0.5:
                over_count[threshold] += 1
                if total_goals > threshold:
                    over_hits[threshold] += 1
                # Confidence check for Over
                if lower_bound <= row[f'O{threshold}'] <= upper_bound:
                    confidence_over_count[threshold] += 1
                    if total_goals > threshold:
                        confidence_over_hits[threshold] += 1
            
            if row[f'U{threshold}'] > 0.5:
                under_count[threshold] += 1
                if total_goals <= threshold:
                    under_hits[threshold] += 1
                # Confidence check for Under
                if lower_bound <= row[f'U{threshold}'] <= upper_bound:
                    confidence_under_count[threshold] += 1
                    if total_goals <= threshold:
                        confidence_under_hits[threshold] += 1

    result_accuracy = {
        'correct': correct_results,
        'total': total_predictions - unmatched_predictions,
        'accuracy': correct_results / (total_predictions - unmatched_predictions) * 100
    }
    
    btts_accuracy = {
        'correct': btts_hits,
        'total': btts_count,
        'accuracy': btts_hits / btts_count * 100 if btts_count > 0 else 0
    }
    
    over_accuracy = {
        threshold: {
            'correct': over_hits[threshold],
            'total': over_count[threshold],
            'accuracy': (over_hits[threshold] / over_count[threshold] * 100 
                         if over_count[threshold] > 0 else 0)
        }
        for threshold in thresholds
    }
    
    under_accuracy = {
        threshold: {
            'correct': under_hits[threshold],
            'total': under_count[threshold],
            'accuracy': (under_hits[threshold] / under_count[threshold] * 100 
                         if under_count[threshold] > 0 else 0)
        }
        for threshold in thresholds
    }

    confidence_accuracy = {
        'result_accuracy': {
            'correct': confidence_result_hits,
            'total': confidence_result_total,
            'accuracy': (confidence_result_hits / confidence_result_total * 100 if confidence_result_total > 0 else 0)
        },
        'btts_accuracy': {
            'correct': confidence_btts_hits,
            'total': confidence_btts_total,
            'accuracy': (confidence_btts_hits / confidence_btts_total * 100 if confidence_btts_total > 0 else 0)
        },
        'over_accuracy': {
            threshold: {
                'correct': confidence_over_hits[threshold],
                'total': confidence_over_count[threshold],
                'accuracy': (confidence_over_hits[threshold] / confidence_over_count[threshold] * 100 
                             if confidence_over_count[threshold] > 0 else 0)
            }
            for threshold in thresholds
        },
        'under_accuracy': {
            threshold: {
                'correct': confidence_under_hits[threshold],
                'total': confidence_under_count[threshold],
                'accuracy': (confidence_under_hits[threshold] / confidence_under_count[threshold] * 100 
                             if confidence_under_count[threshold] > 0 else 0)
            }
            for threshold in thresholds
        }
    }

    return {
        'result_accuracy': result_accuracy,
        'btts_accuracy': btts_accuracy,
        'over_accuracy': over_accuracy,
        'under_accuracy': under_accuracy,
        'confidence_accuracy': confidence_accuracy,
        'unmatched_predictions': unmatched_predictions
    }


In [None]:
evaluation_results = evaluate_predictions(new_prediction_df, todays_games_df, confidence_range=(0.6, 0.7))

print("Full-Time Result Accuracy:", evaluation_results['result_accuracy'])
print("BTTS Accuracy:", evaluation_results['btts_accuracy'])
print("Over Threshold Accuracy:", evaluation_results['over_accuracy'])
print("Under Threshold Accuracy:", evaluation_results['under_accuracy'])
print("Confidence Range Accuracy (60-70%):", evaluation_results['confidence_accuracy'])
print("Unmatched Predictions:", evaluation_results['unmatched_predictions'])


In [None]:
predict(params,"Chelsea","Arsenal")