# Modelo para apuestas Premier League

- El predictor está basado en el Pi rating.
- Primero cargamos en un único dataframe los datos de todas las temporadas entre `first_season` y `last_season`;
  los ratings se acumulan partido a partido, de modo que las temporadas anteriores determinan los ratings iniciales de las siguientes.

In [None]:
from typing import Tuple, List
import numpy as np
import pandas as pd
import pandasql as pdsql
import matplotlib.pyplot as plt
import sys
import math
import seaborn as sns


# Inclusive range of seasons to load, written as "YYYY-YY".
first_season: str = "2018-19"
last_season: str = "2023-24"

# NOTE(review): the CSVs are read from Datasets/bundesliga/ while the notebook
# title and the export paths further down refer to the Premier League
# (Datasets/premier/) — confirm which league is intended.

# Read one CSV per season and tag every row with its season label. The frames
# are collected in a list and concatenated once: concatenating inside the loop
# re-copies all previously loaded rows on every iteration and starts from an
# empty DataFrame, which pandas has deprecated for dtype inference.
season_frames: List[pd.DataFrame] = []
for season in range(int(first_season[:4]), int(last_season[:4]) + 1):
    season_str: str = f"{season:02d}-{season + 1 - 2000:02d}"
    season_df: pd.DataFrame = pd.read_csv(f"Datasets/bundesliga/{season_str}.csv")
    season_df["Season"] = season_str
    season_frames.append(season_df)

# ignore_index=True yields a fresh 0..n-1 index across all seasons.
full_df: pd.DataFrame = pd.concat(season_frames, ignore_index=True)
full_df.head(20)


- Definimos las funciones para calcular y actualizar el Pi rating.

In [None]:
# Tunable constants of the rating update.
# Base of the logarithm used to dampen errors and of the exponential used to
# turn a rating into expected goals.
LOG_BASE: int = 10
# Scale factor applied to the dampened error and divisor applied to ratings.
ERROR_CONSTANT: int = 3
# Fraction of a venue-specific rating change that is carried over to the same
# team's rating at the other venue.
HOME_PERFORMANCE_LEARNING_RATE: float = 0.7 # \gamma
# Step size applied to the weighted prediction error.
GOAL_PERFORMANCE_LEARNING_RATE: float = 0.035 # \lambda

def get_new_pi_rating(rating_H_H: float, rating_H_A: float, rating_A_H: float, rating_A_A: float, score_H: int, score_A: int) -> Tuple[float, float, float, float]:
    """Update the four ratings involved in one match.

    Args:
        rating_H_H: Home team's rating when playing at home.
        rating_H_A: Home team's rating when playing away.
        rating_A_H: Away team's rating when playing at home.
        rating_A_A: Away team's rating when playing away.
        score_H: Goals scored by the home team.
        score_A: Goals scored by the away team.

    Returns:
        The updated ratings in the same order as the rating arguments:
        ``(home@home, home@away, away@home, away@away)``.
    """
    # The prediction only uses the ratings for the venue actually played at.
    expected_goal_diff = get_expected_goal_diff(rating_H_H, rating_A_A)
    observed_goal_diff = score_H - score_A

    # Dampened size of the prediction error; its sign depends on which side
    # did better than expected.
    weighted_error = weighting_error(abs(expected_goal_diff - observed_goal_diff))
    weighted_error_H = weighted_error if expected_goal_diff < observed_goal_diff else -weighted_error
    weighted_error_A = weighted_error if expected_goal_diff > observed_goal_diff else -weighted_error

    # Venue-specific rating moves by lambda * weighted error ...
    new_rating_H_H = rating_H_H + weighted_error_H * GOAL_PERFORMANCE_LEARNING_RATE
    # ... and a gamma fraction of that move is propagated to the other venue.
    new_rating_H_A = rating_H_A + (new_rating_H_H - rating_H_H) * HOME_PERFORMANCE_LEARNING_RATE
    new_rating_A_A = rating_A_A + weighted_error_A * GOAL_PERFORMANCE_LEARNING_RATE
    new_rating_A_H = rating_A_H + (new_rating_A_A - rating_A_A) * HOME_PERFORMANCE_LEARNING_RATE

    return (new_rating_H_H, new_rating_H_A, new_rating_A_H, new_rating_A_A)

def get_expected_goal_diff(rating_h_h: float, rating_a_a: float) -> float:
    """Return the expected goal difference (home minus away) for a match.

    Each rating is converted to an expected goal figure against a
    zero-rated opponent as ``LOG_BASE ** (|rating| / ERROR_CONSTANT) - 1``,
    carrying the sign of the rating. The result is the home figure minus the
    away figure.

    Args:
        rating_h_h: Home team's rating when playing at home.
        rating_a_a: Away team's rating when playing away.
    """

    def _expected_goals(rating: float) -> float:
        # Magnitude grows exponentially with |rating|; negative ratings mirror it.
        magnitude = LOG_BASE ** np.abs(rating / ERROR_CONSTANT) - 1
        return magnitude if rating >= 0 else -magnitude

    return _expected_goals(rating_h_h) - _expected_goals(rating_a_a)



def weighting_error(goal_diff: float) -> float:
    """Dampen a prediction error logarithmically.

    Computes ``ERROR_CONSTANT * log_{LOG_BASE}(goal_diff + 1)``, so an error
    of 0 maps to 0 and large errors grow only logarithmically.

    Args:
        goal_diff: Absolute difference between expected and observed goal
            difference (a float, since the expectation is fractional).

    Raises:
        ValueError: If ``goal_diff <= -1`` (logarithm of a non-positive number).
    """
    return ERROR_CONSTANT * math.log(goal_diff + 1, LOG_BASE)


In [None]:
# Every club starts at rating 0.0 both home and away. Teams are collected from
# both columns (home clubs first, in order of first appearance) so that a club
# which has only played away so far still has a rating entry.
all_teams = pd.concat([full_df["HomeTeam"], full_df["AwayTeam"]]).drop_duplicates()
home_ratings = dict.fromkeys(all_teams, 0.0)
away_ratings = dict.fromkeys(all_teams, 0.0)

# Per-match values, recorded BEFORE the match updates the ratings.
rating_diffs: List[float] = []
results: List[int] = []
pre_match_home_ratings: List[float] = []
pre_match_away_ratings: List[float] = []

# Matches are processed in the order in which they appear in full_df.
for home_team, away_team, goals_home, goals_away in zip(
    full_df["HomeTeam"], full_df["AwayTeam"], full_df["FTHG"], full_df["FTAG"]
):
    rating_home_at_home = home_ratings[home_team]
    rating_away_at_away = away_ratings[away_team]

    rating_diffs.append(rating_home_at_home - rating_away_at_away)
    # 1 = home win, 0 = draw, -1 = away win
    results.append(1 if goals_home > goals_away else 0 if goals_home == goals_away else -1)
    pre_match_home_ratings.append(rating_home_at_home)
    pre_match_away_ratings.append(rating_away_at_away)

    new_elos: Tuple[float, float, float, float] = get_new_pi_rating(
        rating_home_at_home,
        away_ratings[home_team],
        home_ratings[away_team],
        rating_away_at_away,
        goals_home,
        goals_away,
    )

    home_ratings[home_team] = new_elos[0]
    away_ratings[home_team] = new_elos[1]
    home_ratings[away_team] = new_elos[2]
    away_ratings[away_team] = new_elos[3]

# Attach the pre-match ratings to the match table.
full_df["RATING_DIFF"] = rating_diffs
full_df["H_RATING"] = pre_match_home_ratings
full_df["A_RATING"] = pre_match_away_ratings

# Rating difference vs. outcome, one row per match, aligned with full_df.
df_stats = pd.DataFrame({"RATING_DIFF": rating_diffs, "RESULT": results}, index=full_df.index)

# Final ratings after the last processed match.
df_teams = pd.DataFrame(
    {
        "TEAM": list(home_ratings),
        "H_RATING": list(home_ratings.values()),
        "A_RATING": [away_ratings[team] for team in home_ratings],
    }
)

df_teams.head(100)


In [None]:
# Persist the enriched match table and the final team ratings as CSV files
# (without the DataFrame index).
# NOTE(review): the input CSVs are read from Datasets/bundesliga/ but these
# outputs go to Datasets/premier/ — confirm the intended league/folder.
for frame, csv_path in (
    (full_df, "Datasets/premier/full_df.csv"),
    (df_teams, "Datasets/premier/df_teams.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Number of initial bin EDGES (np.linspace returns BINS points, i.e. BINS - 1 bins).
BINS = 28
# Minimum and maximum rating diffs
min_rating_diff = df_stats["RATING_DIFF"].min()
max_rating_diff = df_stats["RATING_DIFF"].max()
print(min_rating_diff)
print(max_rating_diff)

# Evenly spaced bin edges over the observed range
bins = np.linspace(min_rating_diff, max_rating_diff, BINS)
# Widen the outermost edges so the extreme observations fall strictly inside
# the first/last bin (the per-bin filter below uses a half-open interval).
bins[0] = bins[0] - 2
bins[-1] = bins[-1] + 2

# Merge bins until every bin holds more than 20 observations
while True:
    hist, bin_edges = np.histogram(df_stats["RATING_DIFF"], bins=bins)
    # Stop when all bins are populated enough, or when only one bin is left
    # and nothing more can be merged.
    if hist.min() > 20 or len(hist) == 1:
        break
    sparsest_bin = int(np.argmin(hist))
    if sparsest_bin == len(hist) - 1:
        # The last bin has no right neighbour: drop its LEFT edge to merge it
        # into the previous bin. Dropping the outer right edge instead would
        # shrink the range and silently discard the highest observations.
        edge_to_drop = sparsest_bin
    else:
        # Drop the right edge to merge the bin with its right neighbour.
        edge_to_drop = sparsest_bin + 1
    bins = np.delete(bins, edge_to_drop)

# print bins and count ordered by bin left value
print("Bins:")
for bin_left, bin_right, bin_count in zip(bins[:-1], bins[1:], hist):
    print(f"{bin_left} - {bin_right}: {bin_count}")


df_bins = pd.DataFrame()
df_bins["BIN_LEFT"] = bins[:-1]
df_bins["BIN_RIGHT"] = bins[1:]
df_bins["COUNT"] = hist

# Calculate empirical probability of home win, draw and away win for each bin
df_bins["H_WINS"] = df_bins["DRAWS"] = df_bins["A_WINS"] = 0.0

for index, row in df_bins.iterrows():
    in_bin = (df_stats["RATING_DIFF"] >= row["BIN_LEFT"]) & (df_stats["RATING_DIFF"] < row["BIN_RIGHT"])
    results_in_bin = df_stats.loc[in_bin, "RESULT"]
    # Mean of a boolean mask = share of matches with that outcome
    # (NaN instead of a ZeroDivisionError if a bin is empty).
    df_bins.loc[index, "H_WINS"] = (results_in_bin == 1).mean()
    df_bins.loc[index, "DRAWS"] = (results_in_bin == 0).mean()
    df_bins.loc[index, "A_WINS"] = (results_in_bin == -1).mean()

df_bins.head(100)



In [None]:
# Save the per-bin outcome frequencies to disk, leaving out the DataFrame index.
probabilities_csv_path = "Datasets/pi_rating_probabilities.csv"
df_bins.to_csv(probabilities_csv_path, index=False)

In [None]:
# Visualize the relationship between the rating difference and the empirical
# probability of each match outcome.

# Configure seaborn BEFORE creating the figure: style and palette only apply
# to axes and lines created afterwards, so setting them after plotting has no
# effect on this chart.
sns.set_style("darkgrid")
sns.set_context("talk")
sns.set_palette("husl")

plt.figure(figsize=(12, 8))
# Each bin is drawn at its left edge.
plt.plot(df_bins["BIN_LEFT"], df_bins["H_WINS"], label="Home Win")
plt.plot(df_bins["BIN_LEFT"], df_bins["DRAWS"], label="Draw")
plt.plot(df_bins["BIN_LEFT"], df_bins["A_WINS"], label="Away Win")

# center around 0, add a title and axis labels
plt.xlim(-5, 5)
plt.title("Probability of Home Win, Draw, Away Win")
plt.xlabel("Rating Difference")
plt.ylabel("Probability")
plt.legend()
plt.show()