In [None]:
!pip install ydata-profiling
!pip install catboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier, Pool

from queue import Queue

from ydata_profiling import ProfileReport

In [None]:
# Load the raw match table from CSV.
# NOTE(review): hardcoded absolute path — the file must exist at /content/a.csv on a fresh run.
df = pd.read_csv("/content/a.csv")

tgt: 0 - "home win", 1 - "away win", 2 - "draw"


data: [home_stat, away_stat, curr_stat]

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Summary statistics of the raw table's columns.
df.describe()

In [None]:
# (rows, columns) of the raw table.
df.shape

Отсортируем датасет по дате (по возрастанию)

In [None]:
def sort_by_date(df, column=['Season', 'Date'], ascending=True, reset_index=True):
    """Return a copy of `df` with normalized dates, sorted by `column`.

    The "Date" column is expected to hold "D/M/YY" or "D/M/YYYY" strings. Each value
    is rewritten as a zero-padded "YYYY/MM/DD" string so that plain string ordering
    coincides with chronological ordering. Two-digit years are prefixed with "20".

    Args:
        df: input frame; it is not modified.
        column: column name(s) passed to `DataFrame.sort_values(by=...)`.
        ascending: sort direction passed to `sort_values`.
        reset_index: if True, the result gets a fresh 0..n-1 index.

    Returns:
        The sorted copy with the rewritten "Date" column.

    Raises:
        AssertionError: a date does not consist of exactly three "/"-separated parts.
        ValueError: the year part is neither 2 nor 4 characters long.
    """
    df_sorted = df.copy()

    def parse_date_with_formats(date_str):
        parts = str(date_str).strip().split("/")
        assert len(parts) == 3, f"Unexpected date format: {date_str!r}"
        day, month, year = parts

        if len(year) == 2:
            year = "20" + year
        elif len(year) != 4:
            raise ValueError(f"Unexpected year in date: {date_str!r}")

        # Zero-pad month and day: without padding "2000/12/5" would sort before
        # "2000/8/19" because the values are compared as strings.
        return "/".join((year, month.zfill(2), day.zfill(2)))

    df_sorted["Date"] = df_sorted["Date"].apply(parse_date_with_formats)

    df_sorted = df_sorted.sort_values(by=column, ascending=ascending)

    if reset_index:
        df_sorted = df_sorted.reset_index(drop=True)

    return df_sorted

In [None]:
# Chronologically ordered copy of the raw table (sorted by Season, then Date).
df_sort_date = sort_by_date(df)

In [None]:
# Preview the first rows after sorting.
df_sort_date.head()

Будем получать фичи из окон трёх форматов (пока): 1 сезон, 5 матчей, 1 матч

In [None]:
# Distinct values of the full-time result column (later code compares it with "H", "A" and "D").
df_sort_date["FTR"].unique()

In [None]:
from enum import Enum

class Window(Enum):
    """Kind of look-back window used by preapre_data when aggregating team statistics.

    The member value becomes part of the generated feature-column suffix.
    """
    # Window measured in seasons.
    SEASON = "season"
    # Window measured in a team's most recent matches; note the value is "day".
    TOUR = "day"

In [None]:
from copy import copy, deepcopy

# Per-team numeric columns that are summed over the window. "_" is a placeholder
# replaced by "H" or "A" to get the real column name (e.g. "_S" -> "HS" / "AS").
stats_cols = [
    '_S', '_ST', '_F', '_C', '_Y', '_R', 'FT_G', 'HT_G'
  ]
# Columns of the current match that are copied into the features as-is.
current_stats_cols = ['WHH', 'WHD', 'WHA']
# Identifier / categorical columns of the current match.
cat_cols = ['HomeTeam', 'AwayTeam', 'Referee', 'Season', 'Date']
# Result counters kept per team: full-time and half-time (HT*) wins, losses and draws.
new_stats_cols = ['WinCount', 'LoseCount', 'Draw', 'HTWinCount', 'HTLoseCount', 'HTDraw']

def _accumulate_match(curr_data, team_stat, prefix, sign):
  """Add (sign=+1) or subtract (sign=-1) one match from a team's running totals.

  `prefix` is the side the team played on in `curr_data`: "H" (home) or "A" (away).
  Results are counted from that team's point of view, for both the full-time
  ("FTR") and the half-time ("HTR") result.
  """
  team_stat["games_count"] += sign

  opponent = {"H": "A", "A": "H"}.get(prefix)
  if opponent is not None:
    team_stat["WinCount"] += sign * int(curr_data["FTR"] == prefix)
    team_stat["LoseCount"] += sign * int(curr_data["FTR"] == opponent)
    team_stat["Draw"] += sign * int(curr_data["FTR"] == "D")

    # Half-time result is judged from the team's own side as well: for an away
    # team a half-time "A" is a win, not a loss.
    team_stat["HTWinCount"] += sign * int(curr_data["HTR"] == prefix)
    team_stat["HTLoseCount"] += sign * int(curr_data["HTR"] == opponent)
    team_stat["HTDraw"] += sign * int(curr_data["HTR"] == "D")

  # running sums; the caller turns them into means by dividing by games_count
  for col in stats_cols:
    team_stat[col] += sign * curr_data[col.replace("_", prefix)]
  return team_stat

def update_data(curr_data, team_stat, prefix, index):
  """Add the match `curr_data` to `team_stat` and remember `index` in its queue.

  Args:
    curr_data: one match row (anything indexable by column name).
    team_stat: the team's accumulator dict; it is modified in place.
    prefix: "H" if the team played at home in this match, "A" if away.
    index: identifier of the match, queued in team_stat["used_indxs"].

  Returns:
    The same `team_stat` dict.
  """
  team_stat["used_indxs"].put(index)
  return _accumulate_match(curr_data, team_stat, prefix, 1)

def remove_data(curr_data, team_stat, prefix):
  """Subtract the match `curr_data` from `team_stat` (exact inverse of update_data's totals).

  The "used_indxs" queue is not touched here.

  Returns:
    The same `team_stat` dict.
  """
  return _accumulate_match(curr_data, team_stat, prefix, -1)

def _new_team_stat():
  """Create an empty per-team accumulator (match queue, game counter, zeroed totals)."""
  stat = {"used_indxs": Queue(), "games_count": 0}
  for col in stats_cols:
    stat[col] = 0.0
  for col in new_stats_cols:
    stat[col] = 0.0
  return stat

def _evict_old_matches(data, team_stats, team, diff):
  """Remove the oldest queued matches of `team` until at most `diff` remain."""
  while team_stats[team]["games_count"] > diff:
    # get_nowait: fail loudly instead of blocking forever if the queue is empty
    old_pos = team_stats[team]["used_indxs"].get_nowait()
    old_row = data.iloc[old_pos]
    side = "H" if old_row["HomeTeam"] == team else "A"
    team_stats[team] = remove_data(old_row, team_stats[team], side)

def preapre_data(data, window_type=Window.SEASON, not_train_season="2000", diff=1):
  """Build one feature record per match from statistics of the teams' earlier matches.

  Rows are processed in the order given. For every match the features are taken
  from the accumulated statistics *before* the match itself is added to them.

  Args:
    data: match table; rows must already be in chronological order.
    window_type: Window.SEASON or Window.TOUR.
      SEASON - when the season changes, the matches of the season lying `diff`
      seasons before the previous one are removed from the totals ("Season" must
      look like "<int>-<int>").
      TOUR - each team keeps at most its `diff` most recent matches.
    not_train_season: rows whose "Season" string contains this substring only feed
      the statistics (warm-up) and produce no record.
    diff: window size, see `window_type`.

  Returns:
    list of dicts, one per non-warm-up row. Every key except "target" ends with
    "_<window_type.value>_<diff>". "target" is 0 for FTR == "H", 1 for "A", 2 otherwise.
    Mean features are 0 for a team with no matches in the window.

  Raises:
    Exception: `window_type` is neither Window.SEASON nor Window.TOUR.
  """
  results = []
  add_column_name = "_" + window_type.value + "_" + str(diff)

  # one accumulator per team that occurs in the table being processed
  team_stats = dict()
  for team in set(data["AwayTeam"].unique()) | set(data["HomeTeam"].unique()):
    team_stats[team] = _new_team_stat()

  prev_season = None
  # Matches are remembered by row position, so the later data.iloc lookup is
  # valid for any index of `data`.
  for pos, (_, row) in enumerate(data.iterrows()):
    home, away = row["HomeTeam"], row["AwayTeam"]

    if not_train_season in row["Season"]:
      # warm-up row: only collect statistics
      team_stats[home] = update_data(row, team_stats[home], "H", pos)
      team_stats[away] = update_data(row, team_stats[away], "A", pos)
      prev_season = row["Season"]
      continue

    # remove old data (look at window type)
    if window_type == Window.SEASON:
      # remove_season | prev season | curr season
      # prev_season is None when the table does not start with a warm-up row:
      # nothing has been accumulated yet, so there is nothing to remove.
      if prev_season is not None and prev_season != row["Season"]:
        y1, y2 = map(int, prev_season.split("-"))
        remove_season = str(y1-diff) + "-" + str(y2-diff)
        for _, old_row in data[data["Season"] == remove_season].iterrows():
          old_home, old_away = old_row["HomeTeam"], old_row["AwayTeam"]
          team_stats[old_home] = remove_data(old_row, team_stats[old_home], "H")
          team_stats[old_away] = remove_data(old_row, team_stats[old_away], "A")
    elif window_type == Window.TOUR:
      # removed matches | last `diff` matches | curr match
      _evict_old_matches(data, team_stats, home, diff)
      _evict_old_matches(data, team_stats, away, diff)
    else:
      raise Exception("Unknown window type")

    # feature record for the current row
    x = dict()
    x["target"] = {"H": 0, "A": 1}.get(row["FTR"], 2)

    sides = (("H", home), ("A", away))

    # window means of the numeric statistics
    for col in stats_cols:
      for side, team in sides:
        games = team_stats[team]["games_count"]
        key = col.replace("_", side) + add_column_name
        x[key] = team_stats[team][col] / games if games != 0 else 0

    # window sums of the result counters
    for col in new_stats_cols:
      for side, team in sides:
        x[side + col + add_column_name] = team_stats[team][col]

    # values of the current match itself
    for col in current_stats_cols:
      x[col+add_column_name] = row[col]
    for col in cat_cols:
      x[col+add_column_name] = row[col]

    results.append(x)

    prev_season = row["Season"]

    # only now add the current match to both teams' statistics
    team_stats[home] = update_data(row, team_stats[home], "H", pos)
    team_stats[away] = update_data(row, team_stats[away], "A", pos)

  return results

EDA на фичи в 1 сезон

In [None]:
# Features from a season window with diff=1; rows whose Season contains "2000" are warm-up only.
final_data = pd.DataFrame(preapre_data(df_sort_date, Window.SEASON, "2000", 1))

In [None]:
# Preview the season-window feature table.
final_data.head()

In [None]:
# Profiling report for the season-window features (shown as the cell output).
ProfileReport(final_data, title="Profiling Report")

EDA

1. Таргет - сильного дисбаланса нет, доминирует "Выиграла Home team"

2. Сильной корреляции у таргета с фичами нет. Максимальное абсолютное значение около 0.2 (где сильнейшими являются ставка букмекера или же название команды)

EDA на окно в 5 матчей

In [None]:
# Features from each team's last 5 matches; rows whose Season contains "2000" are warm-up only.
final_data_5_tour = pd.DataFrame(preapre_data(df_sort_date, Window.TOUR, "2000", 5))

In [None]:
# Preview the 5-match-window feature table.
final_data_5_tour.head()

In [None]:
# Profiling report for the 5-match-window features (shown as the cell output).
ProfileReport(final_data_5_tour, title="Profiling Report")

EDA на окно в 1 матч

In [None]:
# Features from each team's single most recent match; rows whose Season contains "2000" are warm-up only.
final_data_1_tour = pd.DataFrame(preapre_data(df_sort_date, Window.TOUR, "2000", 1))

In [None]:
# Preview the 1-match-window feature table.
final_data_1_tour.head()

In [None]:
# Display the 1-match-window feature table itself (the previous cell already shows its head).
final_data_1_tour

In [None]:
# Profiling report for the 1-match-window features (shown as the cell output).
ProfileReport(final_data_1_tour, title="Profiling Report")

Обучение

In [None]:
# Target and feature matrix for the season-window data set.
y = final_data["target"]
# preapre_data appends the window suffix to every feature column (e.g. "Season_season_1"),
# so the Season / Date columns are located by prefix rather than by their bare names.
season_date_cols = [c for c in final_data.columns if c.startswith(("Season", "Date"))]
X = final_data.drop(columns=["target"] + season_date_cols)

In [None]:
# Hold out 10% of the rows for testing; the split uses a fixed seed and is stratified by the target.
# NOTE(review): the rows are chronologically ordered matches, so a random split puts matches
# played after some test matches into the training set — confirm this is intended, or use a
# time-based hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Sizes of the train and test feature matrices.
X_train.shape, X_test.shape

In [None]:
# The feature columns carry the window suffix added by preapre_data (e.g. "HomeTeam_season_1"),
# so the categorical columns are resolved by prefix from the actual column names.
cat_features = [c for c in X_train.columns if c.startswith(("HomeTeam", "AwayTeam", "Referee"))]

# CatBoost data sets for training and evaluation, sharing the same categorical columns.
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

In [None]:
# Baseline multi-class CatBoost model with fixed, hand-picked hyperparameters.
base_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    verbose=10,  # log every 10th iteration
    random_seed=42
)

In [None]:
# Train the baseline model, tracking the loss on test_pool during training.
# NOTE(review): test_pool is also the set the metrics below are computed on. CatBoost, by
# default, appears to keep the best iteration according to eval_set (use_best_model), which
# would make those metrics optimistic — confirm, or evaluate on a separate validation split.
base_model.fit(train_pool, eval_set=test_pool, plot=True)

In [None]:
# Predict classes (and class probabilities) for the held-out rows with the baseline model.
y_pred_base = base_model.predict(X_test)
# NOTE(review): y_pred_proba_base is not used in the cells visible here.
y_pred_proba_base = base_model.predict_proba(X_test)

# Accuracy and support-weighted F1 on the held-out rows.
accuracy = accuracy_score(y_test, y_pred_base)
f1 = f1_score(y_test, y_pred_base, average='weighted')
print(accuracy, f1)

In [None]:
# Model used for the hyperparameter grid search; the tuned parameters come from the grid below.
gs_model = CatBoostClassifier(
    loss_function='MultiClass',
    verbose=10,  # log every 10th iteration
    random_seed=42
)

In [None]:
# Hyperparameter grid: 3 * 4 * 5 * 3 * 2 = 360 combinations, so this cell is expensive to run.
grid = {'learning_rate': [0.01, 0.05, 0.1],
        'depth': [4, 6, 8, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'iterations': [100, 300, 500],
        'grow_policy': ['SymmetricTree', 'Depthwise']}

# Search the grid using the training pool only.
# NOTE(review): the next cell predicts with gs_model directly, which relies on grid_search
# leaving the model fitted with the best parameters — presumably its default refit behaviour; confirm.
grid_search_result = gs_model.grid_search(grid,
                                       X=train_pool,
                                       plot=True)

In [None]:
# Predict classes for the held-out rows with the grid-searched model.
y_pred_gs = gs_model.predict(X_test)

# Accuracy and support-weighted F1; note these overwrite the baseline's `accuracy` and `f1`.
accuracy = accuracy_score(y_test, y_pred_gs)
f1 = f1_score(y_test, y_pred_gs, average='weighted')
print(accuracy, f1)

acc = 0.5370985603543743 f1 = 0.46558453429686003