In [None]:
#| default_exp datastructure.ts_format

In [None]:
#| hide

from IPython.core.debugger import set_trace

%load_ext autoreload
%autoreload 2

# Time Series (ts) format 
> Transform a pandas DataFrame into the ts (time series) file format.

In [None]:
# | export

import pandas as pd
from pathlib import Path
from typing import Dict, Tuple
from soccer_prediction_data.config.mongo import mongo_init
from soccer_prediction_data.datastructure.data_extractor import (
    STATS,
    COMPETITION_IDS,
    data_aggregator,
)

We set up the metadata for the ts files.

In [None]:
# | export

# Header values written at the top of every generated ts file.
FILE_METADATA = dict(
    source="InStat",
    creator="Real-Analytics",
    # Continuation lines carry their own leading "#", like the other "#"-prefixed header lines.
    data_description=(
        "More than 5 years of football data from leagues across the world that "
        "includes stats from both competing teams in each played game in each dimension.\n"
        "#The data describes a series of numbers achieved by each squad, which "
        "indicate their athletic performance.\n"
        "#This yields a database of almost 52k time series."
    ),
    task="The task is to  predict the outcomes of a set of soccer matches from leagues worldwide.",
    problem_name="soccer-preds",
    # Flags are stored as the literal strings that end up in the header.
    time_stamps="false",
    missing="true",
    univariate="false",
    dimensions=1,
    equal_length="true",
    class_label="true",
)

## Split data

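We provide a function that splits the games of each competition into a train set and a test set, so that each set can later be stored in its own ts file.
The user specifies a cutoff in months: for every competition, the games played within that many months of the competition's most recent game form the test set, and all earlier games form the train set.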

In [None]:
def split_data(
    games: pd.DataFrame,  # All games; must contain `competitionName` and `gameDate` columns.
    cutoff_month: int,  # Number of months before each competition's latest game that form the test set.
) -> Tuple[pd.DataFrame, pd.DataFrame]:  # Train set and test set.
    "Split data into train/test sets using a per-competition cutoff date."

    train_parts = []
    test_parts = []
    # Handle every competition on its own so that each one gets its own cutoff date.
    for _, lg_games in games.groupby("competitionName"):
        lg_games = lg_games.sort_values("gameDate")
        # The test period starts `cutoff_month` months before the competition's latest game.
        test_set_date = lg_games["gameDate"].max() - pd.DateOffset(months=cutoff_month)
        # Games strictly before the cutoff go to train, the rest go to test.
        train_parts.append(lg_games[lg_games["gameDate"] < test_set_date])
        test_parts.append(lg_games[lg_games["gameDate"] >= test_set_date])

    if not train_parts:
        # No competition group at all (e.g. empty input): `pd.concat([])` would
        # raise a ValueError, so return two empty frames with the input's columns.
        empty = games.iloc[0:0]
        return empty.copy(), empty.copy()

    return pd.concat(train_parts), pd.concat(test_parts)

## TS File

Here we supply a function that writes game information to a file in the ts format.
Each game is written as two rows, one for the home team and one for the away team. A row holds that team's statistics for the game, followed by the period (in days) since the team's previous game.
After the `:` separator we append the target values: the gameId, the home and away team ids, and the goals scored by the team the row describes.

In [None]:
# | export


def _days_since_last_game(
    team_id,  # Identifier of the team playing the current game.
    game_date,  # Date of the current game.
    last_game_by_team: Dict,  # Latest game date seen so far for each team; updated in place.
) -> int:  # 0 when the team has not been seen before, otherwise elapsed days + 1.
    "Return the period since the team's previous game and record `game_date` as its latest game."
    period = 0
    if team_id in last_game_by_team:
        # NOTE(review): the `+ 1` presumably keeps a same-day game distinct from
        # the 0 used for "no previous game" — confirm.
        period = (game_date - last_game_by_team[team_id]).days + 1
    last_game_by_team[team_id] = game_date
    return period


def create_ts_file(
    df: pd.DataFrame,  # Pandas Dataframe input; rows are processed in the order given.
    file_path: str = ".",  # Directory in which the file is saved (created if missing).
    file_name: str = "games",  # File name, without the `.ts` extension.
    file_metadata: Dict = FILE_METADATA,  # File metadata; it is not modified.
) -> None:
    "Create a ts file from a Pandas dataframe."

    # Make sure the target directory exists.
    out_dir = Path(file_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Work on a copy: writing `series_length` into the caller's dict would
    # silently alter the shared module-level default.
    metadata = {**file_metadata, "series_length": len(STATS) + 1}

    # Build the header before opening the file, so a missing metadata key
    # does not leave a truncated file behind.
    header = "\n".join(
        (
            f'#Source: {metadata["source"]}',
            f'#Creator: {metadata["creator"]}',
            "#",
            "#Data Set Information:",
            "#",
            f'#{metadata["data_description"]}',
            "#",
            f'#{metadata["task"]}',
            f'@problemName {metadata["problem_name"]}',
            f'@timeStamps {metadata["time_stamps"]}',
            f'@missing {metadata["missing"]}',
            f'@univariate {metadata["univariate"]}',
            f'@dimensions {metadata["dimensions"]}',
            f'@equalLength {metadata["equal_length"]}',
            f'@seriesLength {metadata["series_length"]}',
            f'@classLabel {metadata["class_label"]}',
            "@data",
        )
    )

    with open(out_dir / f"{file_name}.ts", "w") as f:
        f.write(header)
        # Latest game date seen so far for each team, in iteration order.
        last_game_by_team = {}
        for _, row in df.iterrows():
            home_team_id = row["homeTeamId"]
            away_team_id = row["awayTeamId"]
            # Keep every "homeTeam*" / "awayTeam*" column except the first two
            # matches (presumably the id and name columns — verify against the
            # column order produced upstream).
            home_team_feats = row.filter(like="homeTeam").iloc[2:].tolist()
            away_team_feats = row.filter(like="awayTeam").iloc[2:].tolist()

            # Temporal feature. Home is handled first, as its date is recorded
            # before the away team is looked up.
            home_team_period = _days_since_last_game(
                home_team_id, row["gameDate"], last_game_by_team
            )
            away_team_period = _days_since_last_game(
                away_team_id, row["gameDate"], last_game_by_team
            )

            # Targets shared by both rows: gameId, home team id, away team id.
            targets = f"{row['gameId']},{home_team_id},{away_team_id}"
            # One row per team: "<features>,<period>:<targets>,<goals scored by that team>".
            h_data_str = f"{','.join(str(val) for val in home_team_feats)},{home_team_period}:{targets},{row['HS']}"
            a_data_str = f"{','.join(str(val) for val in away_team_feats)},{away_team_period}:{targets},{row['AS']}"
            # Write both rows; the leading newline ends the previous line.
            f.write("\n" + h_data_str + "\n" + a_data_str)

In [None]:
# Initialise the Mongo configuration before requesting any data.
# NOTE(review): "prod_atlas" looks like a production host — confirm this is the intended target.
mongo_init(db_host="prod_atlas")
# `future_games` is not used by the cells visible in this notebook.
# NOTE(review): `limit=1000` presumably caps the number of fetched games — confirm.
games, future_games = data_aggregator(
    competition_ids=COMPETITION_IDS, stats_attributes=STATS, limit=1000
)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Hold out the final month of every competition as the test set.
train_data, test_data = split_data(games=games, cutoff_month=1)

In [None]:
# Write one ts file per split into the current directory, train first.
for split_name, split_df in (("train_data", train_data), ("test_data", test_data)):
    create_ts_file(
        df=split_df,
        file_path=".",
        file_name=split_name,
        file_metadata=FILE_METADATA,
    )

In [None]:
#| hide

# Imported here, in the hidden final cell, because it is only needed for the export step.
import nbdev

# Export the notebook cells marked for export to the Python package.
nbdev.nbdev_export()