In [None]:
#| default_exp datastructure.data_extractor

In [None]:
#| hide

from IPython.core.debugger import set_trace

%load_ext autoreload
%autoreload 2

# Data Extractor
> Extract games and their features from multiple DB collections.

In [None]:
#| export

import pandas as pd
from tqdm.auto import tqdm
from typing import List, Tuple
from soccer_prediction_data.config.mongo import mongo_init
from soccer_prediction_data.datastructure.fixture import * 
from soccer_prediction_data.datastructure.team_stats import *

We provide a collection of stats for each team as well as a list of competitions that we are interested in.

Here for the team stats, we have chosen these attributes:

**Ball possession in own half**: The total time of ball possession in the half of the pitch nearest to the team's own goal.

**Ball possession in opp. half**: The total time of ball possession in the half of the pitch nearest to the opponent's goal.

**Challenges / won**: Successful challenge is registered for a player of a team that keeps possession of a ball after such challenge.

**Challenges lost**: A successful challenge is registered for a player of the opponent team that keeps possession of the ball after such a challenge.

**Challenge intensity index**: The number of challenges and interceptions made by the defending team per minute of opposition possession.

**Passes accurate**: The number of correct passes.

**Accurate passes into the final third of the pitch**: The number of correct passes in the part of the field nearest to the opponent's goal.

**Passes into the penalty box**: The number of passes inside the opposition's penalty area.

**Diagonal passes**: The number of long passes forward from the vertical zones of own third.

**Key passes**: The number of passes that created a goal-scoring opportunity.

**Crosses accurate**:  The number of correct medium- to long-range passes intended to create goal-scoring opportunities.

**Crosses to the near post - % efficiency**: The percentage of correct crosses that are aimed to arrive at the goal post nearest to the player who kicked the ball.

**Crosses into the six-yard box - % efficiency** : The percentage of correct crosses that are aimed to arrive at the six-yard box nearest to the player who kicked the ball.

**Shots on target**: The number of shots that are reflected by the opponent goalkeeper.

**GK actions - Shots saved**:  All shots on target saved by the team goalkeeper.

**xG per shot**: Average xG (probability of a goal being scored from a particular position with a shot of a specific type) for all shots of the team calculated by dividing the sum of xG for all shots by the number of shots.

**Chance TOTAL**: The number of goal-scoring opportunities, when the attacking team gets a clear-cut chance to score a goal.

**Chances, % of conversion**: Percentage share of chances successful in the total number of chances.

**Attacking mentality index**: The ratio between the active attacking actions of the team and the actions aimed at keeping and controlling the ball (non-attacking passes). A higher value of this index is evidence of a more attacking and direct style of play.

**Dribbles successful**: The number of successful active actions performed by a player in order to get past an opponent.

**Dribbles unsuccessful**: The number of unsuccessful active actions performed by a player in order to get past an opponent.

**Successful tackles**: The number of successful tackle attempts, as a result of which an opponent's player loses the ball while performing a dribble.

**Unsuccessful tackles**: The number of unsuccessful attempts of tackles.

**Interceptions / in opp. half**: The number of successful actions to prevent an opponent pass in the opponent half side.

**Ball recoveries / in opp. half**: The number of ball recoveries that occurred in the opponent's half of the pitch.

**Fouls**: The number of unfair actions.

**Lost balls / in own half**: The number of unsuccessful dribbles, bad ball controls, and non-attacking or attacking passes lost in the team's own half.

**Average distance to the goal at ball losses**: The average distance to the team's own goal at the moment the team loses the ball.

**Average distance to the goal at ball recoveries**: The average distance the team covers until ball recovery.

In [None]:
# |export

# Action names of the team stats kept as features. This list is the default
# value of `data_aggregator`'s `stats_attributes` parameter, where each entry
# is compared verbatim against a stat's `action_name`, so the spelling,
# casing and punctuation below matter.
STATS = [
    "Ball possession in own half",
    "Ball possession in opp. half",
    "Challenges / won",
    "Challenges lost",
    "Challenge intensity index",
    "Passes accurate",
    "Accurate passes into the final third of the pitch",
    "Passes into the penalty box",
    "Diagonal passes",
    "Key passes",
    "Crosses accurate",
    "Crosses to the near post - % efficiency",
    "Crosses into the six-yard box - % efficiency",
    "Shots on target",
    "GK actions - Shots saved",
    "xG per shot",
    "Chance TOTAL",
    "Chances, % of conversion",
    "Attacking mentality index",
    "Dribbles successful",
    "Dribbles unsuccessful",
    "Successful tackles",
    "Unsuccessful tackles",
    "Interceptions / in opp. half",
    "Ball recoveries / in opp. half",
    "Fouls",
    "Lost balls / in own half",
    "Average distance to the goal at ball losses",
    "Average distance to the goal at ball recoveries",
]

# Identifiers of the competitions to extract. This list is the default value
# of `data_aggregator`'s `competition_ids` parameter, which is forwarded to
# `Fixture.get_games_by_competition`. The trailing comment on each entry
# gives the "Country. Competition" the identifier stands for.
COMPETITION_IDS = [
    93,  # Argentina. Primera Division
    95,  # Australia. A-League
    45,  # Belgium. Jupiler Pro League
    78,  # Switzerland. Super League
    213,  # Chile. Primera Division
    193,  # Algeria. Ligue 1
    39,  # England. Premier League
    105,  # England. Championship
    123,  # England. League One
    76,  # England. League Two
    936,  # England. National League North
    37,  # France. Ligue 1
    110,  # France. Ligue 2
    496,  # France. Championnat National
    31,  # Germany. Bundesliga
    72,  # Germany. 2. Bundesliga
    464,  # Germany. 3. Liga
    29,  # Netherlands. Eredivisie
    24,  # Italy. Serie A
    80,  # Italy. Serie B
    112,  # Morocco. GNF 1
    108,  # Mexico. Liga MX
    86,  # Norway. Eliteserien
    28,  # Portugal. Liga NOS
    1,  # Russia. Premier League
    9,  # Russia. FNL
    103,  # Scotland. Premier League
    300,  # Scotland. Championship
    792,  # Scotland. League One
    903,  # Scotland. League Two
    20,  # Spain. Primera Division
    109,  # Spain. Segunda Division
    52,  # Sweden. Allsvenskan
    41,  # United States. MLS
    307,  # South Africa. PSL
]

## Aggregate Data


We provide a function that retrieves the list of games recorded in our `inStat.Fixture` MongoDB collection and aggregates them with the additional team features (stats) recorded in the `inStat.teamStats` collection.

In [None]:
#| export


def data_aggregator(
    competition_ids: List[int] = COMPETITION_IDS,  # Competitions to extract.
    stats_attributes: List[str] = STATS,  # Stats attributes to extract.
    limit: int = None,  # Number of rows to extract.
) -> Tuple:  # Mapped games (played games and future games).
    """Returns and aggregates games information from multiple Db collections.

    Games are read through `Fixture.get_games_by_competition` and split on
    their full-time score: a game without a usable score (missing, not a
    list, or fewer than two entries) is a *future* game, any other game is a
    *played* game. Each played game is then enriched with the selected stats
    of its home and away teams, read through `TeamStats.get_game_team_stats`
    and stored in `homeTeam*` / `awayTeam*` columns.

    Returns `(played_games, future_games)`. `played_games` is `None` when no
    played game is found, including when the query returns no game at all.
    """

    # Columns kept from the fixture documents, plus the two score columns.
    game_columns = [
        "gameId",
        "gameDate",
        "seasonName",
        "competitionName",
        "homeTeamId",
        "homeTeamName",
        "awayTeamId",
        "awayTeamName",
        "HS",
        "AS",
    ]

    # Built once so the per-stat membership test is O(1).
    wanted_stats = set(stats_attributes)

    def _split_score(
        score,  # Raw `fullTimeScore` value of a game.
    ) -> Tuple:  # (home goals, away goals), or (-1, -1) if unusable.
        "Returns the home and away goals of a full-time score."
        # Both entries are required: a one-element score used to raise
        # IndexError when reading the away goals.
        if isinstance(score, list) and len(score) > 1:
            return score[0], score[1]
        return -1, -1

    def _column_name(
        action_name: str,  # Stat name as stored in the DB.
    ) -> str:  # CamelCase-like column suffix.
        "Returns the column suffix used for a given stat name."
        # E.g. "Chances, % of conversion" -> "Chances%OfConversion".
        return (
            action_name.strip()
            .replace("- ", "")
            .replace(", ", "")
            .replace("/ ", "")
            .title()
            .replace(" ", "")
        )

    def _team_stats(
        game_id: int,  # Instat game identifier.
        team_id: int,  # Instat team identifier.
    ) -> pd.DataFrame:  # Team Stats
        "Returns stats of a given team in a given game."

        # Team features.
        team_feats = TeamStats.get_game_team_stats(
            game_id=game_id,
            team_id=team_id,
        )
        if team_feats is None:
            # One empty row so the frame can still be concatenated.
            return pd.DataFrame(index=[0])

        # Team stats, restricted to the requested attributes.
        team_stats = {
            _column_name(stat.action_name): stat.value
            for stat in team_feats.stats
            if stat.action_name in wanted_stats
        }

        return pd.DataFrame(team_stats, index=[0])

    def _one_game(row) -> pd.DataFrame:
        "Returns the one-row frame of home and away stats of a game."
        ht_stats = _team_stats(
            game_id=row["gameId"], team_id=row["homeTeamId"]
        ).add_prefix("homeTeam")

        at_stats = _team_stats(
            game_id=row["gameId"], team_id=row["awayTeamId"]
        ).add_prefix("awayTeam")

        res = pd.concat([ht_stats, at_stats], axis=1)
        # Key used to merge the stats back onto the games.
        res["gameId"] = row["gameId"]

        return res

    # Extract games.
    games = Fixture.get_games_by_competition(
        competition_ids=competition_ids, limit=limit
    )
    games = pd.DataFrame(games.as_pymongo())

    # An empty result has no columns at all; bail out before indexing it.
    if games.empty:
        return None, pd.DataFrame(columns=game_columns)

    # Map results {HS: Home goals scored, AS: Away goals scored}.
    scores = [_split_score(score) for score in games["fullTimeScore"]]
    games["HS"] = [home for home, _ in scores]
    games["AS"] = [away for _, away in scores]

    # Filter Data.
    games = games[game_columns]

    # Split on the -1 sentinel set for games without a usable score.
    future_games = games[games.HS == -1].copy()
    played_games = games[games.HS != -1].copy()

    if played_games.empty:
        return None, future_games

    # Compute other features (one DB lookup per team and per game).
    stats = pd.concat(
        [
            _one_game(row)
            for _, row in tqdm(
                played_games.iterrows(), total=played_games.shape[0]
            )
        ]
    ).reset_index(drop=True)

    played_games = played_games.merge(stats, on="gameId", how="left")

    return played_games, future_games

In [None]:
# Presumably sets up the MongoDB connection used by the query classes;
# "prod_atlas" is the host entry passed to the project's `mongo_init`.
mongo_init(db_host="prod_atlas")

# Demo run: `limit=1` keeps the extraction to a single row.
played_games, future_games = data_aggregator(
    competition_ids=COMPETITION_IDS, stats_attributes=STATS, limit=1
)
# Trailing expression, so the notebook renders the resulting frame.
played_games

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,gameId,gameDate,seasonName,competitionName,homeTeamId,homeTeamName,awayTeamId,awayTeamName,HS,AS,...,awayTeamBallPossessionInOwnHalf,awayTeamBallPossessionInOpp.Half,awayTeamCrossesIntoTheSix-YardBox%Efficiency,awayTeamChanceTotal,awayTeamChances%OfConversion,awayTeamAccuratePassesIntoTheFinalThirdOfThePitch,awayTeamChallengesLost,awayTeamDribblesUnsuccessful,awayTeamUnsuccessfulTackles,awayTeamXgPerShot
0,1108249,2018-02-03 02:00:00,2018,Chile. Primera Division,1473,Universidad Catolica,1469,Deportes Temuco,2,1,...,844,567,0,7,14,108,97,11,20,0


In [None]:
#| hide

import nbdev

# Run nbdev's export step (writes the cells marked for export to the module).
nbdev.nbdev_export()