In [None]:
import itertools
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

import network
from preprocess_data import (
    GAMES,
    MAX_PENALTY_NUMBER,
    build_networks,
    Game,
    Venue,
    Situation,
    PowerPlay,
)

In [None]:
def regular_networks():
    """
    Yield the processed position pass network of regular play for each game and venue.

    Games are visited in ``GAMES`` order, with the home venue before the away venue.
    Combinations for which ``build_networks`` returns ``None`` are skipped.
    """
    for game_id, side in itertools.product(GAMES, (Venue.HOME, Venue.AWAY)):
        built = build_networks(Game(game_id), venue=side, situation=Situation.REGULAR, pp=None)
        if built is None:
            continue
        yield network.process_graph_for_analysis(built["position_pass_network"])


def power_play_networks():
    """
    Yield the processed position pass network of every power play, per game and venue.

    Power plays numbered 1 through ``MAX_PENALTY_NUMBER`` are tried for each
    (game, venue) pair. A combination is skipped when ``build_networks`` returns
    ``None`` or when the resulting position pass network has no nodes.
    """
    penalties = [PowerPlay(number) for number in range(1, MAX_PENALTY_NUMBER + 1)]
    for game_id, side, power_play in itertools.product(GAMES, (Venue.HOME, Venue.AWAY), penalties):
        built = build_networks(Game(game_id), venue=side, situation=Situation.POWER_PLAY, pp=power_play)
        if built is None:
            continue
        graph = built["position_pass_network"]
        if len(graph.nodes) == 0:
            # An empty network carries no information for the analysis.
            continue
        yield network.process_graph_for_analysis(graph)

In [None]:
"G.name values are formatted like '2022-02-08 Canada at USA_home_pp2'"


def match_and_venue(G) -> str:
    """
    Return the "<game>_<venue>" prefix of a network name.

    ``G.name`` must consist of exactly three underscore-separated parts
    (game, venue, situation suffix); the suffix is dropped. The result is a
    single string, suitable as a dictionary key shared by the regular-play and
    power-play networks of the same team in the same game.

    Raises:
        ValueError: if ``G.name`` does not split into exactly three parts.
    """
    game, venue, _ = G.name.split("_")
    return f"{game}_{venue}"


def match(G) -> str:
    """Return the part of ``G.name`` before the first underscore (the game name)."""
    game_name, _, _ = G.name.partition("_")
    return game_name


def pp_number(G) -> int:
    """
    Return the power-play number encoded as the final character of ``G.name``.

    Only the single digits 1 through 8 are accepted; anything else trips the assertion.
    """
    last_char = G.name[-1]
    assert last_char in "12345678"
    return int(last_char)


def covariates(feature: callable) -> pd.DataFrame:
    """
    Assemble the design matrix for one network feature.

    Each row corresponds to one power-play network and holds:
      * ``game_str`` - the name of the power-play network,
      * ``normal``   - the feature evaluated on the regular-play network of the
                       same game and venue,
      * ``pp``       - the feature evaluated on the power-play network itself.

    A ``KeyError`` is raised if a power-play network has no regular-play
    counterpart with the same game and venue.
    """
    # Feature of the regular-play network, keyed by "<game>_<venue>".
    baseline_by_team = {}
    for regular_graph in regular_networks():
        baseline_by_team[match_and_venue(regular_graph)] = feature(regular_graph)

    rows = []
    for pp_graph in power_play_networks():
        baseline = baseline_by_team[match_and_venue(pp_graph)]
        rows.append({"game_str": pp_graph.name, "normal": baseline, "pp": feature(pp_graph)})
    return pd.DataFrame(rows)


def goal_labels() -> list:
    """
    Return one 0/1 goal label per power-play network, in ``power_play_networks()`` order.

    Labels are read from ``pp_info.csv`` (lines starting with ``#`` are ignored),
    using its ``game_name``, ``penalty_number`` and ``goal`` columns; ``goal`` is
    expected to be ``"t"`` or ``"f"``. A ``KeyError`` is raised for any other
    goal value or for a power play missing from the file.
    """
    flag_to_label = {"t": 1, "f": 0}
    penalties = pd.read_csv("external/Big-Data-Cup-2022/data/pp_info.csv", comment="#")

    # (game_name, penalty_number) -> 1 if a goal was scored, else 0
    outcome_by_power_play = {}
    for record in penalties.itertuples():
        key = (record.game_name, record.penalty_number)
        outcome_by_power_play[key] = flag_to_label[record.goal]

    labels = []
    for G in power_play_networks():
        labels.append(outcome_by_power_play[(match(G), pp_number(G))])
    return labels


def get_X_y(feature: callable) -> pd.DataFrame:
    """
    Return the design matrix for ``feature`` with the goal labels attached.

    The result has the columns produced by ``covariates`` plus a ``goal`` column
    filled positionally from ``goal_labels()``.
    """
    return covariates(feature).assign(goal=goal_labels())

In [None]:
# Preview the first 10 rows of the design matrix (with goal labels) for the assortativity feature.
get_X_y(network.assortativity).head(10)

In [None]:
def fit(feature: callable):
    """
    Fit a logistic regression of ``goal`` on the feature and print the summary.

    The regressors are an intercept plus the ``normal`` and ``pp`` columns of
    ``get_X_y(feature)``. Nothing is returned; the statsmodels summary is printed.
    """
    data = get_X_y(feature)
    design = sm.add_constant(data[["normal", "pp"]])
    result = sm.Logit(data["goal"], design).fit()
    print(result.summary())

In [None]:
# Fit and report one logistic regression per graph-level feature.
for feature in (
    network.assortativity,
    network.clustering,
    network.connectivity,
    network.number_connected_components,
):
    print(f"Feature: {feature.__name__}")
    fit(feature)
    print("\n\n")