In [1]:
import itertools
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

import network
from preprocess_data import (
    GAMES,
    MAX_PENALTY_NUMBER,
    build_networks,
    Game,
    Venue,
    Situation,
    PowerPlay,
)

In [2]:
def regular_networks():
    """
    Lazily yield the analysis-ready position pass network of each game and venue
    during regular play.

    Game/venue combinations for which ``build_networks`` returns ``None`` are skipped.
    """
    venues = (Venue.HOME, Venue.AWAY)
    for game in GAMES:
        for venue in venues:
            built = build_networks(Game(game), venue=venue, situation=Situation.REGULAR, pp=None)
            if built is None:
                continue
            yield network.process_graph_for_analysis(built["position_pass_network"])


def power_play_networks():
    """
    Lazily yield the analysis-ready position pass network of each power play.

    Every game / venue / power-play number (1 through MAX_PENALTY_NUMBER) combination
    is tried. Combinations for which ``build_networks`` returns ``None``, or whose
    position pass network has no nodes, are skipped.
    """
    venues = (Venue.HOME, Venue.AWAY)
    power_plays = [PowerPlay(number) for number in range(1, MAX_PENALTY_NUMBER + 1)]
    for game in GAMES:
        for venue in venues:
            for pp in power_plays:
                built = build_networks(Game(game), venue=venue, situation=Situation.POWER_PLAY, pp=pp)
                if built is None:
                    continue
                graph = built["position_pass_network"]
                if len(graph.nodes) == 0:
                    continue
                yield network.process_graph_for_analysis(graph)

In [3]:
"G.name are formatted like '2022-02-08 Canada at USA_home_pp2'"


def match_and_venue(G) -> str:
    """
    Return the '<match>_<venue>' prefix of ``G.name``.

    ``G.name`` must consist of exactly three '_'-separated fields; the third one
    is dropped. Any other number of fields raises ``ValueError``.
    """
    match_name, venue_name, _suffix = G.name.split("_")
    return "_".join((match_name, venue_name))


def match(G) -> str:
    """Return the match part of ``G.name``: everything before its first underscore."""
    match_name, _separator, _rest = G.name.partition("_")
    return match_name


def pp_number(G) -> int:
    """
    Return the power-play number encoded as the trailing digits of ``G.name``.

    For example, a graph named '2022-02-08 Canada at USA_home_pp2' yields 2.
    All trailing digits are read, so a two-digit suffix such as 'pp12' is parsed
    as 12 rather than being truncated to its last digit.

    Raises:
        ValueError: if the name does not end in a positive integer.
    """
    name = G.name
    # Whatever rstrip removes from the right-hand side is the run of trailing digits.
    digits = name[len(name.rstrip("0123456789")):]
    # Explicit check instead of `assert`, which is removed when Python runs with -O.
    if not digits or int(digits) < 1:
        raise ValueError(f"graph name {name!r} does not end in a power-play number")
    return int(digits)


def covariates(feature: callable) -> pd.DataFrame:
    """
    Build the design matrix with one row per power-play network.

    Columns:
        game_str: name of the power-play network.
        normal: ``feature`` of the regular-play network with the same match and venue.
        pp: ``feature`` of the power-play network itself.
    """
    # Evaluate the feature once per regular-play network, keyed by '<match>_<venue>'.
    regular_feature_map = {}
    for regular_graph in regular_networks():
        key = match_and_venue(regular_graph)
        regular_feature_map[key] = feature(regular_graph)

    records = []
    for pp_graph in power_play_networks():
        records.append(
            {
                "game_str": pp_graph.name,
                "normal": regular_feature_map[match_and_venue(pp_graph)],
                "pp": feature(pp_graph),
            }
        )
    return pd.DataFrame(records)


def goal_labels() -> list:
    """
    Return one 0/1 label per power-play network, in ``power_play_networks()`` order.

    Labels come from the ``goal`` column of ``pp_info.csv`` ("t" maps to 1, "f" to 0),
    looked up by the match name and power-play number parsed from each network's name.
    """
    pp_info = pd.read_csv("external/Big-Data-Cup-2022/data/pp_info.csv", comment="#")
    flag_to_label = {"t": 1, "f": 0}

    # Maps (game_name, penalty_number) to the 0/1 label of that power play.
    label_by_pp = {}
    for row in pp_info.itertuples():
        key = (row.game_name, row.penalty_number)
        label_by_pp[key] = flag_to_label[row.goal]

    labels = []
    for G in power_play_networks():
        labels.append(label_by_pp[(match(G), pp_number(G))])
    return labels

def shot_labels() -> list:
    """
    Placeholder for shot labels (presumably the shot counterpart of ``goal_labels``).

    Raises:
        NotImplementedError: always; the labels have not been implemented yet.
    """
    # TODO: implement. The definition previously had no body at all, which is a
    # syntax error that prevented the whole cell from executing.
    raise NotImplementedError("shot_labels is not implemented yet")


def get_X_y(feature: callable) -> pd.DataFrame:
    """
    Return the design matrix for ``feature`` with the response appended.

    The frame produced by ``covariates(feature)`` gains a ``goal`` column holding
    the values returned by ``goal_labels()``.
    """
    design = covariates(feature)
    labels = goal_labels()
    design["goal"] = labels
    return design

In [4]:
# Preview the first ten rows of the design matrix (with goal labels) for the assortativity feature.
get_X_y(network.assortativity).head(10)

Unnamed: 0,game_str,normal,pp,goal
0,2022-02-08 Canada at USA_home_pp2,0.135224,-0.342617,0
1,2022-02-08 Canada at USA_home_pp3,0.135224,-0.145563,0
2,2022-02-08 Canada at USA_home_pp4,0.135224,-0.358569,1
3,2022-02-08 Canada at USA_home_pp5,0.135224,0.121781,0
4,2022-02-08 Canada at USA_home_pp6,0.135224,-0.282587,0
5,2022-02-08 Canada at USA_away_pp1,0.0735,-0.25,1
6,2022-02-08 ROC at Finland_home_pp1,0.177693,-0.113516,1
7,2022-02-08 ROC at Finland_home_pp4,0.177693,0.213201,1
8,2022-02-08 ROC at Finland_home_pp6,0.177693,-0.31598,0
9,2022-02-08 ROC at Finland_away_pp2,-0.060644,1.0,0


In [5]:
def fit(feature: callable):
    """
    Fit a logistic regression of ``goal`` on the ``normal`` and ``pp`` columns
    (plus an intercept) of ``get_X_y(feature)`` and print the model summary.
    """
    data = get_X_y(feature)
    design = sm.add_constant(data[["normal", "pp"]])
    result = sm.Logit(data["goal"], design).fit()
    print(result.summary())

In [6]:
# Fit and print one logistic regression per network feature.
for feature in [
    network.assortativity,
    network.clustering,
    network.connectivity,
    network.number_connected_components,
]:
    print(f"Feature: {feature.__name__}")
    fit(feature)
    print("\n\n")

Feature: assortativity
Optimization terminated successfully.
         Current function value: 0.573276
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   goal   No. Observations:                   33
Model:                          Logit   Df Residuals:                       30
Method:                           MLE   Df Model:                            2
Date:                Fri, 01 Dec 2023   Pseudo R-squ.:                 0.02163
Time:                        10:24:20   Log-Likelihood:                -18.918
converged:                       True   LL-Null:                       -19.336
Covariance Type:            nonrobust   LLR p-value:                    0.6581
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2899      0.631     -2.044      0.041      -2.527      -0.053
normal