In [None]:
from typing import Callable, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import pearsonr

import network
from network import Weight
from preprocess_data import Game, PowerPlay, Situation, Venue, read_networks

In [None]:
def regular_networks():
    """Return the result of ``read_networks`` for ``Situation.REGULAR``."""
    situation = Situation.REGULAR
    return read_networks(situation)


def power_play_networks():
    """Return the result of ``read_networks`` for ``Situation.POWER_PLAY``."""
    situation = Situation.POWER_PLAY
    return read_networks(situation)


def penalty_kill_networks():
    """Return the result of ``read_networks`` for ``Situation.PENALTY_KILL``."""
    situation = Situation.PENALTY_KILL
    return read_networks(situation)

In [None]:
def fit(df: pd.DataFrame, covariates: List[str], target: str, family: str = "binomial"):
    """
    Fits a regression of ``target`` on ``covariates`` plus an intercept and prints the statsmodels summary.

    Args:
        df (pd.DataFrame): data holding the covariate and target columns
        covariates (List[str]): names of the predictor columns
        target (str): name of the response column
        family (str): "binomial" (sm.Logit), "poisson" (sm.GLM with a Poisson family) or "gaussian" (sm.OLS)

    Raises:
        ValueError: if ``family`` is not one of the supported values
    """
    # Reject an unsupported family before touching the data, so the caller always gets this
    # error instead of whatever the design-matrix construction might raise first.
    if family not in ("binomial", "poisson", "gaussian"):
        raise ValueError(f"{family} regression not implemented")

    # add_constant appends the intercept column to the design matrix.
    X = sm.add_constant(df[covariates])
    y = df[target]

    if family == "binomial":
        model = sm.Logit(y, X)
    elif family == "poisson":
        model = sm.GLM(y, X, family=sm.families.Poisson())
    else:  # "gaussian" is the only remaining value after the validation above
        model = sm.OLS(y, X)

    res = model.fit()
    print(res.summary())

# Match Outcome Prediction

In [None]:
def get_win_df(features: List[Callable] | Callable) -> pd.DataFrame:
    """
    Collects one row per network returned by ``regular_networks()`` (regular situations only, i.e. no
    Power Play and no Penalty Kill), holding the pass counts, the win flag and the requested features.

    Args:
        features (List[Callable] | Callable): feature function(s); each is called with a network and its
            ``__name__`` becomes the column name

    Returns:
        pd.DataFrame: Dataframe with columns game_str, n_passes, log_n_passes, win, and one column per feature
    """
    # A bare callable is treated as a one-element list.
    feature_fns = [features] if callable(features) else features

    rows = []
    for G in regular_networks():
        # NOTE(review): np.log gives -inf when n_passes is 0, and DataFrame.dropna() does not remove -inf.
        row = {"game_str": G.name, "n_passes": G.n_passes, "log_n_passes": np.log(G.n_passes), "win": G.win}
        for fn in feature_fns:
            row[fn.__name__] = fn(G)
        rows.append(row)
    return pd.DataFrame(rows)

In [None]:
# Feature functions from the project's `network` module; each is called on a network by get_win_df / get_pp_df.
features = [
    network.assortativity,
    network.clustering,
    network.degree_mean,
    network.degree_std,
]
# One row per regular-play network, with every feature above as a column.
df = get_win_df(features)
# Bare expression: the notebook displays the frame.
df

We control for the total number of passes, as in the paper. For `degree_mean`, however, it doesn't make sense to control for the number of passes: `degree_mean` $= 2\,n^{-1}\,$`n_passes`, where $n$ is the number of nodes in the network, so the two are highly correlated.

In [None]:
# Pearson correlation coefficient between the n_passes and degree_mean columns.
pearsonr(df["n_passes"], df["degree_mean"]).statistic

It would not be surprising if `degree_std` were also highly correlated with `n_passes`, as we are dealing with non-negative count data (number of passes). However:

In [None]:
# Pearson correlation coefficient between the n_passes and degree_std columns.
pearsonr(df["n_passes"], df["degree_std"]).statistic

In [None]:
# Control columns added alongside each feature in the regressions that follow, keyed by feature function.
# degree_mean gets no pass-count control because it is itself a rescaling of n_passes.
controls = {
    network.assortativity: ["log_n_passes"],
    network.clustering: ["log_n_passes"],
    network.degree_mean: [],
    network.degree_std: ["log_n_passes"],
}

We don't look at connectivity ($\gamma$) or the number of strongly connected components ($\sigma$) because our networks are all regular.

In [None]:
# One logistic regression of `win` per feature, each with that feature's controls.
for feature in features:
    feature_name = feature.__name__
    print(f"Feature: {feature_name}")
    # Build the frame for this feature alone so rows are dropped only when this feature is missing.
    df = get_win_df(feature).dropna()
    fit(df, target="win", family="binomial", covariates=[feature_name, *controls[feature]])
    print("\n\n")

Conclusion: doesn't work. There are two possible explanations:
- We are using too little data
- The investigated features are not predictive

We have data from six games, giving twelve data points. Hence, it would not be surprising if that is not enough data.

# Power Play Success Prediction (Goal or No Goal)

In [None]:
def game_and_venue(G) -> Tuple[Game, Venue]:
    """Return the pair ``(G.game.game, G.venue)``; get_pp_df uses it as a lookup key between networks."""
    game, venue = G.game.game, G.venue
    return game, venue


def get_pp_df(features: List[Callable] | Callable) -> pd.DataFrame:
    """
    Collects one row per network returned by ``power_play_networks()``, for regressing on the success of the
    Power Play (goal or not) or on the number of shots. Each feature appears twice: computed on the Power Play
    network itself and on the regular network that shares its ``game_and_venue`` key.

    Args:
        features (List[Callable] | Callable): feature function(s); each is called with a network and its
            ``__name__`` is used in the column names

    Returns:
        pd.DataFrame: Dataframe with columns game_str, n_passes, log_n_passes, goal, n_shots,
            normal_<feature> for every feature, then pp_<feature> for every feature
    """
    # A bare callable is treated as a one-element list.
    feature_fns = [features] if callable(features) else features

    # Feature values of the regular networks, keyed by (game, venue) and then by feature function.
    regular_feature_map = {}
    for regular in regular_networks():
        key = game_and_venue(regular)
        regular_feature_map[key] = {fn: fn(regular) for fn in feature_fns}

    rows = []
    for G in power_play_networks():
        # NOTE(review): np.log gives -inf when n_passes is 0, and DataFrame.dropna() does not remove -inf.
        row = {
            "game_str": G.name,
            "n_passes": G.n_passes,
            "log_n_passes": np.log(G.n_passes),
            "goal": G.goal,
            "n_shots": G.n_shots,
        }
        # Regular-play values first, then the Power Play values, so the column order stays grouped.
        for fn in feature_fns:
            row[f"normal_{fn.__name__}"] = regular_feature_map[game_and_venue(G)][fn]
        for fn in feature_fns:
            row[f"pp_{fn.__name__}"] = fn(G)
        rows.append(row)
    return pd.DataFrame(rows)

In [None]:
# Preview the first seven rows of the power-play frame built with all features.
get_pp_df(features).head(7)

In [None]:
# One logistic regression of `goal` per feature, using both its regular-play and its power-play value.
for feature in features:
    feature_name = feature.__name__
    print(f"Feature: {feature_name}")
    # Build the frame for this feature alone so rows are dropped only when this feature is missing.
    df = get_pp_df(feature).dropna()
    covariates = [f"normal_{feature_name}", f"pp_{feature_name}", *controls[feature]]
    fit(df, target="goal", family="binomial", covariates=covariates)
    print("\n\n")

Also doesn't work.

# Predicting Number of Shots

Let's first see if `n_passes` predicts `n_shots`.

In [None]:
# Power-play frame with all features; only n_passes and n_shots are used in this fit.
df = get_pp_df(features)
# Poisson regression of the shot count on the raw pass count (plus the intercept added by fit).
fit(df, covariates=["n_passes"], target="n_shots", family="poisson")

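It does. The number of passes is a significant predictor of the number of shots (p<0.0005). The coefficient is 0.0563, which means that, all else equal, every additional pass multiplies the expected number of shots by $e^{0.0563}\approx 1.058$, i.e. an increase of about 5.8% (remember that we use Poisson regression, so effects are multiplicative rather than additive). Equivalently, roughly 18 additional passes ($1/0.0563\approx 17.8$) multiply the expected number of shots by $e\approx 2.72$, and about 12 additional passes ($\ln 2/0.0563\approx 12.3$) double it.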

Let's see if the other network features are also predictive of the number of shots.

In [None]:
# One Poisson regression of `n_shots` per feature, using only its power-play value plus controls.
for feature in features:
    feature_name = feature.__name__
    print(f"Feature: {feature_name}")
    # Build the frame for this feature alone so rows are dropped only when this feature is missing.
    df = get_pp_df(feature).dropna()
    covariates = [f"pp_{feature_name}", *controls[feature]]
    fit(df, target="n_shots", family="poisson", covariates=covariates)
    print("\n\n")

`degree_mean` is a significant predictor of `n_shots`. This is not surprising given the above as `degree_mean` is highly correlated with `n_passes`.

On the other hand, `assortativity` and `clustering` are also not predictive in this task.

We can also include the features from regular play (not power play) and see if the features become more predictive by being able to look at the contrast between normal play and power play.

In [None]:
# One Poisson regression of `n_shots` per feature, using its regular-play and power-play values plus controls.
for feature in features:
    feature_name = feature.__name__
    print(f"Feature: {feature_name}")
    # Build the frame for this feature alone so rows are dropped only when this feature is missing.
    df = get_pp_df(feature).dropna()
    covariates = [f"normal_{feature_name}", f"pp_{feature_name}", *controls[feature]]
    fit(df, target="n_shots", family="poisson", covariates=covariates)
    print("\n\n")