In [3]:
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

import network
from network import Weight
from preprocess_data import Game, PowerPlay, Situation, Venue, read_networks

In [4]:
def regular_networks():
    """Yield, one at a time, the networks `read_networks` returns for `Situation.REGULAR`."""
    yield from read_networks(Situation.REGULAR)


def power_play_networks():
    """Yield, one at a time, the networks `read_networks` returns for `Situation.POWER_PLAY`."""
    yield from read_networks(Situation.POWER_PLAY)


def penalty_kill_networks():
    """Yield, one at a time, the networks `read_networks` returns for `Situation.PENALTY_KILL`."""
    yield from read_networks(Situation.PENALTY_KILL)

In [20]:
def fit(df: pd.DataFrame, covariates: List[str], target: str, logistic: bool = False):
    """
    Regress `target` on `covariates` plus an intercept and print the fit summary.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the covariate columns and the target column.
    covariates : List[str]
        Names of the columns of `df` used as regressors.
    target : str
        Name of the column of `df` used as the response.
    logistic : bool, default False
        Fit `sm.Logit` when True, `sm.OLS` otherwise.

    Returns
    -------
    The fitted statsmodels results object whose summary was printed, so that
    callers can inspect coefficients and p-values instead of only reading the
    printed table.
    """
    # add_constant appends the intercept column to the selected regressors.
    X = sm.add_constant(df[covariates])
    y = df[target]
    model = sm.Logit(y, X) if logistic else sm.OLS(y, X)
    res = model.fit()
    print(res.summary())
    return res

# Match Outcome Prediction

In [21]:
def get_win_df(feature: callable) -> pd.DataFrame:
    """
    Tabulate `feature` against the match outcome for every regular network.

    One row is produced per network yielded by `regular_networks()`, with the
    columns `game_str` (the network's name), a column named after
    `feature.__name__` holding `feature(G)`, `log_n_passes` (natural log of
    `G.n_passes`) and `win` (`G.win`).
    """
    column = feature.__name__
    records = []
    for G in regular_networks():
        records.append(
            {
                "game_str": G.name,
                column: feature(G),
                "log_n_passes": np.log(G.n_passes),
                "win": G.win,
            }
        )
    return pd.DataFrame(records)

In [22]:
# Preview the outcome table for a single feature (network.degree_mean):
# one row per network yielded by regular_networks().
get_win_df(network.degree_mean)

Unnamed: 0,game_str,degree_mean,log_n_passes,win
0,2022-02-08 Canada at USA_reg_home,82.8,5.375278,0
1,2022-02-08 Canada at USA_reg_away,69.6,5.220356,1
2,2022-02-08 ROC at Finland_reg_home,98.8,5.57973,1
3,2022-02-08 ROC at Finland_reg_away,74.4,5.298317,0
4,2022-02-12 Switzerland at ROC_reg_home,102.0,5.583496,0
5,2022-02-12 Switzerland at ROC_reg_away,78.0,5.356586,1
6,2022-02-14 Switzerland at Canada_reg_home,127.2,5.802118,1
7,2022-02-14 Switzerland at Canada_reg_away,55.2,4.990433,0
8,2022-02-14 USA at Finland_reg_home,89.2,5.476464,0
9,2022-02-14 USA at Finland_reg_away,85.2,5.393628,1


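We control for the total number of passes, as in the paper. For `degree_mean`, however, it doesn't make sense to control for the number of passes: indeed, degree_mean $= 2n^{-1} \cdot$ n_passes, where $n$ is the number of nodes, so the mean degree is just a rescaling of the pass count.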

In [23]:
# Control covariates to add to each regression, keyed by the feature function.
# Every feature is controlled for "log_n_passes" except degree_mean, which
# gets no controls.
controls = {
    network.assortativity: ["log_n_passes"],
    network.clustering: ["log_n_passes"],
    network.degree_mean: [],
    network.degree_std: ["log_n_passes"],
}

We can ignore connectivity ($\gamma$) and the number of strongly connected components ($\sigma$) because our networks are all regular.

In [24]:
# Fit one logistic regression of "win" per feature, adding that feature's own
# control covariates. `controls` maps each feature function to its list of
# control columns, so the list is looked up per feature instead of
# concatenating the dict itself (a TypeError) or overwriting it for
# degree_mean (which would drop the controls on later iterations and re-runs).
for feature, feature_controls in controls.items():
    feature_name = feature.__name__
    print(f"Feature: {feature_name}")
    df = get_win_df(feature)
    fit(df, covariates=[feature_name] + feature_controls, target="win", logistic=True)
    print("\n\n")

Feature: assortativity
Optimization terminated successfully.
         Current function value: 0.604426
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    win   No. Observations:                   12
Model:                          Logit   Df Residuals:                        9
Method:                           MLE   Df Model:                            2
Date:                Sat, 09 Dec 2023   Pseudo R-squ.:                  0.1280
Time:                        15:29:55   Log-Likelihood:                -7.2531
converged:                       True   LL-Null:                       -8.3178
Covariance Type:            nonrobust   LLR p-value:                    0.3448
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const           -24.5086     19.818     -1.237      0.216     -63.351      14.3

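Conclusion: this doesn't work — the network features do not predict the match outcome here (for example, the assortativity model has an LLR p-value of 0.34 on only 12 observations).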

# Power Play Success Prediction

In [13]:
def match_and_venue(G) -> Tuple[Game, Venue]:
    """Return the `(G.game.game, G.venue)` pair used as a lookup key for network `G`."""
    match = G.game.game
    venue = G.venue
    return match, venue


def covariates(feature: callable) -> pd.DataFrame:
    """
    Build the power-play design matrix for `feature`.

    One row is produced per network yielded by `power_play_networks()`, with
    the columns:

    - `game_str`: the power-play network's name,
    - `normal`: `feature` evaluated on the regular network sharing the same
      `match_and_venue` key,
    - `pp`: `feature` evaluated on the power-play network itself,
    - `normal_n_passes`: `size(weight=Weight.N_PASSES.value)` of that regular network,
    - `pp_n_passes`: `size(weight=Weight.N_PASSES.value)` of the power-play network.

    Raises
    ------
    KeyError
        If a power-play network has no regular network with the same
        `match_and_venue` key.
    """
    weight = Weight.N_PASSES.value
    # Single pass over the regular networks: keep the feature value and the
    # pass count together so regular_networks() is consumed once, not twice.
    regular = {
        match_and_venue(G): (feature(G), G.size(weight=weight))
        for G in regular_networks()
    }
    rows = []
    for G in power_play_networks():
        normal_feature, normal_n_passes = regular[match_and_venue(G)]
        rows.append(
            {
                "game_str": G.name,
                "normal": normal_feature,
                "pp": feature(G),
                "normal_n_passes": normal_n_passes,
                "pp_n_passes": G.size(weight=weight),
            }
        )
    return pd.DataFrame(rows)


def goal_labels() -> list:
    """
    Return the 0/1 goal label of every power-play network, in iteration order.

    Labels are read from `pp_info.csv` (lines starting with `#` are skipped),
    whose `goal` column holds "t"/"f" flags, and are keyed by
    `(game_name, penalty_number)`. The result has one entry per network
    yielded by `power_play_networks()`.
    """
    pp_info = pd.read_csv("external/Big-Data-Cup-2022/data/pp_info.csv", comment="#")
    flag_to_label = {"t": 1, "f": 0}

    goal_by_power_play = {}
    for record in pp_info.itertuples():
        key = (record.game_name, record.penalty_number)
        # Flags are stripped of surrounding whitespace before the lookup.
        goal_by_power_play[key] = flag_to_label[record.goal.strip()]

    return [goal_by_power_play[(G.game.game, G.pp.penalty_no)] for G in power_play_networks()]


def get_pp_success_df(feature: callable) -> pd.DataFrame:
    """
    Return `covariates(feature)` extended with a `goal` column from
    `goal_labels()`, with every row containing a missing value removed.
    """
    labelled = covariates(feature).assign(goal=goal_labels())
    return labelled.dropna()

In [14]:
# Preview the power-play table for one feature. This cell previously called
# `get_X_y`, which is not defined anywhere in the notebook and fails on a
# fresh kernel; `get_pp_success_df` builds the frame shown in the output.
get_pp_success_df(network.assortativity)

  return (xy * (M - ab)).sum() / np.sqrt(vara * varb)


Unnamed: 0,game_str,normal,pp,normal_n_passes,pp_n_passes,goal
0,2022-02-08 Canada at USA_2_pp_home,0.120174,-0.441123,207.0,23.0,0
1,2022-02-08 Canada at USA_3_pp_home,0.120174,-0.098857,207.0,21.0,0
2,2022-02-08 Canada at USA_4_pp_home,0.120174,-0.828079,207.0,12.0,1
3,2022-02-08 Canada at USA_5_pp_home,0.120174,0.68313,207.0,11.0,0
4,2022-02-08 Canada at USA_6_pp_home,0.120174,-0.282587,207.0,29.0,0
5,2022-02-08 Canada at USA_1_pp_away,-0.102773,-0.25,174.0,11.0,1
6,2022-02-08 ROC at Finland_4_pp_home,0.041894,0.213201,247.0,10.0,1
7,2022-02-08 ROC at Finland_6_pp_home,0.041894,-0.492376,247.0,14.0,0
9,2022-02-08 ROC at Finland_5_pp_away,-0.120709,0.165145,186.0,12.0,0
10,2022-02-12 Switzerland at ROC_2_pp_home,-0.026035,-0.296007,255.0,21.0,0


In [6]:
# Logistic regression of the power-play goal label on each network feature,
# measured on the regular-play network ("normal") and on the power-play
# network ("pp"). `fit` takes a frame plus column names, not the feature
# function, so the frame is built here for each feature.
# NOTE(review): the recorded output of this cell ends with a
# PowerIterationFailedConvergence error after the second regression summary,
# presumably raised by network.eigenvector_centrality -- confirm and handle it.
for feature in [
    network.assortativity,
    network.clustering,
    network.eigenvector_centrality,
    network.betweenness_centrality,
    network.degree_mean,
    network.degree_std,
]:
    print(f"Feature: {feature.__name__}")
    fit(get_pp_success_df(feature), covariates=["normal", "pp"], target="goal", logistic=True)
    print("\n\n")

  return (xy * (M - ab)).sum() / np.sqrt(vara * varb)


Feature: assortativity
Optimization terminated successfully.
         Current function value: 0.552914
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   goal   No. Observations:                   30
Model:                          Logit   Df Residuals:                       27
Method:                           MLE   Df Model:                            2
Date:                Sat, 09 Dec 2023   Pseudo R-squ.:                 0.04656
Time:                        14:29:54   Log-Likelihood:                -16.587
converged:                       True   LL-Null:                       -17.397
Covariance Type:            nonrobust   LLR p-value:                    0.4448
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8509      0.556     -1.530      0.126      -1.941       0.239
normal

Optimization terminated successfully.
         Current function value: 0.569427
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                   goal   No. Observations:                   31
Model:                          Logit   Df Residuals:                       28
Method:                           MLE   Df Model:                            2
Date:                Sat, 09 Dec 2023   Pseudo R-squ.:                0.002794
Time:                        14:29:54   Log-Likelihood:                -17.652
converged:                       True   LL-Null:                       -17.702
Covariance Type:            nonrobust   LLR p-value:                    0.9517
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7736      2.236     -0.346      0.729      -5.155       3.608
normal        -0.9300      3.

PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')