In [None]:
import itertools
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

from preprocess_data import *

import matplotlib.pyplot as plt

In [None]:
# Event-level play-by-play table; one row per recorded event.
play_by_play_data = pd.read_csv(
    os.path.join(TRACKING_DIR, PLAY_BY_PLAY_DATA_FILE),
)

# Power-play metadata (one row per game / penalty number). Text following a
# "#" in the file is treated as a comment and is not parsed.
power_play_info = pd.read_csv(
    os.path.join(PBP_DIR, POWER_PLAY_INFO_FILE),
    comment="#",
)

In [None]:
def regular():
    for game, venue in itertools.product(
        [Game(game) for game in GAMES],
        [Venue.HOME, Venue.AWAY],
    ):
        events = play_by_play_data[play_by_play_data["game_date"] == game.game_date]
        match venue:
            case venue.HOME:
                events = events[events["team_name"] == game.home]
            case venue.AWAY:
                events = events[events["team_name"] == game.away]
    
        events = events[events["situation_type"] == Situation.REGULAR.value]
        
        passes = events[events["event"] == "Play"]
        shots = events[events["event"] == "Shot"]

        time = calculate_time(game, None)
    
        yield (len(passes) / time, len(shots) / time)

In [None]:
def power_play():
    """Yield pass and shot rates for each power play.

    Every game in ``GAMES`` is combined with every penalty number from 1 to
    ``MAX_PENALTY_NUMBER``. Combinations without a matching row in
    ``power_play_info`` are skipped. For the remaining ones, the game's events
    with ``situation_type == Situation.POWER_PLAY.value`` are restricted to
    the clock window of the power play and to a single team (see the HACK
    note in the body).

    Yields:
        tuple: ``(passes / time, shots / time)`` where ``time`` is whatever
        ``calculate_time(game, [pp])`` returns. Power plays without a single
        pass yield nothing.
    """
    all_games = [Game(game) for game in GAMES]
    all_power_plays = [
        PowerPlay(penalty_no) for penalty_no in range(1, MAX_PENALTY_NUMBER + 1)
    ]

    for game, pp in itertools.product(all_games, all_power_plays):
        pp_events = play_by_play_data[play_by_play_data["game_date"] == game.game_date]
        pp_events = pp_events[pp_events["situation_type"] == Situation.POWER_PLAY.value]

        # Look up the row describing this game / penalty number. The first
        # match is used; if there is none, this combination is skipped.
        info_rows = power_play_info[
            (power_play_info["game_name"] == game.game)
            & (power_play_info["penalty_number"] == pp.penalty_no)
        ]
        if len(info_rows) == 0:
            continue
        pp_row = info_rows.iloc[0]

        # NOTE (original author): the window logic is custom because the time
        # calculation from the Data_Clean.ipynb notebook did not seem to work
        # correctly here.
        # The comparisons treat the clock as counting down: an event belongs
        # to the power play when its clock is at or below the start clock and
        # at or above the end clock.
        clock = pp_events["clock_seconds"]
        at_or_after_start = clock <= pp_row["start_game_clock_seconds"]
        at_or_before_end = clock >= pp_row["end_game_clock_seconds"]
        in_start_period = pp_events["period"] == pp_row["start_period"]

        if pp_row["start_period"] == pp_row["end_period"]:
            # Whole power play inside one period: both clock bounds apply.
            in_window = in_start_period & at_or_after_start & at_or_before_end
        else:
            # Power play crosses a period boundary: the remainder of the start
            # period, or the part of the end period before the power play ends.
            in_end_period = pp_events["period"] == pp_row["end_period"]
            in_window = (in_start_period & at_or_after_start) | (
                in_end_period & at_or_before_end
            )
        pp_events = pp_events[in_window]

        # HACK: assume the team with more events in the window is the one on
        # the power play (the away team is chosen on a tie). That way there is
        # one value per power play rather than one per power play and team
        # with many zeros.
        # Original author's note: 36 power plays, of which 33 remain after the
        # situation_type filter -- NOTE(review): not verifiable from this cell.
        team_names = pp_events["team_name"]
        home_event_cnt = int((team_names == game.home).sum())
        away_event_cnt = int((team_names == game.away).sum())
        pp_team = game.home if home_event_cnt > away_event_cnt else game.away
        pp_events = pp_events[team_names == pp_team]

        pass_cnt = int((pp_events["event"] == "Play").sum())
        shot_cnt = int((pp_events["event"] == "Shot").sum())

        duration = calculate_time(game, [pp])

        if pass_cnt == 0:
            continue
        yield (pass_cnt / duration, shot_cnt / duration)

In [None]:
# Regular play: relate the pass rate to the shot rate (one point per game and team).
# The generator is materialised once so the plot and the statistics use the
# same data instead of running regular() twice.
x_reg, y_reg = np.array([*zip(*regular())])

plt.scatter(x_reg, y_reg)
plt.title("Regular play: pass rate vs. shot rate")
plt.xlabel("Passes per unit of time")
plt.ylabel("Shots per unit of time")
plt.show()

# 2x2 Pearson correlation matrix; the off-diagonal entry is the correlation
# between pass rate and shot rate.
print(np.corrcoef(x_reg, y_reg))

# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
x_reg = x_reg.reshape(-1, 1)
lm = LinearRegression().fit(x_reg, y_reg)
# R^2 of the fit on the data it was trained on; for a single predictor with
# intercept this equals the squared correlation printed above.
lm.score(x_reg, y_reg)

In [None]:
# Power play: relate the pass rate to the shot rate (one point per power play).
# The generator is materialised once so the plot and the statistics use the
# same data instead of running power_play() twice.
x_pp, y_pp = np.array([*zip(*power_play())])

plt.scatter(x_pp, y_pp)
plt.title("Power play: pass rate vs. shot rate")
plt.xlabel("Passes per unit of time")
plt.ylabel("Shots per unit of time")
plt.show()

# TODO: Correlation is *lower*, but I do not know how to interpret that yet.
# 2x2 Pearson correlation matrix; the off-diagonal entry is the correlation
# between pass rate and shot rate.
print(np.corrcoef(x_pp, y_pp))

# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
x_pp = x_pp.reshape(-1, 1)
lm = LinearRegression().fit(x_pp, y_pp)
# R^2 of the fit on the data it was trained on; for a single predictor with
# intercept this equals the squared correlation printed above.
lm.score(x_pp, y_pp)