In [256]:
from statsbombpy import sb
import pandas as pd
import numpy as np
import networkx as nx
from collections import Counter

In [257]:
events = sb.competition_events(
    country="Germany",
    division= "1. Bundesliga",
    season="2023/2024",
    gender="male"
)



In [258]:
#make a list to hold a df of the events of a match
df_dict = {}

for id in events.match_id.unique():
    #find one match for one team
    match_subset = events.loc[events['match_id'] == id]

    #we identify the starting 11
    starting_11 = match_subset.loc[match_subset['type'] == 'Starting XI'].loc[match_subset['team'] == 'Bayer Leverkusen', 'tactics'].to_list()[0]
    position_dict = {}
    
    #we make a dictionary for positions of players
    for member in starting_11['lineup']:
        player_id = int(member['player']['id'])
        position_name = member['position']['name']
        position_dict[player_id] = position_name


    #include only passes, shots, substitution or tactical shifts
    match_subset = match_subset.loc[(match_subset['type'].isin(['Shot','Pass','Substitution', 'Tactical Shift']))]

    #We include rows with Leverkusen as possession team or with type substitution or tactical shift
    match_subset = match_subset.loc[(match_subset['possession_team'] == 'Bayer Leverkusen') | (match_subset['type'].isin(['Substitution', 'Tactical Shift']))]

    #sort the values like when we did the passing sequences
    match_subset = match_subset.sort_values(['period','timestamp'], ascending=[True, True])
        
    match_subset['pass_recipient_position'] = np.nan

    for index, row in match_subset.iterrows():
        #If substitution, we update the dictionary to include player
        if row['type'] == 'Substitution' and row['team'] == 'Bayer Leverkusen':
            position_dict[row['substitution_replacement_id']] = row['position']

        #In case of a tactical shift, create a new position_dict
        if row['type'] == 'Tactical Shift' and row['team'] == 'Bayer Leverkusen':
            lineup = row['tactics']
            position_dict = {}
            for member in lineup['lineup']:
                player_id = int(member['player']['id'])
                position_name = member['position']['name']
                position_dict[player_id] = position_name

        elif row['pass_recipient_id'] in list(position_dict.keys()):
            #if the player is in our dictionary, we assign his position - else he may be a player of the opponent, and we ignore him
            match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
            
    #den næste linje kan eventuelt kommenteres ud, og så kan vi bare fjerne na i stedet, hvis vi ikke vil have de ekstra nodes, jeg foreslår :D
    #add shot as a pass recipient as well
    #match_subset.loc[match_subset['type'] =='Shot', 'pass_recipient_position'] = match_subset.apply(lambda x: np.where(pd.isna(x['pass_recipient_position']), x['shot_outcome'], x['pass_recipient_position']), axis=1)

    #if a does not have a pass_recipient_position yet, we drop it
    match_subset.dropna(subset=['pass_recipient_position'], inplace = True)
    df_dict[id] = match_subset 


  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position'] = position_dict[int(row['pass_recipient_id'])]
  match_subset['pass_recipient_position'] = np.nan
  match_subset.at[index, 'pass_recipient_position

In [259]:
#dictionary for the graphs
match_graph_dict = {}
#iterate through the dfs
for match in df_dict.values():

    match_id = match['match_id'].iloc[0]
    max_minute = match['minute'].max()
    #iterate from minute 0 to 15 minutes before end of game (it will not include the last 5 minutes, only the targets), and iterate every 5 minutes
    for start_minute in range(0, max_minute - 15, 5):
        end_minute = start_minute + 14 #define the interval we need to sort the graph for
        interval_df = match[(match['minute'] >= start_minute) & (match['minute'] <= end_minute)]
        graph = nx.DiGraph()
        edges = []

        #iterate through the passings of the match
        for node in pd.concat([match['position'], match['pass_recipient_position']], axis=0):
            if str(node) not in list(graph.nodes):
                #add the node to the graph
                graph.add_node(str(node))

        for passing in match[['position', 'pass_recipient_position']].itertuples():
            edge = (str(passing.position), str(passing.pass_recipient_position))
            edges.append(edge)

        #count the frequencies
        edges_counter = Counter(edges)

        edge_and_count = [(edge[0], edge[1], edges_counter[edge]) for edge in edges]

        #add weighted edges to graph
        if edge_and_count:
            graph.add_weighted_edges_from(edge_and_count)

        #add finished graph to dictionary. add 1 minute to the end of the time window to match the minute the prediction window starts (so the ids match)
        match_graph_dict[str(match_id)+str(end_minute+1)] = [graph]

In [260]:
targets = pd.read_csv("C:/Users/thors/Documents/GitHub/5-semester/momentum_data.csv")

In [261]:
#the graphs are in a list so just remove the list
for key, value in match_graph_dict.items():
    if isinstance(value, list) and len(value) == 1:
        match_graph_dict[key] = value[0] 

In [262]:
ids = targets["id"].to_list()
momentums = targets["momentum"].to_list()

#join the momentum value on each graph, matching the "match_id + time_interval" value
for i in ids:
    for key, value in match_graph_dict.items():
        if key == str(i):
            index = ids.index(i)
            value.graph["momentum"] = momentums[index]

In [263]:
import pickle
with open("Momentum graphs.pkl", "wb") as file:
    pickle.dump(match_graph_dict, file)

In [218]:
import torch
from torch import nn
from torch.nn import Linear, ReLU
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool, GATConv, GINConv, GATv2Conv
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import from_networkx
import networkx as nx


class GNN(nn.Module):
    def __init__(self, input_dim, hidden_channels):
        super(GNN, self).__init__()

        # Scale up the network size
        hidden_channels_gcn = hidden_channels * 2
        hidden_channels_gat = hidden_channels_gcn * 2
        hidden_channels_gin = hidden_channels_gat * 2

        # Initialize the layers
        self.conv1 = GCNConv(input_dim, hidden_channels_gcn)
        self.conv2 = GCNConv(hidden_channels_gcn, hidden_channels_gcn)
        self.gat_conv1 = GATv2Conv(hidden_channels_gcn, hidden_channels_gat)
        self.gat_conv2 = GATv2Conv(hidden_channels_gat, hidden_channels_gat)

        mlp = nn.Sequential(
            nn.Linear(hidden_channels_gat, hidden_channels_gin),
            nn.ReLU(),
            nn.Linear(hidden_channels_gin, hidden_channels_gin)
        )
        self.gin_conv1 = GINConv(mlp)

        # Output layer modified to accommodate the global variable
        self.out = nn.Linear(hidden_channels_gin + 1, 1)  # +1 for global_var

    def forward(self, x, edge_index, edge_weight, batch, global_var):
        # Pass data through each layer with ReLU activations and edge weights
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        x = F.relu(self.gat_conv1(x, edge_index))  # GAT layers do not use edge weights
        x = F.relu(self.gat_conv2(x, edge_index))
        x = F.relu(self.gin_conv1(x, edge_index))
        x = global_mean_pool(x, batch)

        # Concatenate global_var with pooled features
        x = torch.cat([x, global_var.view(-1, 1)], dim=-1)

        # Final output layer
        x = self.out(x)

        return x


AttributeError: partially initialized module 'torch' has no attribute 'types' (most likely due to a circular import)

In [None]:
from torch_geometric.utils import from_networkx
from torch_geometric.loader import DataLoader
import torch

data_list = []
for match_id, (graph, target,global_variable) in match_graph_dict.items():
    # Convert NetworkX graph to PyTorch Geometric Data
    data = from_networkx(graph)

    # Convert node features to tensor
    node_feature_values = list(nx.get_node_attributes(graph, 'pass_success').values())
    node_feature_values = [1 if value is None else value for value in node_feature_values]
    data.x = torch.tensor(node_feature_values, dtype=torch.float).view(-1, 1)

    # Ensure edge attributes are tensor
    data.edge_attr = torch.tensor(data.edge_attr, dtype=torch.float) if data.edge_attr is not None else None

    # Set the target as a tensor
    data.y = torch.tensor([target], dtype=torch.float)
    data.global_var = torch.tensor([global_variable], dtype=torch.float)
    data_list.append(data)


from sklearn.model_selection import train_test_split

# Split data_list into training and test sets (e.g., 80% train, 20% test)
train_data, test_data = train_test_split(data_list, test_size=0.2, random_state=42)


# Create DataLoader without the specified batch
train_loader = DataLoader(train_data,batch_size=4,shuffle=False)
test_loader = DataLoader(test_data,batch_size=4,shuffle=False)


# Create a DataLoader
#loader = DataLoader(data_list, batch_size=4, shuffle=False)

In [None]:
graph.edges.data()

OutEdgeDataView([('Right Defensive Midfield', 'Left Wing', {'weight': 7}), ('Right Defensive Midfield', 'Right Wing', {'weight': 8}), ('Right Defensive Midfield', 'Right Center Back', {'weight': 6}), ('Right Defensive Midfield', 'Right Wing Back', {'weight': 9}), ('Right Defensive Midfield', 'Left Defensive Midfield', {'weight': 10}), ('Right Defensive Midfield', 'Center Forward', {'weight': 4}), ('Right Defensive Midfield', 'Center Back', {'weight': 5}), ('Right Defensive Midfield', 'Left Center Back', {'weight': 3}), ('Right Defensive Midfield', 'Left Wing Back', {'weight': 2}), ('Left Wing', 'Left Defensive Midfield', {'weight': 7}), ('Left Wing', 'Left Center Back', {'weight': 2}), ('Left Wing', 'Right Wing Back', {'weight': 3}), ('Left Wing', 'Left Wing Back', {'weight': 6}), ('Left Wing', 'Right Wing', {'weight': 5}), ('Left Wing', 'Right Defensive Midfield', {'weight': 2}), ('Left Wing', 'Center Back', {'weight': 1}), ('Left Wing', 'Center Forward', {'weight': 3}), ('Left Defens

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import global_mean_pool
from torch.optim import Adam


# Assuming each node has 1 feature
input_dim = 1
hidden_channels = 5  # You can adjust this based on your needs

model = GNN(input_dim=input_dim, hidden_channels=hidden_channels)
criterion = torch.nn.MSELoss()  # Mean Squared Error for regression
optimizer = Adam(model.parameters(), lr=0.05)
torch.nn.init.constant_(model.out.bias, 0.00)
for epoch in range(20):
    model.train()
    total_loss = 0

    for batch_idx, data in enumerate(train_loader):
        if batch_idx == 5:  # Skip the 6th batch (batch index 5)
            print(f"Skipping batch {batch_idx+1}")
            continue  # Skip this iteration

        optimizer.zero_grad()
        
        # Forward pass with global_var
        out = model(data.x, data.edge_index, data.edge_weight,data.batch, data.global_var)
        loss = criterion(out.view(-1), data.y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss:.4f}')


Skipping batch 6
Epoch 1, Loss: 77.7242
Skipping batch 6
Epoch 2, Loss: 83.0027
Skipping batch 6
Epoch 3, Loss: 644.6365
Skipping batch 6
Epoch 4, Loss: 316.1588
Skipping batch 6
Epoch 5, Loss: 1007.4048
Skipping batch 6
Epoch 6, Loss: 2146.8955
Skipping batch 6
Epoch 7, Loss: 3274.7748
Skipping batch 6
Epoch 8, Loss: 464.1319
Skipping batch 6
Epoch 9, Loss: 1980.0850
Skipping batch 6
Epoch 10, Loss: 9486.7834
Skipping batch 6
Epoch 11, Loss: 13935.5453
Skipping batch 6
Epoch 12, Loss: 7371.7406
Skipping batch 6
Epoch 13, Loss: 800.4362
Skipping batch 6
Epoch 14, Loss: 3897.4392
Skipping batch 6
Epoch 15, Loss: 17598.5911
Skipping batch 6
Epoch 16, Loss: 34765.8921
Skipping batch 6
Epoch 17, Loss: 43198.5732
Skipping batch 6
Epoch 18, Loss: 34688.7368
Skipping batch 6
Epoch 19, Loss: 16867.2462
Skipping batch 6
Epoch 20, Loss: 3128.6485


In [None]:
from torch_geometric.loader import DataLoader
me = 0
for i in test_data:
    # Create a DataLoader with a single graph
    single_graph_loader = DataLoader([i], batch_size=1, shuffle=False)
    model.eval()
    with torch.no_grad():  # Disable gradient calculation for evaluation
        for data in single_graph_loader:  # Using DataLoader
            # Forward pass
            output = model(data.x, data.edge_index,data.edge_weight, data.batch,data.global_var)
            print( output.item())
            print( data.y.item(),"\n")
            me += abs(output.item()-data.y.item())
print("average error: ",me/len(test_data))

2.8919014930725098
1.922119379043579 

4.048046588897705
1.9797840118408203 

3.325455904006958
2.9524571895599365 

4.2648234367370605
3.9359192848205566 

3.6144919395446777
2.0127344131469727 

4.698378086090088
1.8914011716842651 

2.8919014930725098
1.6519643068313599 

average error:  1.341231312070574
