In [49]:
import networkx as nx
from itertools import combinations
from networkx.algorithms.isomorphism import DiGraphMatcher
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules

from statsbombpy import sb
import pandas as pd
from helper_functions import(match_ids,
                             create_graphs,
                             create_graphs_dict,
                             )


import networkx as nx
from networkx.algorithms.isomorphism import DiGraphMatcher
from itertools import combinations
import hashlib

# Helper function to generate a canonical label (hashable identifier) for a graph
def graph_canonical_label(graph):
    """Return an MD5 hex digest identifying a graph's labelled edge set.

    Each edge is reduced to ``(source, target, sorted attribute items)`` before
    hashing, so the digest depends neither on the order in which edges were
    added nor on the insertion order of the keys inside an edge's attribute
    dict.  The attribute normalisation mirrors the edge key built in
    ``frequent_singletons``.

    The label is tied to the concrete node identifiers and edge attributes; it
    is not an isomorphism invariant, and nodes without edges do not contribute.

    Parameters
    ----------
    graph : object exposing ``edges(data=True)``
        Must yield ``(source, target, attribute_dict)`` triples.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the normalised, sorted edge list.
    """
    normalised_edges = sorted(
        (source, target, tuple(sorted(attributes.items())))
        for source, target, attributes in graph.edges(data=True)
    )
    return hashlib.md5(str(normalised_edges).encode('utf-8')).hexdigest()

# Frequent singletons (edges with attributes)
def frequent_singletons(min_sup, edge_matrix):
    """Build one single-edge DiGraph for every edge seen at least ``min_sup`` times.

    An edge is identified by ``(source, target, sorted attribute items)``; every
    occurrence in every edge list of ``edge_matrix`` adds one to its count.

    Parameters
    ----------
    min_sup : number
        Minimum occurrence count an edge needs to be kept.
    edge_matrix : iterable of iterables
        Each inner iterable holds edges indexable as
        ``edge[0]`` (source), ``edge[1]`` (target), ``edge[2]`` (attribute dict).

    Returns
    -------
    list of networkx.DiGraph
        One graph per frequent edge, in order of first appearance, carrying the
        edge's attributes.
    """
    occurrences = {}
    attrs_by_key = {}

    for edge_list in edge_matrix:
        for edge in edge_list:
            source, target, attrs = edge[0], edge[1], edge[2]
            key = (source, target, tuple(sorted(attrs.items())))
            if key in occurrences:
                occurrences[key] += 1
            else:
                occurrences[key] = 1
            # Remember the attribute dict so the graph can be rebuilt below.
            attrs_by_key[key] = attrs

    singleton_graphs = []
    for key, count in occurrences.items():
        if count >= min_sup:
            single_edge = nx.DiGraph()
            single_edge.add_edge(key[0], key[1], **attrs_by_key[key])
            singleton_graphs.append(single_edge)

    return singleton_graphs

# Candidate generation by joining frequent subgraphs of size k-1
def generate_candidates(F, k):
    """Join pairs of frequent subgraphs into distinct candidates with ``k`` edges.

    Two graphs are joined when their edge views differ and they share at least
    one node; the union (``nx.compose``) is kept only if it has exactly ``k``
    edges.

    Graph objects hash by identity, so a plain ``set`` does not collapse equal
    unions.  The same k-edge union is typically reachable from several parent
    pairs, and each copy would otherwise be matched against every database
    graph.  Candidates are therefore de-duplicated by their canonical edge
    label; the first union found for each label is kept.

    Parameters
    ----------
    F : iterable of networkx.DiGraph
        Frequent subgraphs from the previous level.
    k : int
        Required number of edges in every candidate.

    Returns
    -------
    set of networkx.DiGraph
        Distinct candidate graphs (empty set when nothing can be joined).
    """
    unique_candidates = {}
    for g1, g2 in combinations(F, 2):
        # NOTE(review): the edge-view comparison appears to look only at the
        # (source, target) pairs, not at attributes — confirm against networkx.
        if g1.edges() != g2.edges():
            shared_nodes = set(g1.nodes()) & set(g2.nodes())
            if len(shared_nodes) > 0:
                union_graph = nx.compose(g1, g2)
                if union_graph.number_of_edges() == k:
                    label = graph_canonical_label(union_graph)
                    # Keep only the first graph seen for each labelled edge set.
                    unique_candidates.setdefault(label, union_graph)
    return set(unique_candidates.values())
# Count the support for each candidate in the graph database
def count_support(C, graph_db):
    """Count, for each candidate, how many database graphs contain a match.

    A candidate matches a graph when
    ``DiGraphMatcher(graph, candidate, edge_match=...).subgraph_is_isomorphic()``
    is true, with edges compared by equality of their attribute dicts.
    NOTE(review): no ``node_match`` is supplied, so node identifiers presumably
    do not take part in the match — confirm this is the intended semantics.

    Parameters
    ----------
    C : iterable of networkx.DiGraph
        Candidate subgraphs; it is iterated once per database graph, so it
        must be re-iterable (a set or list).
    graph_db : iterable of networkx.DiGraph
        The graph database.

    Returns
    -------
    dict
        Maps each candidate graph object that matched at least once to the
        number of database graphs it matched.  Candidates with no match are
        absent.
    """
    F_count = {}

    for graph in graph_db:
        for candidate in C:
            matcher = DiGraphMatcher(graph, candidate, edge_match=lambda x, y: x == y)
            if matcher.subgraph_is_isomorphic():
                # Support is keyed by the graph object itself; no label is needed.
                F_count[candidate] = F_count.get(candidate, 0) + 1

    return F_count

# Filter frequent candidates based on minimum support
def filter_frequent(F_count, min_sup, graph_db_size):
    """Keep candidates whose support reaches ``min_sup`` and attach rule metrics.

    The antecedent support is taken to be the candidate's own support, so for
    any positive support the confidence is 1.0 and the conviction is 0; lift
    and leverage are derived from the relative support ``support / graph_db_size``.

    Parameters
    ----------
    F_count : dict
        Maps candidate (any hashable, here a graph object) to its support count.
    min_sup : number
        Minimum support required to keep a candidate.
    graph_db_size : number
        Number of graphs in the database; used as the divisor for relative
        support.

    Returns
    -------
    tuple (list, dict)
        The kept candidates in ``F_count`` order, and a dict mapping each kept
        candidate to ``{'support', 'confidence', 'lift', 'leverage',
        'conviction'}``.
    """
    kept = []
    metrics = {}

    for candidate, hits in F_count.items():
        if not hits >= min_sup:
            continue

        share = hits / graph_db_size          # relative support
        has_support = hits > 0

        # Antecedent support equals the joint support, hence hits / hits.
        confidence = hits / hits if has_support else 0
        lift = share / (share ** 2) if has_support else 0
        leverage = share - share ** 2
        doubt = 1 - confidence
        conviction = (1 - share) / doubt if doubt > 0 else 0

        metrics[candidate] = {
            'support': hits,
            'confidence': confidence,
            'lift': lift,
            'leverage': leverage,
            'conviction': conviction
        }
        kept.append(candidate)

    return kept, metrics

# Remove duplicate isomorphic graphs
def remove_duplicates(frequent_total):
    """Return the graphs of ``frequent_total`` with isomorphic repeats dropped.

    Graphs are visited in order; a graph is kept only if
    ``DiGraphMatcher(...).is_isomorphic()`` is false against every graph kept
    so far, with edges compared by equality of their attribute dicts.

    Parameters
    ----------
    frequent_total : iterable of networkx.DiGraph
        Graphs to filter.

    Returns
    -------
    list of networkx.DiGraph
        The first representative of each group of mutually isomorphic graphs,
        in original order.
    """
    def same_attributes(x, y):
        return x == y

    distinct = []
    for graph in frequent_total:
        # any() stops at the first isomorphic match, like an early break.
        already_kept = any(
            DiGraphMatcher(graph, kept, edge_match=same_attributes).is_isomorphic()
            for kept in distinct
        )
        if not already_kept:
            distinct.append(graph)

    return distinct

# Main function for Apriori-based graph mining
def apriori_graph_mining(min_sup, edge_matrix, graph_db, max_k):
    """Level-wise (Apriori-style) search for frequent subgraphs.

    Starts from the frequent single edges, then repeatedly joins the frequent
    subgraphs of the previous level into candidates with one more edge, counts
    their support in ``graph_db`` and keeps those reaching ``min_sup``.  Stops
    when ``max_k`` is exceeded, no candidate can be generated, or no candidate
    is frequent.  Progress is printed for every level.

    Parameters
    ----------
    min_sup : number
        Minimum support threshold used at every level.
    edge_matrix : iterable of iterables
        Edge lists passed to ``frequent_singletons``.
    graph_db : sized iterable of networkx.DiGraph
        Graph database used for support counting.
    max_k : number
        Largest subgraph size (in edges) to explore.

    Returns
    -------
    tuple (list, dict)
        The frequent subgraphs of all levels after ``remove_duplicates``, and
        the statistics collected for levels >= 2, keyed by graph object.  Single
        edges have no statistics entry, and the dict may still hold entries for
        graphs that were dropped as duplicates.
    """
    # Level 1: frequent single edges seed the search.
    level = frequent_singletons(min_sup, edge_matrix)
    collected = list(level)
    collected_stats = {}

    k = 2
    while k <= max_k:
        print(f"\nIteration {k}:")

        # Join the previous level's graphs into k-edge candidates.
        candidates = generate_candidates(level, k)
        if not candidates:
            print(f"No candidates found for size {k}. Terminating.")
            break

        # Count support, then keep only the candidates that are frequent.
        support_counts = count_support(candidates, graph_db)
        level, level_stats = filter_frequent(support_counts, min_sup, len(graph_db))
        if not level:
            print(f"No frequent subgraphs found for size {k}. Terminating.")
            break

        collected += level
        collected_stats.update(level_stats)

        print(f"Frequent subgraphs of size {k}:")
        for subgraph in level:
            print(f"Subgraph: {subgraph}")

        k += 1

    # Collapse isomorphic repeats across all levels before returning.
    return remove_duplicates(collected), collected_stats


In [46]:
# Not referenced by any visible cell (apriori_graph_mining keeps its own local k).
k = 2

# Request the events for the men's 2023/2024 1. Bundesliga season from statsbombpy.
events = sb.competition_events(
    country="Germany",
    division="1. Bundesliga",
    season="2023/2024",
    gender="male",
)




In [50]:

# Build the graphs for Bayer Leverkusen from the loaded events using the
# project helpers (their exact semantics live in helper_functions).
df = match_ids(events, "Bayer Leverkusen", season_id=281, competition_id=9)
possesion, final_sequence = create_graphs(df, xG=0.01, min_passes=1,
                                          x_cordinate=30, y_cordinate=30)
graph_list, graph_dict = create_graphs_dict(possesion, final_sequence)

# Working set for mining. Currently the full list; replace with a real sample
# here if the run needs to be smaller.
graph_list_sample = graph_list

# Derive BOTH the edge lists and the graph database from the same working set,
# so the singleton counts and the support counts always describe the same graphs.
edge_matrix = [list(graph.edges(data=True)) for graph in graph_list_sample]
GRAPH_DB = graph_list_sample  # List of graphs in the database

# NOTE: these three values are not passed to any call in this cell; the
# create_graphs call above uses its own literals (xG=0.01, min_passes=1).
min_sup = 0
xG = 0.5
min_passes = 5

  sequences_sorted['possession_id'] = sequences_sorted['match_id'].astype(str) + sequences_sorted['possession'].astype(str)
  sequences_filtered['xg'] = sequences_filtered.groupby('possession_id')['shot_statsbomb_xg'].transform(lambda group: group.fillna(method='ffill').fillna(method='bfill'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sequences_filtered['xg'] = sequences_filtered.groupby('possession_id')['shot_statsbomb_xg'].transform(lambda group: group.fillna(method='ffill').fillna(method='bfill'))
  sequences_filtered['end_location'] = sequences_filtered['location'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pan

In [51]:
# Mine subgraphs with support >= 10, exploring up to 40 edges per subgraph.
frequent_subgraphs, stats_total = apriori_graph_mining(
    min_sup=10,
    edge_matrix=edge_matrix,
    graph_db=GRAPH_DB,
    max_k=40,
)
# Last expression of the cell: display final_sequence.
final_sequence



Iteration 2:
Frequent subgraphs of size 2:
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 2 nodes and 2 edges
Subgraph: DiGraph with 2 nodes and 2 edges
Subgraph: DiGraph with 2 nodes and 2 edges
Subgraph: DiGraph with 2 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: DiGraph with 3 nodes and 2 edges
Subgraph: 

In [52]:
# One pass over the mined subgraphs: print a readable report for each one and
# collect the same information as a row for the summary DataFrame.
subgraph_data = []

for subgraph in frequent_subgraphs:
    edges = list(subgraph.edges(data=True))

    # Single-edge subgraphs have no entry in stats_total, hence the empty default.
    stats = stats_total.get(subgraph, {})

    print("Subgraph Edges:")
    for u, v, attr in edges:
        print(f"Edge {u} -> {v}, Attributes: {attr}")

    print("Statistics:")
    print(f"Support: {stats.get('support', 'N/A')}")
    print(f"Confidence: {stats.get('confidence', 'N/A')}")
    print(f"Lift: {stats.get('lift', 'N/A')}")
    print(f"Leverage: {stats.get('leverage', 'N/A')}")
    print(f"Conviction: {stats.get('conviction', 'N/A')}")
    # filter_frequent never writes an 'xg' key, so this line prints N/A.
    print(f"xg: {stats.get('xg', 'N/A')}")
    print("\n" + "-"*40 + "\n")  # Separator between subgraphs

    # Row for the DataFrame; edges are ordered by their 'sequence' attribute,
    # with edges lacking that attribute sorted last.
    subgraph_info = {
        'edges': sorted(edges, key=lambda x: x[2].get('sequence', float('inf'))),
        'support': stats.get('support'),
        'confidence': stats.get('confidence'),
        'lift': stats.get('lift'),
        'leverage': stats.get('leverage'),
        'conviction': stats.get('conviction'),
    }
    subgraph_data.append(subgraph_info)

# Build the table once from the collected rows.
df_subgraphs = pd.DataFrame(subgraph_data)


Subgraph Edges:
Edge 3.2 -> 4.2, Attributes: {'sequence': 2}
Statistics:
Support: N/A
Confidence: N/A
Lift: N/A
Leverage: N/A
Conviction: N/A
xg: N/A

----------------------------------------

Subgraph Edges:
Edge 2.2 -> 2.3, Attributes: {'sequence': 4}
Statistics:
Support: N/A
Confidence: N/A
Lift: N/A
Leverage: N/A
Conviction: N/A
xg: N/A

----------------------------------------

Subgraph Edges:
Edge 3.2 -> 4.2, Attributes: {'sequence': 3}
Statistics:
Support: N/A
Confidence: N/A
Lift: N/A
Leverage: N/A
Conviction: N/A
xg: N/A

----------------------------------------

Subgraph Edges:
Edge 4.3 -> 3.3, Attributes: {'sequence': 1}
Statistics:
Support: N/A
Confidence: N/A
Lift: N/A
Leverage: N/A
Conviction: N/A
xg: N/A

----------------------------------------

Subgraph Edges:
Edge 3.2 -> 4.2, Attributes: {'sequence': 2}
Edge 2.2 -> 3.2, Attributes: {'sequence': 3}
Statistics:
Support: 85
Confidence: 1.0
Lift: 2.811764705882353
Leverage: 0.22916265471542865
Conviction: 0
xg: N/A

-----

In [None]:
# Display the per-subgraph summary table built in the previous cell.
df_subgraphs

In [None]:
# Display the whole graph database list (one repr per graph).
GRAPH_DB