In [1]:
!pip install statsbombpy transformers fastparquet pyarrow
from statsbombpy import sb
import pandas as pd





In [2]:
events = sb.competition_events(
    country="England",
    division= "Premier League",
    season="2015/2016",
    gender="male"
)
# Sort the events DataFrame by minute and second
events = events.sort_values(by=["minute", "second"])

# Display the first few rows of the filtered and sorted DataFrame
print(events.head())




KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import re
import numpy as np

def extract_template_fields(template):
    """Extract field names from a template string using regex."""
    return set(re.findall(r'\{(\w+)\}', template))

def is_valid_value(value):
    """Check if a value is valid (not None, nan, or empty)."""
    if isinstance(value, (list, np.ndarray)):
        return len(value) > 0
    return pd.notna(value)

def format_value(value):
    """Format values for display, handling arrays and other types."""
    if isinstance(value, (list, np.ndarray)):
        return str(value)
    return str(value)

def get_player_info(row):
    """Extract player ID and name from row."""
    try:
        if isinstance(row.get('player'), dict):
            player_id = str(row['player'].get('id', 'unknown'))
            player_name = row['player'].get('name', 'unknown')
        else:
            player_id = str(row.get('player_id', 'unknown'))
            player_name = row.get('player', 'unknown')
        return player_id, player_name
    except:
        return 'unknown', 'unknown'

def generate_natural_language(row, event_type):
    """Generate natural language description for an event with proper error handling."""
    template = event_templates.get(event_type, "An event of type {type} occurred.")
    required_fields = extract_template_fields(template)
    
    values = {}
    for field in required_fields:
        try:
            value = row.get(field)
            if is_valid_value(value):
                values[field] = format_value(value)
            else:
                values[field] = "unknown"
        except (KeyError, AttributeError):
            values[field] = "unknown"
    
    try:
        # Add timestamp to the description to help verify chronological order
        timestamp = row.get('timestamp', 'unknown')
        minute = row.get('minute', 'unknown')
        second = row.get('second', 'unknown')
        time_prefix = f"[{minute}:{second} - {timestamp}] "
        return time_prefix + template.format_map(values)
    except KeyError as e:
        return f"Error generating description for {event_type}: missing field {str(e)}"

def process_events(events_df, output_dir='event_descriptions'):
    """Process events DataFrame and write descriptions to files in chronological order."""
    os.makedirs(output_dir, exist_ok=True)
    
    open_files = {}
    try:
        for _, row in events_df.iterrows():
            if row.get('position') == "Goalkeeper":
                continue
                
            event_type = row.get('type')
            if event_type not in event_templates:
                continue
                
            player_id, player_name = get_player_info(row)
            match_id = str(row.get('match_id', 'unknown'))
            
            filename = f"{player_name.replace(' ', '_')}_{player_id}_{match_id}.txt"
            filepath = os.path.join(output_dir, filename)
            
            if filename not in open_files:
                open_files[filename] = open(filepath, 'w')
            
            description = generate_natural_language(row, event_type)
            open_files[filename].write(description + '\n')
            open_files[filename].flush()
    
    finally:
        for file in open_files.values():
            file.close()

event_templates = {
    "Pass": (
        "A {pass_type} pass with {pass_body_part} was made from [x, y] coordinates: {location} "
        "to [x, y] coordinates: {pass_end_location}. The pass was {pass_outcome}. "
        "Pass length: {pass_length} meters. Counterpress: {counterpress}. "
        "Under pressure: {under_pressure}."
    ),
    "Shot": (
        "A {shot_type} shot with {shot_body_part} was taken at [x, y] coordinates: {location}. "
        "The shot was a {shot_outcome}, with xG: {shot_statsbomb_xg}. "
        "First time: {shot_first_time}, One-on-one: {shot_one_on_one}. "
        "Under pressure: {under_pressure}."
    ),
    "Block": (
        "A block occurred at [x, y] coordinates: {location}. Counterpress: {counterpress}. "
        "Play pattern: {play_pattern}. Out: {out}."
    ),
    "Ball Recovery": (
        "A ball recovery happened at [x, y] coordinates: {location}. Recovery failure: {ball_recovery_recovery_failure}. "
        "Play pattern: {play_pattern}. Under pressure: {under_pressure}."
    ),
    "Miscontrol": (
        "A miscontrol occurred at [x, y] coordinates: {location}. Out: {out}. "
        "Play pattern: {play_pattern}. Under pressure: {under_pressure}."
    ),
    "Clearance": (
        "A clearance occurred at [x, y] coordinates: {location} with {clearance_body_part}. "
        "Aerial won: {clearance_aerial_won}. Out: {out}. "
        "Play pattern: {play_pattern}. Under pressure: {under_pressure}."
    ),
    "Dribbled Past": (
        "A player was dribbled past at [x, y] coordinates: {location}. Counterpress: {counterpress}. "
        "Play pattern: {play_pattern}."
    ),
    "Dribble": (
        "A dribble was attempted at [x, y] coordinates: {location}. Outcome: {dribble_outcome}, "
        "Overrun: {dribble_overrun}. Play pattern: {play_pattern}. "
        "Under pressure: {under_pressure}."
    ),
    "Interception": (
        "An interception occurred at [x, y] coordinates: {location}. Outcome: {interception_outcome}. "
        "Counterpress: {counterpress}. Play pattern: {play_pattern}."
    ),
    "Foul Committed": (
        "A foul was committed at [x, y] coordinates: {location}. Type: {foul_committed_type}, "
        "Card: {foul_committed_card}, Offensive: {foul_committed_offensive}, "
        "Penalty: {foul_committed_penalty}. Counterpress: {counterpress}. "
        "Play pattern: {play_pattern}."
    ),
    "Foul Won": (
        "A foul was won at [x, y] coordinates: {location}. Defensive: {foul_won_defensive}, "
        "Penalty: {foul_won_penalty}. Play pattern: {play_pattern}. "
        "Under pressure: {under_pressure}."
    ),
    "Shield": (
        "A shield occurred at {location}. Play pattern: {play_pattern}."
    ),
}

# Usage
if __name__ == "__main__":
    process_events(events)
    print("Processing complete. Check the 'event_descriptions' directory for output files.")

In [2]:
!pip install transformers





In [3]:
import os
from transformers import AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

def setup_device():
    """
    Configure the appropriate device (GPU/CPU) for processing.
    Returns the device and a boolean indicating if CUDA is available.
    """
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    
    if cuda_available:
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        print("Using CPU")
    
    return device

def read_and_embed_player_matches(event_descriptions_dir, model_name='jinaai/jina-embeddings-v2-small-en', batch_size=32):
    """
    Read player match files, generate embeddings, and create a structured DataFrame.
    Includes performance optimizations and player filtering.
    """
    device = setup_device()
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    model = model.to(device)  # Move model to GPU if available
    model.eval()
    
    # First pass: count matches per player
    player_match_counts = Counter()
    files = [f for f in os.listdir(event_descriptions_dir) if f.endswith('.txt')]
    
    for filename in files:
        player_name = filename.split('_')[0]
        player_match_counts[player_name] += 1
    
    # Filter players with fewer than 10 matches
    qualified_players = {player for player, count in player_match_counts.items() 
                        if count >= 10}
    
    # Filter files for qualified players
    qualified_files = [f for f in files if f.split('_')[0] in qualified_players]
    print(f"Processing {len(qualified_files)} files from {len(qualified_players)} qualified players")
    
    results = []
    # Process in batches for GPU efficiency
    for i in tqdm(range(0, len(qualified_files), batch_size), desc="Processing batches"):
        batch_files = qualified_files[i:i + batch_size]
        batch_texts = []
        batch_info = []
        
        # Read batch of files
        for filename in batch_files:
            player_name = filename.split('_')[0]
            player_id = filename.split('_')[1]
            match_id = filename.split('_')[2].replace('.txt', '')
            
            with open(os.path.join(event_descriptions_dir, filename), 'r') as f:
                match_text = f.read()
            
            batch_texts.append(match_text)
            batch_info.append({
                'player_name': player_name,
                'player_id': player_id,
                'match_id': match_id,
                'text': match_text
            })
        
        # Generate embeddings for batch
        with torch.no_grad():
            embeddings = model.encode(batch_texts)
            # Only move to CPU if embeddings are PyTorch tensors
            if isinstance(embeddings, torch.Tensor):
                embeddings = embeddings.cpu().numpy()  # Move back to CPU for storage

        # Add results
        for info, embedding in zip(batch_info, embeddings):
            info['embedding'] = embedding
            results.append(info)

    
    df = pd.DataFrame(results)
    
    # Add match count for each player
    df['match_count'] = df['player_name'].map(player_match_counts)
    
    return df

def save_embeddings(df, output_path='player_match_embeddings.parquet'):
    """Save the DataFrame with embeddings efficiently."""
    # Convert embeddings to list format for storage
    df['embedding'] = df['embedding'].apply(lambda x: x.tolist())
    df.to_parquet(output_path)
    print(f"Embeddings saved to {output_path}")

In [2]:
# Example usage with performance monitoring
import time

start_time = time.time()

# Process files and generate embeddings
embeddings_df = read_and_embed_player_matches(
    'event_descriptions',
    batch_size=4  # Adjust based on your GPU memory
)

# Save results
save_embeddings(embeddings_df)

print(f"Processing completed in {time.time() - start_time:.2f} seconds")
print(f"Total players: {embeddings_df['player_name'].nunique()}")
print(f"Total matches: {len(embeddings_df)}")

NameError: name 'read_and_embed_player_matches' is not defined

In [5]:
!pip install pyarrow fastparquet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyarrow
  Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m206.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m167.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m170.2 MB/

In [2]:
import os
from transformers import AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

def mean_pooling(model_output):
    """
    Perform mean pooling on the token embeddings to get a single document embedding.
    The attention mask is used to avoid including padding tokens in the mean.
    """
     # Get token embeddings from the model output
    token_embeddings = model_output[0]  # Shape: (batch_size, seq_length, hidden_size)
    
    # Expand attention mask to match embedding dimensions
    # Original mask shape: (batch_size, seq_length)
    # New mask shape: (batch_size, seq_length, hidden_size)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    
    # Multiply embeddings by mask to zero out padding tokens
    # Then sum up all token embeddings for each sequence
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    
    # Count the non-padding tokens for each sequence
    # Add small epsilon to avoid division by zero
    sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    
    # Calculate mean by dividing sum of embeddings by number of non-padding tokens
    mean_embeddings = sum_embeddings / sum_mask
    
    # L2 normalize the embeddings
    normalized_embeddings = F.normalize(mean_embeddings, p=2, dim=1)
    
    return normalized_embeddings
    

def setup_device():
    """
    Configure the appropriate device (GPU/CPU) for processing.
    Returns the device and a boolean indicating if CUDA is available.
    """
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    
    if cuda_available:
        print("Using GPU:", torch.cuda.get_device_name(0))
        torch.cuda.set_per_process_memory_fraction(0.7)
    else:
        print("Using CPU")
    
    return device

def read_and_embed_player_matches(event_descriptions_dir, model_name='jinaai/jina-embeddings-v2-small-en', 
                                batch_size=32, max_seq_length=2048):
    """
    Read player match files, generate embeddings, and create a structured DataFrame.
    Returns a single embedding per document using mean pooling.
    """
    device = setup_device()
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    model = model.to(device)
    model.eval()
    
    # First pass: count matches per player using consistent player name parsing
    player_match_counts = Counter()
    files = [f for f in os.listdir(event_descriptions_dir) if f.endswith('.txt')]
    
    for filename in files:
        # Use the same parsing logic as in the main processing
        player_id = filename.split('_')[-2]
        player_match_counts[player_id] += 1
    
    qualified_players = {player for player, count in player_match_counts.items() 
                        if count >= 10}
    
    qualified_files = [f for f in files if f.split('_')[-2] in qualified_players]
    print(f"Processing {len(qualified_files)} files from {len(qualified_players)} qualified players")
    
    results = []
    try:
        for i in tqdm(range(0, len(qualified_files), batch_size), desc="Processing batches"):
            batch_files = qualified_files[i:i + batch_size]
            batch_texts = []
            batch_info = []
            
            # Read batch of files
            for filename in batch_files:
                player_name = ' '.join(filename.split('_')[0:-2])

                player_id = filename.split('_')[-2]
                match_id = filename.split('_')[-1].replace('.txt', '')
                
                with open(os.path.join(event_descriptions_dir, filename), 'r') as f:
                    match_text = f.read()
                
                batch_texts.append(match_text)
                batch_info.append({
                    'player_name': player_name,
                    'player_id': player_id,
                    'match_id': match_id,
                    'text': match_text
                })
            
            # Generate embeddings for batch with proper pooling
            with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
                # If using the model's encode method (which handles pooling internally)
                try:
                    embeddings = model.encode(batch_texts, max_length=max_seq_length)
                except AttributeError:
                    # If encode isn't available, manually handle the forward pass and pooling
                    inputs = model.tokenizer(batch_texts, 
                                          padding=True, 
                                          truncation=True, 
                                          max_length=max_seq_length, 
                                          return_tensors="pt").to(device)
                    outputs = model(**inputs)
                    embeddings = mean_pooling(outputs).cpu().numpy()
                else:
                    if isinstance(embeddings, torch.Tensor):
                        embeddings = embeddings.cpu().numpy()
            
            # Clear CUDA cache after each batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            
            # Add results
            for info, embedding in zip(batch_info, embeddings):
                info['embedding'] = embedding
                results.append(info)
            
            # Explicitly delete batch data
            del embeddings
            del batch_texts
            del batch_info
    
    finally:
        # Clean up model resources
        model.cpu()
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    df = pd.DataFrame(results)
    
    return df

def save_embeddings(df, output_path='player_match_embeddings.parquet'):
    """Save the DataFrame with embeddings efficiently."""
    df['embedding'] = df['embedding'].apply(lambda x: x.tolist())
    # Exclude 'match_count' from being saved to the DataFrame or Parquet
    df = df.drop(columns=['match_count'], errors='ignore')
    df.to_parquet(output_path)
    print(f"Embeddings saved to {output_path}")

def convert_to_player_embeddings(df):
    """
    Convert the player_match embeddings DataFrame to a player-level embeddings DataFrame by averaging embeddings per player.
    """
    # Convert embeddings from list to numpy array for easier handling
    df['embedding'] = df['embedding'].apply(np.array)
    
    # Group by 'player_name' and average the embeddings for each player
    player_embeddings_df = df.groupby('player_id').agg(
    player_name=('player_name', 'first'),
    embedding=('embedding', 'mean')
).reset_index()

    
    return player_embeddings_df

if __name__ == "__main__":
    import time
    
    start_time = time.time()
    
    # Read and embed player matches
    embeddings_df = read_and_embed_player_matches(
        'event_descriptions',
        batch_size=1,
        max_seq_length=8192
    )
    
    # Convert to player embeddings (averaging across matches)
    player_embeddings_df = convert_to_player_embeddings(embeddings_df)
    
    # Save player embeddings
    save_embeddings(player_embeddings_df, 'player_embeddings.parquet')
    
    print(f"Processing completed in {time.time() - start_time:.2f} seconds")
    print(f"Total players: {player_embeddings_df['player_name'].nunique()}")
    print(f"Total player embeddings: {len(player_embeddings_df)}")


Using GPU: NVIDIA L4
Processing 9233 files from 376 qualified players


Processing batches: 100%|██████████| 9233/9233 [13:50<00:00, 11.11it/s]


Embeddings saved to player_embeddings.parquet
Processing completed in 834.44 seconds
Total players: 376
Total player embeddings: 376


In [13]:
import pandas as pd

# Load the DataFrame from the saved Parquet file
embeddings_df = pd.read_parquet('player_embeddings.parquet')

# Display the first few rows to check the data
print(embeddings_df.tail())


    player_id       player_name  \
430    9641.0  Fraizer Campbell   
431    9642.0       Alan Hutton   
432    9649.0      Jack Colback   
433    9930.0      Ryan Bennett   
434    9958.0    Steven Caulker   

                                             embedding  
430  [-0.28325140476226807, -0.26176461577415466, 0...  
431  [-0.21182601153850555, -0.21284833550453186, 0...  
432  [-0.21501609683036804, -0.21565166115760803, 0...  
433  [-0.19289526343345642, -0.2248104214668274, 0....  
434  [-0.2230857014656067, -0.24024677276611328, 0....  


In [3]:
import pandas as pd
from collections import Counter

def add_player_metadata_from_events(embeddings_file, events_df, output_file):
    """
    Load embeddings, add position and team data from events DataFrame, and save to a new file.
    
    Parameters:
    embeddings_file (str): Path to existing player embeddings parquet file
    events_df (pd.DataFrame): DataFrame containing events data with player, team, and position info
    output_file (str): Path to save the new embeddings with metadata
    """
    # Load existing embeddings
    print("Loading existing embeddings...")
    embeddings_df = pd.read_parquet(embeddings_file)
    
    # Get position and team from events data
    # Group by player and get the most frequent position and team
    player_data = events_df.groupby('player').agg({
        'position': lambda x: Counter(x).most_common(1)[0][0],
        'team': lambda x: Counter(x).most_common(1)[0][0]
    }).reset_index()
    
    # Rename columns to match our existing data
    player_data = player_data.rename(columns={'player': 'player_name'})
    
    # Merge position and team data with embeddings
    # Using left merge to keep all players from embeddings
    embeddings_df = embeddings_df.merge(player_data, on='player_name', how='left')
    
    # Save the updated DataFrame
    print("Saving updated embeddings...")
    embeddings_df.to_parquet(output_file)
    
    # Print statistics
    print("\nDataset statistics:")
    print(f"Total players: {len(embeddings_df)}")
    print(f"Players with position data: {embeddings_df['position'].notna().sum()}")
    print(f"Players with team data: {embeddings_df['team'].notna().sum()}")
    
    print("\nPosition distribution:")
    print(embeddings_df['position'].value_counts())
    
    print("\nTeam distribution:")
    print(embeddings_df['team'].value_counts())
    
    # Print players missing metadata
    missing_metadata = embeddings_df[embeddings_df['position'].isna() | embeddings_df['team'].isna()]
    if len(missing_metadata) > 0:
        print("\nPlayers missing metadata:")
        print(missing_metadata['player_name'].tolist())
    
    return embeddings_df

# Example usage:

events = sb.competition_events(
    country="England",
    division= "Premier League",
    season="2015/2016",
    gender="male"
)

# Assuming you have your events DataFrame loaded
updated_df = add_player_metadata_from_events(
    embeddings_file='player_embeddings.parquet',
    events_df=events,  # Your events DataFrame
    output_file='player_embeddings_with_metadata.parquet'
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Loading existing embeddings...
Saving updated embeddings...

Dataset statistics:
Total players: 376
Players with position data: 376
Players with team data: 376

Position distribution:
position
Center Forward               50
Left Back                    36
Right Back                   35
Right Center Back            34
Left Center Back             31
Left Wing                    28
Right Wing                   26
Center Attacking Midfield    24
Left Defensive Midfield      22
Right Center Midfield        20
Right Defensive Midfield     18
Left Center Midfield         15
Left Midfield                12
Right Midfield               10
Center Defensive Midfield     7
Right Center Forward          5
Left Center Forward           3
Name: count, dtype: int64

Team distribution:
team
Sunderland              22
Norwich City            20
Aston Villa             20
Manchester City         20
West Ham United         20
Arsenal                 20
Chelsea                 20
Stoke City             

In [4]:
print(updated_df)

    player_id                         player_name  \
0     10457.0                     Patrick Bamford   
1     10499.0  Ivo Daniel Ferreira Mendonca Pinto   
2     10503.0                         Billy Jones   
3     10779.0           Cameron Borthwick-Jackson   
4     10783.0                      Lee Cattermole   
..        ...                                 ...   
371    9638.0                       Jack Grealish   
372    9641.0                    Fraizer Campbell   
373    9642.0                         Alan Hutton   
374    9649.0                        Jack Colback   
375    9930.0                        Ryan Bennett   

                                             embedding  \
0    [-0.25265589356422424, -0.264048308134079, 0.1...   
1    [-0.21007807552814484, -0.22289517521858215, 0...   
2    [-0.20582975447177887, -0.21635812520980835, 0...   
3    [-0.20873406529426575, -0.20124152302742004, 0...   
4    [-0.20809024572372437, -0.2177758365869522, 0....   
..             