In [1]:
import os
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS
import altair as alt

In [2]:
# Per-team CSV files consumed by the analysis cells below.
# The names are bare file names, so they are looked up relative to the
# notebook's current working directory.
team_files = [
    'Augsburg.csv', 'Bayern_Munich.csv', 'Bayer_Leverkusen.csv',
    'Bochum.csv', 'Borussia_Dortmund.csv', 'Borussia_Mönchengladbach.csv',
    'Eintracht_Frankfurt.csv', 'FC_Heidenheim.csv', 'Freiburg.csv',
    'FSV_Mainz_05.csv', 'Hoffenheim.csv', 'Holstein_Kiel.csv',
    'RB_Leipzig.csv', 'St_Pauli.csv', 'Union_Berlin.csv',
    'VfB_Stuttgart.csv', 'Werder_Bremen.csv', 'Wolfsburg.csv',
]

In [3]:
def analyze_team_shots(file_path):
    """Summarise the shot sequences stored in one team's CSV file.

    A row is treated as a shot sequence when its ``words`` value contains the
    substring ``'Shot'``. Rows whose ``words`` value is missing are never shot
    sequences, but they still count towards ``Total_Sequences``.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with at least the columns ``team_name``, ``words``, ``K`` and
        ``beta``. ``words`` is split on ``'-'`` to count the actions in a
        sequence (e.g. ``Pass-Carry-Shot`` has length 3).

    Returns
    -------
    dict
        ``Team``, ``Total_Sequences``, ``Shot_Count``, ``Shot_Rate_%``,
        ``Avg_Shot_Length``, ``Signature_Shot_Sequence``, ``Avg_K_Shot`` and
        ``Avg_Beta_Shot``. When the file has no shot sequence the numeric
        shot statistics are 0 and the signature is ``"N/A"``; if the file has
        no rows at all, ``Team`` falls back to ``file_path``.
    """
    df = pd.read_csv(file_path)

    total_seqs = len(df)
    # na=False: a missing `words` value would otherwise put NaN into the mask
    # and make the boolean indexing below raise. regex=False because 'Shot'
    # is a plain substring, not a pattern.
    shot_mask = df['words'].str.contains('Shot', regex=False, na=False)
    shot_df = df[shot_mask].copy()
    num_shots = len(shot_df)

    if num_shots == 0:
        return {
            'Team': df['team_name'].iloc[0] if not df.empty else file_path,
            'Total_Sequences': total_seqs,
            'Shot_Count': 0,
            'Shot_Rate_%': 0,
            'Avg_Shot_Length': 0,
            'Signature_Shot_Sequence': "N/A",
            'Avg_K_Shot': 0,
            'Avg_Beta_Shot': 0
        }

    # Length of each sequence = number of '-'-separated actions.
    shot_df['len'] = shot_df['words'].str.split('-').str.len()

    # Signature sequence: the most common shot sequence with at least
    # 3 actions (e.g. Pass-Carry-Shot), so that very short sequences do not
    # dominate the result.
    elaborated_shots = shot_df[shot_df['len'] >= 3]

    if not elaborated_shots.empty:
        signature = elaborated_shots['words'].value_counts().idxmax()
    else:
        # No sequence with 3+ actions (rare): fall back to the most frequent
        # shot sequence overall.
        signature = shot_df['words'].value_counts().idxmax()

    return {
        'Team': df['team_name'].iloc[0],
        'Total_Sequences': total_seqs,
        'Shot_Count': num_shots,
        'Shot_Rate_%': (num_shots / total_seqs) * 100,
        'Avg_Shot_Length': shot_df['len'].mean(),
        'Signature_Shot_Sequence': signature,
        'Avg_K_Shot': shot_df['K'].mean(),
        'Avg_Beta_Shot': shot_df['beta'].mean()
    }

# Build the per-team summary table.
# Only team files that actually exist on disk are analysed; missing ones are
# skipped without an error.
available_files = list(filter(os.path.exists, team_files))

# One summary record (dict) per available team, assembled into a frame once.
results = [analyze_team_shots(path) for path in available_files]
summary_df = pd.DataFrame(results)

In [4]:
# 1. Data collection from all available files
# One record per team: share of sequences containing a shot and the mean K of
# those shot sequences. Team files missing from disk are skipped.
results = []
for file in team_files:
    if os.path.exists(file):
        df = pd.read_csv(file)
        # Filter sequences that lead to a shot.
        # na=False: rows with a missing `words` value are treated as
        # non-shots; without it the mask contains NaN and the boolean
        # indexing raises. regex=False because 'Shot' is a plain substring.
        shot_df = df[df['words'].str.contains('Shot', regex=False, na=False)].copy()

        # Teams without any shot sequence are left out of the plot.
        if not shot_df.empty:
            results.append({
                'Team': df['team_name'].iloc[0],
                'Shot_Rate_%': (len(shot_df) / len(df)) * 100,
                'Avg_K_Shot': shot_df['K'].mean()
            })

df_plot = pd.DataFrame(results)

# Fail early with a clear message: an empty frame has none of the columns the
# chart encodings and the mean lines below rely on.
if df_plot.empty:
    raise ValueError(
        "No shot sequences found: none of the files in team_files exist "
        "or none of them contain a 'Shot' sequence."
    )

# 2. Creation of the Interactive Scatter Plot
# Define the bubble chart
chart = alt.Chart(df_plot).mark_circle(size=250, opacity=0.8).encode(
    x=alt.X(
        'Shot_Rate_%',
        title='Efficiency (Shot Rate %)',
        scale=alt.Scale(zero=False)
    ),
    y=alt.Y(
        'Avg_K_Shot',
        title='Tactical Complexity (Avg K)',
        scale=alt.Scale(zero=False)
    ),
    color=alt.Color('Team', legend=None),  # Distinct colors, legend hidden for clarity
    tooltip=[
        alt.Tooltip('Team', title='Team'),
        alt.Tooltip('Shot_Rate_%', title='Efficiency %', format='.2f'),
        alt.Tooltip('Avg_K_Shot', title='Complexity (K)', format='.4f')
    ]
).properties(
    title="Offensive Language Map: Efficiency vs Complexity",
    width=800,
    height=500
)

# 3. Add labels (team names)
# Derived from `chart`, so the labels reuse its x/y/color encodings and are
# shifted 12 px to the right of each bubble.
text = chart.mark_text(
    align='left',
    baseline='middle',
    dx=12,
    fontSize=11
).encode(
    text='Team'
)

# 4. Mean lines (to identify the four quadrants)
# Each rule is drawn from a one-row frame holding the mean across teams.
line_x = alt.Chart(
    pd.DataFrame({'x': [df_plot['Shot_Rate_%'].mean()]})
).mark_rule(
    color='red',
    strokeDash=[3, 3]
).encode(x='x')

line_y = alt.Chart(
    pd.DataFrame({'y': [df_plot['Avg_K_Shot'].mean()]})
).mark_rule(
    color='blue',
    strokeDash=[3, 3]
).encode(y='y')

# 5. Final interactive visualization
final_chart = (chart + text + line_x + line_y).interactive()
final_chart.display()

In [5]:
# 1. Data collection from all available files
# One record per team: mean length and mean K of the sequences that contain a
# shot. Team files missing from disk are skipped.
results = []
for file in team_files:
    if os.path.exists(file):
        df = pd.read_csv(file)
        # Filter sequences that lead to a shot.
        # na=False: rows with a missing `words` value are treated as
        # non-shots; without it the mask contains NaN and the boolean
        # indexing raises. regex=False because 'Shot' is a plain substring.
        shot_df = df[df['words'].str.contains('Shot', regex=False, na=False)].copy()

        # Teams without any shot sequence are left out of the plot.
        if not shot_df.empty:
            # Calculate sequence length (number of '-'-separated actions)
            # for each shot sequence
            shot_df['len'] = shot_df['words'].str.split('-').str.len()

            results.append({
                'Team': df['team_name'].iloc[0],
                'Avg_Shot_Length': shot_df['len'].mean(),
                'Avg_K_Shot': shot_df['K'].mean()
            })

df_plot = pd.DataFrame(results)

# Fail early with a clear message: an empty frame has none of the columns the
# chart encodings and the mean lines below rely on.
if df_plot.empty:
    raise ValueError(
        "No shot sequences found: none of the files in team_files exist "
        "or none of them contain a 'Shot' sequence."
    )

# 2. Interactive Scatter Plot Creation with Altair
chart = alt.Chart(df_plot).mark_circle(size=250, opacity=0.8).encode(
    x=alt.X('Avg_Shot_Length',
            title='Average Shot Sequence Length (Avg_Shot_Length)',
            scale=alt.Scale(zero=False)),
    y=alt.Y('Avg_K_Shot',
            title='Tactical Complexity (Avg_K_Shot)',
            scale=alt.Scale(zero=False)),
    color=alt.Color('Team', legend=None),  # Different color for each team
    tooltip=[
        alt.Tooltip('Team', title='Team'),
        alt.Tooltip('Avg_Shot_Length', title='Avg Length', format='.2f'),
        alt.Tooltip('Avg_K_Shot', title='Complexity (K)', format='.4f')
    ]
).properties(
    title="Offensive Language Analysis: Build-up vs. Variety",
    width=800,
    height=500
)

# 3. Adding team names next to the data points
# Derived from `chart`, so the labels reuse its x/y/color encodings and are
# shifted 12 px to the right of each bubble.
text = chart.mark_text(
    align='left',
    baseline='middle',
    dx=12,
    fontSize=11
).encode(
    text='Team'
)

# 4. Adding average reference lines (Quadrants)
# Each rule is drawn from a one-row frame holding the mean across teams.
line_x = alt.Chart(pd.DataFrame({'x': [df_plot['Avg_Shot_Length'].mean()]})).mark_rule(
    color='red', strokeDash=[3, 3], size=1
).encode(x='x')

line_y = alt.Chart(pd.DataFrame({'y': [df_plot['Avg_K_Shot'].mean()]})).mark_rule(
    color='blue', strokeDash=[3, 3], size=1
).encode(y='y')

# 5. Final interactive visualization (Zoom + Tooltip)
final_chart = (chart + text + line_x + line_y).interactive()
final_chart.display()