In [None]:
# ================================================================================================
# CELL 1: Imports and Setup
# ================================================================================================

import numpy as np
import pandas as pd
import math
import ast
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit, when, isnotnull, isnull, regexp_extract, split, round as spark_round, sqrt, pow as spark_pow, udf, regexp_replace
from pyspark.sql.types import FloatType, IntegerType, StringType, StructType, StructField, ArrayType
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
import seaborn as sns

✅ Spark Session initialized successfully!
Spark Version: 4.0.0


In [None]:
# Build (or fetch the already-running) Spark session with adaptive query
# execution and adaptive partition coalescing switched on
spark = (
    SparkSession.builder
    .appName("PreprocessingDemo")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

print("✅ Spark Session initialized successfully!")
print(f"Spark Version: {spark.version}")

In [None]:
# ================================================================================================
# CELL 2: Constants and Configuration (Simulating xG_constants.py)
# ================================================================================================

# Simulating the constants from xG_constants.py
SEASON = "2023-24"

# Raw event columns read by the pipeline, listed in their original order
EVENTS = (
    # columns shared by every event
    ['id', 'type', 'season', 'location', 'player_id', 'team_id']
    # shot-specific columns
    + ['shot_end_location', 'shot_body_part', 'shot_type', 'shot_outcome']
    + ['shot_freeze_frame', 'shot_statsbomb_xg', 'shot_one_on_one']
    + ['under_pressure']
    # pass-specific columns
    + ['pass_assisted_shot_id', 'pass_body_part']
    + ['pass_height', 'pass_angle', 'pass_length', 'pass_end_location']
)

# Categorical column -> {category value: name of the 0/1 indicator column}
DUMMIES = {
    'shot_type': dict([
        ('Open Play', 'open_play'),
        ('Free Kick', 'free_kick'),
        ('Corner', 'corner'),
        ('Penalty', 'pk_type'),
    ]),
    'shot_outcome': dict([
        ('Goal', 'goal_outcome'),
        ('Saved', 'saved'),
        ('Blocked', 'blocked'),
        ('Off T', 'off_target'),
    ]),
}

# Boolean columns that are meant to be turned into 0/1 integers
BOOL_TO_INT = [
    'under_pressure',
    'shot_one_on_one',
    'preferred_foot_shot',
]

# Model inputs
FEATURES = [
    'distance_to_goal',
    'shot_angle',
    'header',
    'preferred_foot_shot',
    'players_inside_area',
    'assisted',
    'pass_height',
    'pass_angle',
    'pass_length',
    'open_play',
    'free_kick',
    'corner',
    'pk_type',
    'under_pressure',
    'shot_one_on_one',
]

# Columns kept in the final dataset: identifier and target first, then every
# model input, then the StatsBomb-derived prediction
VARIABLES = ['id', 'goal', *FEATURES, 'sb_prediction']

print("✅ Constants defined successfully!")
print(f"Total events to track: {len(EVENTS)}")
print(f"Features for ML model: {len(FEATURES)}")

✅ Constants defined successfully!
Total events to track: 20
Features for ML model: 15


In [4]:
# ================================================================================================
# CELL 3: Sample Data Generation
# ================================================================================================

# Create sample data that mimics real football event data.
# Every record is a flat dict carrying the same 20 keys as the EVENTS list:
#   - coordinates ('location', 'shot_end_location', 'pass_end_location') are
#     stored as strings of the form '[x, y]', not as Python lists;
#   - 'shot_freeze_frame' is a single string of comma-separated dict literals
#     in which a {'location': ...} dict is followed by a dict holding that
#     player's 'position' and 'teammate' values;
#   - attributes that do not apply to an event are set to None.
sample_data = [
    # Shot 1: Open play goal
    {
        'id': 'shot_001',
        'type': 'Shot',
        'season': '2023-24',
        'location': '[30.0, 40.0]',
        'player_id': 'player_001',
        'team_id': 'team_A',
        'shot_end_location': '[120.0, 40.0]',
        'shot_body_part': 'Right Foot',
        'shot_type': 'Open Play',
        'shot_outcome': 'Goal',
        'shot_freeze_frame': "{'location': [25.0, 35.0]}, {'position': 'Center Back', 'teammate': True}, {'location': [28.0, 42.0]}, {'position': 'Right Back', 'teammate': False}",
        'shot_statsbomb_xg': 0.75,
        'shot_one_on_one': False,
        'under_pressure': True,
        'pass_assisted_shot_id': None,
        'pass_body_part': None,
        'pass_height': None,
        'pass_angle': None,
        'pass_length': None,
        'pass_end_location': None
    },
    # Shot 2: Penalty (no freeze frame recorded)
    {
        'id': 'shot_002',
        'type': 'Shot',
        'season': '2023-24',
        'location': '[108.0, 40.0]',
        'player_id': 'player_002',
        'team_id': 'team_B',
        'shot_end_location': '[120.0, 36.0]',
        'shot_body_part': 'Left Foot',
        'shot_type': 'Penalty',
        'shot_outcome': 'Saved',
        'shot_freeze_frame': None,
        'shot_statsbomb_xg': 0.8,
        'shot_one_on_one': True,
        'under_pressure': False,
        'pass_assisted_shot_id': None,
        'pass_body_part': None,
        'pass_height': None,
        'pass_angle': None,
        'pass_length': None,
        'pass_end_location': None
    },
    # Shot 3: Header from corner; its 'pass_assisted_shot_id' points at pass_001
    {
        'id': 'shot_003',
        'type': 'Shot',
        'season': '2023-24',
        'location': '[114.0, 42.0]',
        'player_id': 'player_003',
        'team_id': 'team_A',
        'shot_end_location': '[120.0, 38.0]',
        'shot_body_part': 'Head',
        'shot_type': 'Corner',
        'shot_outcome': 'Off T',
        'shot_freeze_frame': "{'location': [110.0, 38.0]}, {'position': 'Center Back', 'teammate': False}, {'location': [116.0, 44.0]}, {'position': 'Right Back', 'teammate': False}",
        'shot_statsbomb_xg': 0.25,
        'shot_one_on_one': False,
        'under_pressure': True,
        'pass_assisted_shot_id': 'pass_001',
        'pass_body_part': None,
        'pass_height': None,
        'pass_angle': None,
        'pass_length': None,
        'pass_end_location': None
    },
    # Pass 1: Assist for shot 3; its 'pass_assisted_shot_id' points back at
    # shot_003 and its end location equals that shot's start location
    {
        'id': 'pass_001',
        'type': 'Pass',
        'season': '2023-24',
        'location': '[105.0, 2.0]',
        'player_id': 'player_004',
        'team_id': 'team_A',
        'shot_end_location': None,
        'shot_body_part': None,
        'shot_type': None,
        'shot_outcome': None,
        'shot_freeze_frame': None,
        'shot_statsbomb_xg': None,
        'shot_one_on_one': None,
        'under_pressure': True,
        'pass_assisted_shot_id': 'shot_003',
        'pass_body_part': 'Right Foot',
        'pass_height': 'High Pass',
        'pass_angle': 1.2,
        'pass_length': 15.8,
        'pass_end_location': '[114.0, 42.0]'
    }
]

# Create DataFrame from the list of dicts. No schema is passed, so column
# names and types are left to Spark's inference over the records.
df = spark.createDataFrame(sample_data)

print("✅ Sample data created successfully!")
print(f"Total records: {df.count()}")
print("\n📊 Data Preview:")
# truncate=False prints the long freeze-frame strings in full
df.show(truncate=False)

✅ Sample data created successfully!


                                                                                

Total records: 4

📊 Data Preview:
+--------+-------------+----------+---------------------+--------------+-----------------+-----------+-----------+----------+-------+--------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+------------+-----------------+---------+-------+----+--------------+
|id      |location     |pass_angle|pass_assisted_shot_id|pass_body_part|pass_end_location|pass_height|pass_length|player_id |season |shot_body_part|shot_end_location|shot_freeze_frame                                                                                                                                      |shot_one_on_one|shot_outcome|shot_statsbomb_xg|shot_type|team_id|type|under_pressure|
+--------+-------------+----------+---------------------+--------------+-----------------+-----------+-----------+----------+-------+--------------+----------

In [5]:
# ================================================================================================
# CELL 4: Static Method Demonstrations
# ================================================================================================

class PreprocessingDemo:
    """Demo version of the Preprocessing class with detailed explanations"""

    @staticmethod
    def shot_angle(shot_x, shot_y, GOAL_X=120, GOAL_Y1=36, GOAL_Y2=44, verbose=True):
        """
        Calculate the angle between vectors from shot to each goal post

        Args:
            shot_x: X coordinate of the shot.
            shot_y: Y coordinate of the shot.
            GOAL_X: X coordinate shared by both goal posts.
            GOAL_Y1: Y coordinate of the first ("lower") post.
            GOAL_Y2: Y coordinate of the second ("upper") post.
            verbose: When True (the default) every intermediate value is
                printed. Pass False to silence the step-by-step trace, e.g.
                when the method is called once per row.

        Returns:
            The angle in degrees, in the range [0, 180]. 0.0 is returned when
            the shot position coincides with one of the posts, because the
            angle is undefined there.
        """
        # Route all trace output through one switchable callable
        log = print if verbose else (lambda *args, **kwargs: None)

        log(f"  🎯 Calculating shot angle for position ({shot_x}, {shot_y})")
        log(f"  📍 Goal posts at ({GOAL_X}, {GOAL_Y1}) and ({GOAL_X}, {GOAL_Y2})")

        # Vectors from shot to each goal post (both share the same x component)
        u_x = GOAL_X - shot_x
        u_y = GOAL_Y1 - shot_y
        v_y = GOAL_Y2 - shot_y

        log(f"  📐 Vector to lower post: ({u_x}, {u_y})")
        log(f"  📐 Vector to upper post: ({u_x}, {v_y})")

        # Calculate dot product and magnitudes
        dot_product = u_x ** 2 + u_y * v_y
        magnitude_u = math.sqrt(u_x ** 2 + u_y ** 2)
        magnitude_v = math.sqrt(u_x ** 2 + v_y ** 2)

        log(f"  🔢 Dot product: {dot_product:.2f}")
        log(f"  🔢 Magnitudes: {magnitude_u:.2f}, {magnitude_v:.2f}")

        if magnitude_u == 0 or magnitude_v == 0:
            log("  ⚠️  Zero magnitude detected, returning 0")
            return 0.0

        # Floating-point rounding can push the cosine a hair outside [-1, 1]
        # (nearly parallel vectors), which would make math.acos raise
        # ValueError; clamp it back into the valid domain first.
        cos_angle = dot_product / (magnitude_u * magnitude_v)
        cos_angle = max(-1.0, min(1.0, cos_angle))

        # Calculate angle
        angle_radians = math.acos(cos_angle)
        angle_degrees = math.degrees(angle_radians)

        log(f"  📊 Final angle: {angle_degrees:.2f}°")
        return angle_degrees

# Exercise the shot angle calculation on three pitch positions
print("🎯 TESTING SHOT ANGLE CALCULATION")
print("=" * 50)

test_positions = [
    (30.0, 40.0),   # Far shot
    (114.0, 42.0),  # Close shot
    (108.0, 40.0),  # Penalty position
]

for pos in test_positions:
    # Each tuple is (x, y); unpack it straight into the call
    angle = PreprocessingDemo.shot_angle(*pos)
    print(f"Position {pos} → Angle: {angle:.2f}°\n")

🎯 TESTING SHOT ANGLE CALCULATION
  🎯 Calculating shot angle for position (30.0, 40.0)
  📍 Goal posts at (120, 36) and (120, 44)
  📐 Vector to lower post: (90.0, -4.0)
  📐 Vector to upper post: (90.0, 4.0)
  🔢 Dot product: 8084.00
  🔢 Magnitudes: 90.09, 90.09
  📊 Final angle: 5.09°
Position (30.0, 40.0) → Angle: 5.09°

  🎯 Calculating shot angle for position (114.0, 42.0)
  📍 Goal posts at (120, 36) and (120, 44)
  📐 Vector to lower post: (6.0, -6.0)
  📐 Vector to upper post: (6.0, 2.0)
  🔢 Dot product: 24.00
  🔢 Magnitudes: 8.49, 6.32
  📊 Final angle: 63.43°
Position (114.0, 42.0) → Angle: 63.43°

  🎯 Calculating shot angle for position (108.0, 40.0)
  📍 Goal posts at (120, 36) and (120, 44)
  📐 Vector to lower post: (12.0, -4.0)
  📐 Vector to upper post: (12.0, 4.0)
  🔢 Dot product: 128.00
  🔢 Magnitudes: 12.65, 12.65
  📊 Final angle: 36.87°
Position (108.0, 40.0) → Angle: 36.87°



In [6]:
# ================================================================================================
# CELL 5: Location Splitting Demonstration
# ================================================================================================

def demonstrate_split_location(df):
    """Show how location strings are split into X,Y coordinates

    Reads the string column ``location`` (formatted like ``[x, y]``) and adds
    ``shot_location_x`` / ``shot_location_y`` as floats, then splits
    ``shot_end_location`` into ``shot_end_x`` / ``shot_end_y`` as doubles.
    Intermediate results are printed along the way.

    The extraction patterns accept any amount of whitespace around the
    numbers and the comma, and the Y pattern stops at the next comma or the
    closing bracket, so a string with a third component still yields the
    second value.

    Args:
        df: Spark DataFrame with at least ``id``, ``location`` and
            ``shot_end_location`` columns.

    Returns:
        The DataFrame with the four coordinate columns added and the original
        ``location`` column dropped.
    """
    print("📍 LOCATION SPLITTING DEMONSTRATION")
    print("=" * 50)

    print("🔍 Original location format:")
    df.select("id", "location").show(truncate=False)

    # First number: everything between the opening bracket and the first comma
    x_pattern = r'\[\s*([^,\]\s]+)\s*,'
    # Second number: everything between the first comma and the next comma
    # or the closing bracket
    y_pattern = r',\s*([^,\]\s]+)\s*[,\]]'

    print("\n🔧 Extracting X coordinates...")
    df_with_x = df.withColumn("shot_location_x",
                             regexp_extract(col("location"), x_pattern, 1).cast("float"))

    print("🔧 Extracting Y coordinates...")
    df_with_xy = df_with_x.withColumn("shot_location_y",
                                     regexp_extract(col("location"), y_pattern, 1).cast("float"))

    print("\n✅ Result:")
    df_with_xy.select("id", "location", "shot_location_x", "shot_location_y").show()

    # Also handle shot_end_location if present: strip the brackets, then split
    # on the comma and take the first two pieces
    print("\n🔧 Processing shot end locations...")
    df_final = df_with_xy.withColumn("shot_end_location_clean",
                                    regexp_replace(col("shot_end_location"), "[\\[\\]]", "")) \
                         .withColumn("shot_end_x",
                                    split(col("shot_end_location_clean"), ",")[0].cast("double")) \
                         .withColumn("shot_end_y",
                                    split(col("shot_end_location_clean"), ",")[1].cast("double")) \
                         .drop("shot_end_location_clean", "location")

    print("✅ Final result with end locations:")
    df_final.select("id", "shot_location_x", "shot_location_y", "shot_end_x", "shot_end_y").show()

    return df_final

# Run the demonstration
df_with_coords = demonstrate_split_location(df)

📍 LOCATION SPLITTING DEMONSTRATION
🔍 Original location format:
+--------+-------------+
|id      |location     |
+--------+-------------+
|shot_001|[30.0, 40.0] |
|shot_002|[108.0, 40.0]|
|shot_003|[114.0, 42.0]|
|pass_001|[105.0, 2.0] |
+--------+-------------+


🔧 Extracting X coordinates...
🔧 Extracting Y coordinates...

✅ Result:
+--------+-------------+---------------+---------------+
|      id|     location|shot_location_x|shot_location_y|
+--------+-------------+---------------+---------------+
|shot_001| [30.0, 40.0]|           30.0|           40.0|
|shot_002|[108.0, 40.0]|          108.0|           40.0|
|shot_003|[114.0, 42.0]|          114.0|           42.0|
|pass_001| [105.0, 2.0]|          105.0|            2.0|
+--------+-------------+---------------+---------------+


🔧 Processing shot end locations...
✅ Final result with end locations:
+--------+---------------+---------------+----------+----------+
|      id|shot_location_x|shot_location_y|shot_end_x|shot_end_y|
+-----

In [7]:
# ================================================================================================
# CELL 6: Distance to Goal Calculation
# ================================================================================================

def demonstrate_distance_calculation(df):
    """Add a ``distance_to_goal`` column and print it for every located row.

    The distance is the Euclidean distance from (``shot_location_x``,
    ``shot_location_y``) to the midpoint between the two goal posts, rounded
    to two decimals. Rows whose ``shot_location_x`` is null are skipped in the
    printed report but are still part of the returned DataFrame.
    """
    print("📏 DISTANCE TO GOAL CALCULATION")
    print("=" * 50)

    GOAL_X, GOAL_Y1, GOAL_Y2 = 120, 36, 44
    goal_center_y = (GOAL_Y1 + GOAL_Y2) / 2

    print(f"🥅 Goal center position: ({GOAL_X}, {goal_center_y})")

    # Build the Euclidean formula piece by piece
    delta_x = col("shot_location_x") - lit(GOAL_X)
    delta_y = col("shot_location_y") - lit(goal_center_y)
    euclidean = sqrt(spark_pow(delta_x, 2) + spark_pow(delta_y, 2))
    df_with_distance = df.withColumn("distance_to_goal", spark_round(euclidean, 2))

    print("\n📊 Distance calculations:")
    report_columns = ("id", "shot_location_x", "shot_location_y", "distance_to_goal")
    for row in df_with_distance.select(*report_columns).collect():
        if row.shot_location_x is None:
            continue
        print(f"  Shot {row.id}: Position ({row.shot_location_x}, {row.shot_location_y}) → Distance: {row.distance_to_goal}m")

    return df_with_distance

# Run the demonstration
df_with_distance = demonstrate_distance_calculation(df_with_coords)

📏 DISTANCE TO GOAL CALCULATION
🥅 Goal center position: (120, 40.0)

📊 Distance calculations:
  Shot shot_001: Position (30.0, 40.0) → Distance: 90.0m
  Shot shot_002: Position (108.0, 40.0) → Distance: 12.0m
  Shot shot_003: Position (114.0, 42.0) → Distance: 6.32m
  Shot pass_001: Position (105.0, 2.0) → Distance: 40.85m


In [8]:

# ================================================================================================
# CELL 7: Preferred Foot Analysis
# ================================================================================================

def demonstrate_preferred_foot_analysis(df):
    """Derive each player's preferred foot from their foot passes and shots.

    Only actions taken with 'Right Foot' or 'Left Foot' are counted. A player
    is labelled with a foot when at least 66% of their counted actions used
    it, and 'Two-Footed' otherwise. Every intermediate table is printed.

    Returns:
        DataFrame with columns ``player_id`` and ``preferred_foot``.
    """
    def _foot_actions(event_type, body_part_col):
        # Foot-only actions of one event type as (player_id, body_part)
        return (
            df.filter(col('type') == event_type)
              .select('player_id', col(body_part_col).alias('body_part'))
              .filter(col('body_part').isin('Right Foot', 'Left Foot'))
        )

    print("🦶 PREFERRED FOOT ANALYSIS")
    print("=" * 50)

    print("📊 Analyzing body parts used in passes and shots...")

    pass_bp = _foot_actions('Pass', 'pass_body_part')
    print("Pass body parts:")
    pass_bp.show()

    shot_bp = _foot_actions('Shot', 'shot_body_part')
    print("Shot body parts:")
    shot_bp.show()

    # Stack passes on top of shots
    bp = pass_bp.union(shot_bp)
    print("Combined body part usage:")
    bp.show()

    # One 0/1 indicator per foot, replacing the text column
    bp_mapped = bp.select(
        'player_id',
        (col('body_part') == 'Left Foot').cast('int').alias('left_foot'),
        (col('body_part') == 'Right Foot').cast('int').alias('right_foot'),
    )
    print("Numerical indicators:")
    bp_mapped.show()

    # Per-player totals; strip the "sum(...)" wrapper from the column names
    foot_counts = bp_mapped.groupBy('player_id').sum('left_foot', 'right_foot')
    for foot in ('left_foot', 'right_foot'):
        foot_counts = foot_counts.withColumnRenamed(f'sum({foot})', foot)
    foot_counts = foot_counts.withColumn("total_actions", col("left_foot") + col("right_foot"))

    print("Foot usage counts:")
    foot_counts.show()

    # Share of actions per foot decides the label; left is checked first
    left_share = col("left_foot") / col("total_actions")
    right_share = col("right_foot") / col("total_actions")
    label = (
        when(left_share >= 0.66, "Left Foot")
        .when(right_share >= 0.66, "Right Foot")
        .otherwise("Two-Footed")
    )
    foot_counts = foot_counts.withColumn("preferred_foot", label)

    print("✅ Preferred foot determination:")
    preferred = foot_counts.select("player_id", "preferred_foot")
    preferred.show()

    return preferred

# Run the demonstration
preferred_foot_df = demonstrate_preferred_foot_analysis(df_with_distance)

🦶 PREFERRED FOOT ANALYSIS
📊 Analyzing body parts used in passes and shots...
Pass body parts:
+----------+----------+
| player_id| body_part|
+----------+----------+
|player_004|Right Foot|
+----------+----------+

Shot body parts:
+----------+----------+
| player_id| body_part|
+----------+----------+
|player_001|Right Foot|
|player_002| Left Foot|
+----------+----------+

Combined body part usage:
+----------+----------+
| player_id| body_part|
+----------+----------+
|player_004|Right Foot|
|player_001|Right Foot|
|player_002| Left Foot|
+----------+----------+

Numerical indicators:
+----------+---------+----------+
| player_id|left_foot|right_foot|
+----------+---------+----------+
|player_004|        0|         1|
|player_001|        0|         1|
|player_002|        1|         0|
+----------+---------+----------+

Foot usage counts:
+----------+---------+----------+-------------+
| player_id|left_foot|right_foot|total_actions|
+----------+---------+----------+-------------+
|pla

In [9]:

# ================================================================================================
# CELL 8: Freeze Frame Data Processing
# ================================================================================================

def _parse_freeze_frame(frame_data):
    """Turn one freeze-frame string into a list of per-player records.

    The string is evaluated as Python literals (safely, via
    ``ast.literal_eval``) and is expected to contain dicts. A dict that has a
    'location' key starts a new player; a following dict without 'location'
    (e.g. one holding 'position' and 'teammate') is merged into that player.
    Players without a usable two-number location are dropped.

    Returns:
        List of dicts with keys 'x', 'y' (floats), 'position' (str, 'Unknown'
        when absent) and 'teammate' (bool, or None when absent).

    Raises:
        ValueError, SyntaxError, TypeError: if the string is not a literal
            dict or a sequence of literals.
    """
    parsed = ast.literal_eval(frame_data)
    entries = [parsed] if isinstance(parsed, dict) else list(parsed)

    # Group the flat sequence of dicts into one merged dict per player
    merged = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        if 'location' in entry or not merged:
            merged.append({})
        merged[-1].update(entry)

    players = []
    for raw in merged:
        location = raw.get('location')
        if not isinstance(location, (list, tuple)) or len(location) < 2:
            continue
        try:
            x, y = float(location[0]), float(location[1])
        except (TypeError, ValueError):
            continue

        position = raw.get('position', 'Unknown')
        if isinstance(position, dict):
            # presumably a nested {'id': ..., 'name': ...} form; verify against real data
            position = position.get('name', 'Unknown')

        teammate = raw.get('teammate')
        players.append({
            'x': x,
            'y': y,
            'position': str(position),
            'teammate': teammate if isinstance(teammate, bool) else None,
        })
    return players


def demonstrate_freeze_frame_processing(df):
    """Show how freeze frame data is processed

    Takes every non-penalty row with a non-null ``shot_freeze_frame``,
    parses the freeze-frame string on the driver and returns one row per
    player found in it. The ``position`` and ``teammate`` values come from
    the freeze-frame data itself. A frame that cannot be parsed is reported
    and skipped.

    Args:
        df: Spark DataFrame with ``id``, ``shot_type`` and
            ``shot_freeze_frame`` columns.

    Returns:
        DataFrame with columns Shot_id, player_num, position, teammate, x, y,
        or None when no player could be extracted.
    """
    print("🎬 FREEZE FRAME DATA PROCESSING")
    print("=" * 50)

    # Filter for shots with freeze frame data
    shots_with_frames = df.filter(col('shot_type') != 'Penalty') \
                         .select('id', 'shot_freeze_frame') \
                         .filter(col('shot_freeze_frame').isNotNull())

    print("Shots with freeze frame data:")
    shots_with_frames.show(truncate=False)

    print("\n🔧 Processing freeze frame data...")

    processed_frames = []
    for row in shots_with_frames.collect():
        shot_id = row.id
        frame_data = row.shot_freeze_frame

        print(f"\n📊 Processing shot {shot_id}:")
        print(f"Raw data: {frame_data}")

        if not frame_data:
            continue

        try:
            players = _parse_freeze_frame(frame_data)
        except (ValueError, SyntaxError, TypeError) as exc:
            print(f"⚠️  Could not parse freeze frame for shot {shot_id}: {exc}")
            continue

        coordinates = [(player['x'], player['y']) for player in players]
        print(f"Extracted coordinates: {coordinates}")

        for i, player in enumerate(players):
            processed_frames.append({
                'Shot_id': shot_id,
                'player_num': i,
                'x': player['x'],
                'y': player['y'],
                'position': player['position'],
                'teammate': player['teammate']
            })

    # Create DataFrame from processed data
    if processed_frames:
        # An explicit schema keeps the column types fixed even if 'teammate'
        # is None for every row, where type inference would have nothing to
        # work with
        frame_schema = ("Shot_id string, player_num bigint, position string, "
                        "teammate boolean, x double, y double")
        frame_df = spark.createDataFrame(processed_frames, schema=frame_schema)
        print("\n✅ Processed freeze frame data:")
        frame_df.show()
        return frame_df
    else:
        print("⚠️  No freeze frame data to process")
        return None

# Run the demonstration
freeze_frame_df = demonstrate_freeze_frame_processing(df_with_distance)

🎬 FREEZE FRAME DATA PROCESSING
Shots with freeze frame data:
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|id      |shot_freeze_frame                                                                                                                                      |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|shot_001|{'location': [25.0, 35.0]}, {'position': 'Center Back', 'teammate': True}, {'location': [28.0, 42.0]}, {'position': 'Right Back', 'teammate': False}   |
|shot_003|{'location': [110.0, 38.0]}, {'position': 'Center Back', 'teammate': False}, {'location': [116.0, 44.0]}, {'position': 'Right Back', 'teammate': False}|
+--------+------------------------------------------------------------------------------------------------------------------

In [10]:
# ================================================================================================
# CELL 9: Dummy Variable Creation
# ================================================================================================

def demonstrate_dummy_creation(df):
    """Expand ``shot_type`` and ``shot_outcome`` into 0/1 indicator columns.

    One indicator column is added per entry of ``DUMMIES`` for each of the two
    categorical columns, followed by two extra indicators: ``goal`` (outcome
    is 'Goal') and ``header`` (body part is 'Head'). Rows where the source
    value is null get 0 everywhere.

    Returns:
        The input DataFrame with all indicator columns appended.
    """
    def _one_hot(frame, source_col):
        # Append one indicator per category configured for `source_col`
        for value, dummy_col in DUMMIES[source_col].items():
            indicator = when(col(source_col) == value, 1).otherwise(0)
            frame = frame.withColumn(dummy_col, indicator)
            print(f"  ✅ Created {dummy_col} for {value}")
        return frame

    print("🎛️ DUMMY VARIABLE CREATION")
    print("=" * 50)

    print("📊 Original categorical data:")
    df.select("id", "shot_type", "shot_outcome").show()

    print("\n🔧 Creating dummy variables for shot_type...")
    df_with_dummies = _one_hot(df, 'shot_type')

    print("\n🔧 Creating dummy variables for shot_outcome...")
    df_with_dummies = _one_hot(df_with_dummies, 'shot_outcome')

    # Two indicators that are not driven by the DUMMIES table
    is_goal = when(col('shot_outcome') == 'Goal', 1).otherwise(0)
    is_header = when(col('shot_body_part') == 'Head', 1).otherwise(0)
    df_with_dummies = df_with_dummies.withColumn('goal', is_goal)
    df_with_dummies = df_with_dummies.withColumn('header', is_header)

    print("\n✅ Result with dummy variables:")
    dummy_cols = ['id', 'open_play', 'free_kick', 'corner', 'pk_type', 'goal', 'header']
    df_with_dummies.select(dummy_cols).show()

    return df_with_dummies

# Run the demonstration
df_with_dummies = demonstrate_dummy_creation(df_with_distance)

🎛️ DUMMY VARIABLE CREATION
📊 Original categorical data:
+--------+---------+------------+
|      id|shot_type|shot_outcome|
+--------+---------+------------+
|shot_001|Open Play|        Goal|
|shot_002|  Penalty|       Saved|
|shot_003|   Corner|       Off T|
|pass_001|     NULL|        NULL|
+--------+---------+------------+


🔧 Creating dummy variables for shot_type...
  ✅ Created open_play for Open Play
  ✅ Created free_kick for Free Kick
  ✅ Created corner for Corner
  ✅ Created pk_type for Penalty

🔧 Creating dummy variables for shot_outcome...
  ✅ Created goal_outcome for Goal
  ✅ Created saved for Saved
  ✅ Created blocked for Blocked
  ✅ Created off_target for Off T

✅ Result with dummy variables:
+--------+---------+---------+------+-------+----+------+
|      id|open_play|free_kick|corner|pk_type|goal|header|
+--------+---------+---------+------+-------+----+------+
|shot_001|        1|        0|     0|      0|   1|     0|
|shot_002|        0|        0|     0|      1|   0|   

In [11]:
# ================================================================================================
# CELL 10: Boolean to Integer Conversion
# ================================================================================================

def demonstrate_bool_to_int(df):
    """Turn ``under_pressure`` and ``shot_one_on_one`` into 0/1 integers.

    Nulls become 0. Afterwards ``shot_one_on_one`` is forced to 1 on every
    row whose ``pk_type`` indicator is 1, so the input must already contain
    the ``pk_type`` column.

    Returns:
        The DataFrame with both columns replaced by their integer versions.
    """
    print("🔄 BOOLEAN TO INTEGER CONVERSION")
    print("=" * 50)

    print("📊 Original boolean data:")
    df.select("id", "under_pressure", "shot_one_on_one").show()

    print("\n🔧 Converting boolean columns to integers...")
    df_converted = df

    for col_name in ['under_pressure', 'shot_one_on_one']:
        flag = col(col_name)
        # Present values are cast, missing ones default to 0
        as_int = when(flag.isNotNull(), flag.cast('int')).otherwise(0)
        df_converted = df_converted.withColumn(col_name, as_int)
        print(f"  ✅ Converted {col_name} to integer")

    # Special case: rows flagged as penalties always count as one-on-one
    one_on_one = when(col('pk_type') == 1, 1).otherwise(col('shot_one_on_one'))
    df_converted = df_converted.withColumn('shot_one_on_one', one_on_one)

    print("\n✅ Result after conversion:")
    df_converted.select("id", "under_pressure", "shot_one_on_one", "pk_type").show()

    return df_converted

# Run the demonstration
df_final = demonstrate_bool_to_int(df_with_dummies)

🔄 BOOLEAN TO INTEGER CONVERSION
📊 Original boolean data:
+--------+--------------+---------------+
|      id|under_pressure|shot_one_on_one|
+--------+--------------+---------------+
|shot_001|          true|          false|
|shot_002|         false|           true|
|shot_003|          true|          false|
|pass_001|          true|           NULL|
+--------+--------------+---------------+


🔧 Converting boolean columns to integers...
  ✅ Converted under_pressure to integer
  ✅ Converted shot_one_on_one to integer

✅ Result after conversion:
+--------+--------------+---------------+-------+
|      id|under_pressure|shot_one_on_one|pk_type|
+--------+--------------+---------------+-------+
|shot_001|             1|              0|      0|
|shot_002|             0|              1|      1|
|shot_003|             1|              0|      0|
|pass_001|             1|              0|      0|
+--------+--------------+---------------+-------+



In [12]:
# ================================================================================================
# CELL 11: Complete Pipeline Simulation
# ================================================================================================

def run_complete_pipeline_simulation():
    """Run the complete preprocessing pipeline

    Works on the notebook-level DataFrames rather than on arguments: ``df``
    is only used to report the starting record count, and ``df_final`` (the
    frame that already carries coordinates, distance, dummy and integer
    columns) is the input that gets extended. Several steps are deliberately
    simplified placeholders, as marked inline.

    Returns:
        DataFrame restricted to rows of type 'Shot' and to the columns of
        ``VARIABLES`` that exist in the processed frame.
    """
    print("🚀 COMPLETE PIPELINE SIMULATION")
    print("=" * 70)

    print("📋 Pipeline Steps:")
    steps = [
        "1. Initialize data and filter",
        "2. Split location coordinates",
        "3. Calculate distance to goal",
        "4. Calculate shot angles",
        "5. Determine preferred foot",
        "6. Process freeze frame data",
        "7. Create dummy variables",
        "8. Convert booleans to integers",
        "9. Final data preparation"
    ]

    for step in steps:
        print(f"  {step}")

    print("\n" + "=" * 70)

    # Simulate the complete pipeline
    print("🎯 STEP 1: Data Initialization")
    print(f"✅ Starting with {df.count()} records")

    print("\n🎯 STEP 2-4: Spatial Data Processing")
    processed_df = df_final

    def _angle_or_zero(x, y):
        # Only a genuinely missing coordinate falls back to 0.0. A plain
        # truthiness test would also discard the valid coordinate 0.0.
        if x is None or y is None:
            return 0.0
        return PreprocessingDemo.shot_angle(x, y)

    # Add shot angle (using UDF simulation)
    shot_angle_udf = udf(_angle_or_zero, FloatType())
    processed_df = processed_df.withColumn("shot_angle", shot_angle_udf(col("shot_location_x"), col("shot_location_y")))

    print("✅ Spatial features added")

    print("\n🎯 STEP 5: Preferred Foot Analysis")
    # Add preferred foot (simplified: constant 1 for every row)
    processed_df = processed_df.withColumn("preferred_foot_shot", lit(1))  # Simplified

    print("✅ Preferred foot analysis completed")

    print("\n🎯 STEP 6: Freeze Frame Processing")
    # Add players in area (simplified: 1 for penalties, 2 otherwise)
    processed_df = processed_df.withColumn("players_inside_area",
                                          when(col("shot_type") == "Penalty", 1).otherwise(2))

    print("✅ Freeze frame data processed")

    print("\n🎯 STEP 7-8: Feature Engineering")
    # Add assist data (simplified): null pass attributes get fixed fill-in
    # values, and any non-null pass_height collapses to 2
    processed_df = processed_df.withColumn("assisted",
                                          when(col("pass_assisted_shot_id").isNotNull(), 1).otherwise(0)) \
                               .withColumn("pass_height",
                                          when(col("pass_height").isNull(), -1).otherwise(2)) \
                               .withColumn("pass_angle",
                                          when(col("pass_angle").isNull(), 4).otherwise(col("pass_angle"))) \
                               .withColumn("pass_length",
                                          when(col("pass_length").isNull(), 0).otherwise(col("pass_length")))

    # Add StatsBomb prediction conversion: xG of at least 0.5 counts as a
    # predicted goal
    processed_df = processed_df.withColumn("sb_prediction",
                                          when(col("shot_statsbomb_xg") >= 0.5, 1).otherwise(0))

    print("✅ All features engineered")

    print("\n🎯 STEP 9: Final Data Preparation")
    # Filter to shots only and select final variables
    final_df = processed_df.filter(col("type") == "Shot")

    # Select only the variables we need (skipping any that were never built)
    available_vars = [var for var in VARIABLES if var in final_df.columns]
    final_df = final_df.select(available_vars)

    print(f"✅ Final dataset ready with {final_df.count()} records and {len(final_df.columns)} features")

    print("\n📊 FINAL DATASET PREVIEW:")
    final_df.show(truncate=False)

    return final_df

# Run the complete simulation
final_processed_df = run_complete_pipeline_simulation()

🚀 COMPLETE PIPELINE SIMULATION
📋 Pipeline Steps:
  1. Initialize data and filter
  2. Split location coordinates
  3. Calculate distance to goal
  4. Calculate shot angles
  5. Determine preferred foot
  6. Process freeze frame data
  7. Create dummy variables
  8. Convert booleans to integers
  9. Final data preparation

🎯 STEP 1: Data Initialization
✅ Starting with 4 records

🎯 STEP 2-4: Spatial Data Processing
✅ Spatial features added

🎯 STEP 5: Preferred Foot Analysis
✅ Preferred foot analysis completed

🎯 STEP 6: Freeze Frame Processing
✅ Freeze frame data processed

🎯 STEP 7-8: Feature Engineering
✅ All features engineered

🎯 STEP 9: Final Data Preparation
✅ Final dataset ready with 3 records and 18 features

📊 FINAL DATASET PREVIEW:


  🎯 Calculating shot angle for position (30.0, 40.0)                           
  📍 Goal posts at (120, 36) and (120, 44)
  📐 Vector to lower post: (90.0, -4.0)
  📐 Vector to upper post: (90.0, 4.0)
  🔢 Dot product: 8084.00
  🔢 Magnitudes: 90.09, 90.09
  📊 Final angle: 5.09°


+--------+----+----------------+----------+------+-------------------+-------------------+--------+-----------+----------+-----------+---------+---------+------+-------+--------------+---------------+-------------+
|id      |goal|distance_to_goal|shot_angle|header|preferred_foot_shot|players_inside_area|assisted|pass_height|pass_angle|pass_length|open_play|free_kick|corner|pk_type|under_pressure|shot_one_on_one|sb_prediction|
+--------+----+----------------+----------+------+-------------------+-------------------+--------+-----------+----------+-----------+---------+---------+------+-------+--------------+---------------+-------------+
|shot_001|1   |90.0            |5.0896087 |0     |1                  |2                  |0       |-1         |4.0       |0.0        |1        |0        |0     |0      |1             |0              |1            |
|shot_002|0   |12.0            |36.869896 |0     |1                  |1                  |0       |-1         |4.0       |0.0        |0     

  🎯 Calculating shot angle for position (114.0, 42.0)
  📍 Goal posts at (120, 36) and (120, 44)
  📐 Vector to lower post: (6.0, -6.0)
  📐 Vector to upper post: (6.0, 2.0)
  🔢 Dot product: 24.00
  🔢 Magnitudes: 8.49, 6.32
  📊 Final angle: 63.43°
  🎯 Calculating shot angle for position (108.0, 40.0)
  📍 Goal posts at (120, 36) and (120, 44)
  📐 Vector to lower post: (12.0, -4.0)
  📐 Vector to upper post: (12.0, 4.0)
  🔢 Dot product: 128.00
  🔢 Magnitudes: 12.65, 12.65
  📊 Final angle: 36.87°
