In [1]:
import gandula
from tqdm.auto import tqdm
import os
import sys
import pandas as pd
import numpy as np
from multiprocessing import Pool

In [2]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Make the project's `src` package importable from this notebook.
#
# The project root is resolved in this order:
#   1. the MATCHSEGMENTATION_ROOT environment variable, when set;
#   2. the original author's local checkout, when that directory exists
#      (keeps the previous behaviour on that machine);
#   3. the parent of the current working directory.
#      NOTE(review): option 3 assumes the notebook is run from a sub-folder of
#      the project root, as the '../data/...' paths used in this notebook
#      suggest -- confirm against the repository layout.
_LOCAL_CHECKOUT = "C:/Users/jllgo/OneDrive/Documentos/UFMG/MSc & MSI 2/MatchSegmentation"

project_root = os.environ.get("MATCHSEGMENTATION_ROOT")
if not project_root:
    project_root = _LOCAL_CHECKOUT if os.path.isdir(_LOCAL_CHECKOUT) else os.pardir
project_root = os.path.abspath(project_root)

# Append only once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from src.data.loader import FramesLoader

In [5]:
# Input/output folders, both relative to the notebook's working directory.
# DATA_PATH is listed with os.listdir and handed to FramesLoader;
# output_path is passed to frames_loader.load(path=...).
DATA_PATH = "../data/raw/PL-22-23"
output_path = "../data/intermediate/PL-22-23"

In [6]:
# Collect the raw entries found in DATA_PATH.
# os.listdir returns names in arbitrary order, so sort them to make the load
# order (and therefore the row order of the concatenated frames) reproducible.
# Dot-files (e.g. '.DS_Store', '.gitkeep') are skipped: their name has nothing
# before the first '.', which would otherwise yield an empty game id.
games = sorted(name for name in os.listdir(DATA_PATH) if not name.startswith('.'))

# The game id is the part of each name that precedes the first '.'.
game_ids = [game.split('.')[0] for game in games]

In [7]:
# Build a loader over every game id discovered in DATA_PATH.
frames_loader = FramesLoader(game_ids, DATA_PATH)
# Run the load step. After this call `frames_loader.frames` is consumed by the
# cells below as an iterable of tuples whose items [0] and [1] are DataFrames.
# NOTE(review): `path` is presumably where intermediate results are written or
# cached -- confirm against src.data.loader.FramesLoader.load.
frames_loader.load(path=output_path)

Processing possessions:   0%|          | 0/190 [00:00<?, ?it/s]

# Data Exploration

In [8]:
# Stack the first DataFrame of every tuple in `frames_loader.frames` into one
# table. `ignore_index=True` gives the result a fresh 0..n-1 RangeIndex
# directly, which is equivalent to concat + reset_index(drop=True) but avoids
# materialising a second full copy of a multi-gigabyte frame.
metadata_full_df = pd.concat(
    [frame_tuple[0] for frame_tuple in frames_loader.frames],
    ignore_index=True,
)
# Print column dtypes and memory footprint.
metadata_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33209251 entries, 0 to 33209250
Data columns (total 19 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   match_id                int64  
 1   frame_id                int64  
 2   period                  int64  
 3   elapsed_seconds         float64
 4   home_has_possession     float64
 5   event_id                float64
 6   event_type              object 
 7   event_setpiece_type     object 
 8   event_player_id         float64
 9   event_team_id           float64
 10  event_start_frame       float64
 11  event_end_frame         float64
 12  possession_id           float64
 13  possession_type         object 
 14  possession_start_frame  float64
 15  possession_end_frame    float64
 16  sequence                float64
 17  version                 object 
 18  video_time_milli        float64
dtypes: float64(12), int64(3), object(4)
memory usage: 4.7+ GB


In [9]:
# Report how many rows the full metadata table holds.
print(f"Total number of frames available: {metadata_full_df.shape[0]}")

Total number of frames available: 33209251


In [10]:
# Stack the second DataFrame of every tuple in `frames_loader.frames` into one
# table. `ignore_index=True` gives the result a fresh 0..n-1 RangeIndex
# directly, which is equivalent to concat + reset_index(drop=True) but avoids
# an extra full copy of the concatenated frame.
metadata_reduced_df = pd.concat(
    [frame_tuple[1] for frame_tuple in frames_loader.frames],
    ignore_index=True,
)
# Print column dtypes and memory footprint.
metadata_reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5939050 entries, 0 to 5939049
Data columns (total 19 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   match_id                int64  
 1   frame_id                float64
 2   period                  int64  
 3   elapsed_seconds         float64
 4   home_has_possession     float64
 5   event_id                float64
 6   event_type              object 
 7   event_setpiece_type     object 
 8   event_player_id         float64
 9   event_team_id           float64
 10  event_start_frame       float64
 11  event_end_frame         float64
 12  possession_id           float64
 13  possession_type         object 
 14  possession_start_frame  float64
 15  possession_end_frame    float64
 16  sequence                float64
 17  version                 object 
 18  video_time_milli        float64
dtypes: float64(13), int64(2), object(4)
memory usage: 860.9+ MB


In [11]:
# Report how many rows the reduced metadata table holds.
print(f"Total number of frames available: {metadata_reduced_df.shape[0]}")

Total number of frames available: 5939050


# Filter Invalid Frames

In [12]:
# Reduced metadata ordered by match, then by frame, with a fresh 0..n-1 index.
df = (
    metadata_reduced_df
    .sort_values(["match_id", "frame_id"])
    .reset_index(drop=True)
)

In [13]:
# Rows with no `home_has_possession` value that are also missing an event id
# or a possession id (or both).
metadata_reduced_df.loc[
    metadata_reduced_df['home_has_possession'].isna()
    & metadata_reduced_df[['event_id', 'possession_id']].isna().any(axis=1)
]

Unnamed: 0,match_id,frame_id,period,elapsed_seconds,home_has_possession,event_id,event_type,event_setpiece_type,event_player_id,event_team_id,event_start_frame,event_end_frame,possession_id,possession_type,possession_start_frame,possession_end_frame,sequence,version,video_time_milli
93,4436,2090.0,1,16.549403,,4427708.0,OUT_OF_PLAY,,,,2090.0,2090.0,,,,,,4.1.0,69736.403
94,4436,2092.0,1,16.616136,,,,,,,,,,,,,,4.1.0,69803.136
95,4436,2098.0,1,16.816337,,,,,,,,,,,,,,4.1.0,70003.337
96,4436,2104.0,1,17.016537,,,,,,,,,,,,,,4.1.0,70203.537
97,4436,2110.0,1,17.216737,,,,,,,,,,,,,,4.1.0,70403.737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5939045,4625,184536.0,2,3015.081357,,,,,,,,,,,,,,4.1.0,6157357.357
5939046,4625,184542.0,2,3015.281558,,,,,,,,,,,,,,4.1.0,6157557.558
5939047,4625,184548.0,2,3015.481758,,,,,,,,,,,,,,4.1.0,6157757.758
5939048,4625,184554.0,2,3015.681958,,,,,,,,,,,,,,4.1.0,6157957.958


In [14]:
# Rows whose `event_setpiece_type` is neither of the strings 'nan' / 'None'
# (presumably the placeholders this column uses for "no set piece" -- the
# distinct values can be checked with .unique() on the column).
metadata_reduced_df.loc[
    lambda frames: ~frames['event_setpiece_type'].isin(['nan', 'None'])
]

Unnamed: 0,match_id,frame_id,period,elapsed_seconds,home_has_possession,event_id,event_type,event_setpiece_type,event_player_id,event_team_id,event_start_frame,event_end_frame,possession_id,possession_type,possession_start_frame,possession_end_frame,sequence,version,video_time_milli
0,4436,1594.0,1,0.000000,1.0,4427560.0,FIRST_HALF_KICKOFF,KICK_OFF,1940.0,7.0,1594.0,1594.0,4245025.0,PASS,1594.0,1594.0,1.0,4.1.0,53186.520
1,4436,1594.0,1,0.000000,1.0,4427560.0,FIRST_HALF_KICKOFF,KICK_OFF,1940.0,7.0,1594.0,1594.0,4245025.0,PASS,1594.0,1594.0,1.0,4.1.0,53186.520
224,4436,2871.0,1,42.608796,0.0,4427712.0,ON_THE_BALL,THROW_IN,1916.0,2.0,2871.0,2871.0,4245186.0,PASS,2871.0,2871.0,2.0,4.1.0,95795.796
429,4436,3922.0,1,77.677198,1.0,4427748.0,ON_THE_BALL,THROW_IN,581.0,7.0,3922.0,3922.0,4245219.0,PASS,3922.0,3922.0,6.0,4.1.0,130864.198
803,4436,5872.0,1,142.742263,0.0,4427819.0,ON_THE_BALL,FREE_KICK,163.0,2.0,5872.0,5872.0,4245288.0,PASS,5872.0,5872.0,10.0,4.1.0,195929.263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5937873,4625,177868.0,2,2792.592202,0.0,6919051.0,ON_THE_BALL,THROW_IN,72.0,221.0,177868.0,177868.0,6809385.0,PASS,177868.0,177868.0,385.0,4.1.0,5934868.202
5938029,4625,178737.0,2,2821.587864,1.0,6919092.0,ON_THE_BALL,THROW_IN,396.0,16.0,178737.0,178737.0,6809423.0,PASS,178737.0,178737.0,387.0,4.1.0,5963863.864
5938175,4625,179513.0,2,2847.480423,1.0,6919141.0,ON_THE_BALL,THROW_IN,396.0,16.0,179513.0,179513.0,6809467.0,PASS,179513.0,179513.0,388.0,4.1.0,5989756.423
5938734,4625,182716.0,2,2954.353963,1.0,6919242.0,ON_THE_BALL,DROP_BALL,3972.0,16.0,182716.0,182716.0,6809573.0,PASS,182716.0,182716.0,395.0,4.1.0,6096629.963


## Events

In [15]:
# Distinct event ids in the reduced metadata, in order of first appearance.
pd.unique(metadata_reduced_df['event_id'])

array([4427560.,      nan, 4427697., ..., 6919281., 6919285., 6919288.])

In [16]:
# Distinct possession types in the reduced metadata, in order of first appearance.
pd.unique(metadata_reduced_df['possession_type'])


array(['PASS', 'nan', 'CHALLENGE', 'REBOUND', 'BALL_CARRY', 'SHOT',
       'CLEARANCE', 'CROSS'], dtype=object)

In [17]:
# Number of distinct possession ids per possession type in the reduced
# metadata, ignoring rows whose type is the placeholder string 'nan'.
events_count = (
    metadata_reduced_df
    .loc[metadata_reduced_df['possession_type'].ne('nan')]
    .groupby('possession_type')['possession_id']
    .nunique()
)
# Show the most frequent types first.
events_count.sort_values(ascending=False)

possession_type
PASS          179823
CHALLENGE      32901
CLEARANCE       8936
REBOUND         7719
CROSS           7530
BALL_CARRY      5526
SHOT            4926
Name: possession_id, dtype: int64

In [18]:
# Distinct possession types in the full metadata, in order of first appearance.
pd.unique(metadata_full_df['possession_type'])

array(['PASS', 'nan', 'CHALLENGE', 'REBOUND', 'BALL_CARRY', 'SHOT',
       'CLEARANCE', 'CROSS'], dtype=object)

In [19]:
# Same per-type count of distinct possession ids, computed on the full
# metadata; rows whose type is the placeholder string 'nan' are left out.
# Note that this rebinds `events_count`.
events_count = (
    metadata_full_df
    .loc[~metadata_full_df['possession_type'].eq('nan')]
    .groupby('possession_type')['possession_id']
    .nunique()
)
# Show the most frequent types first.
events_count.sort_values(ascending=False)

possession_type
PASS          179823
CHALLENGE      32901
CLEARANCE       8936
REBOUND         7719
CROSS           7530
BALL_CARRY      5526
SHOT            4926
Name: possession_id, dtype: int64