In [1]:
import gandula
from tqdm.auto import tqdm
import os
import sys
import pandas as pd
import numpy as np
from multiprocessing import Pool

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code are picked up
# without restarting the kernel.
# NOTE(review): re-running this cell prints an "already loaded" notice (see the
# recorded output); `%reload_ext autoreload` would avoid it.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def find_project_root(start, marker=os.path.join("src", "data")):
    """Return the closest ancestor of ``start`` (``start`` included) containing ``marker``.

    Parameters
    ----------
    start : str
        Directory from which the upward search begins.
    marker : str
        Relative directory that identifies the project root. Defaults to
        ``src/data`` because the notebook imports ``src.data.loader``.

    Returns
    -------
    str or None
        Absolute path of the first matching ancestor, or ``None`` when the
        filesystem root is reached without finding ``marker``.
    """
    current = os.path.abspath(start)
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # dirname() is a fixed point at the filesystem root
            return None
        current = parent


# Machine-specific location used originally; kept only as a fallback so the
# notebook still works on that machine when the upward search finds nothing.
LEGACY_PROJECT_ROOT = "C:/Users/jllgo/OneDrive/Documentos/UFMG/MSc & MSI 2/MatchSegmentation"

# Prefer a root discovered from the working directory so the notebook is portable.
project_root = find_project_root(os.getcwd()) or os.path.abspath(LEGACY_PROJECT_ROOT)

# Make the project's `src` package importable, without duplicating the entry on re-runs.
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from src.data.loader import FramesLoader

In [5]:
# Directory whose entries are listed to obtain the game ids and which is handed to FramesLoader.
# The path is relative, so it resolves against the notebook's working directory.
DATA_PATH = '../data/raw/PL-22-23'
# Directory passed to `FramesLoader.load(path=...)`.
# NOTE(review): presumably where intermediate results are written/cached — confirm in the loader.
output_path = "../data/intermediate/PL-22-23"

In [6]:
# Number of games loaded for this exploration; raise it to cover more of the dataset.
MAX_GAMES = 5

# os.listdir returns entries in arbitrary order, so sort them to make the
# selected subset reproducible across runs and machines.
games = sorted(os.listdir(DATA_PATH))

# A game id is the file name up to its first dot.
#  - the truthiness filter skips dot-files (e.g. ".DS_Store"), whose id would be "";
#  - dict.fromkeys drops repeated ids (several files for one game) while keeping sorted order.
game_ids = list(
    dict.fromkeys(
        game_id
        for game_id in (game.split('.')[0] for game in games)
        if game_id
    )
)[:MAX_GAMES]

In [7]:
# Create the project's FramesLoader for the selected game ids and the raw data directory,
# then trigger loading. Later cells read the result from `frames_loader.frames`.
# NOTE(review): `path=output_path` presumably tells the loader where to write or reuse
# intermediate files — confirm against src.data.loader.FramesLoader.load.
frames_loader = FramesLoader(game_ids, DATA_PATH)
frames_loader.load(path=output_path)

Loading Games:   0%|          | 0/5 [00:00<?, ?it/s]

# Data Exploration

In [8]:
# Take element 0 of every entry in `frames_loader.frames` and stack them into a
# single table with a fresh 0..n-1 index, then summarise columns and dtypes.
full_parts = [loaded_entry[0] for loaded_entry in frames_loader.frames]
metadata_full_df = pd.concat(full_parts, ignore_index=True)
metadata_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840213 entries, 0 to 840212
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   match_id                840213 non-null  int64  
 1   frame_id                840213 non-null  int64  
 2   period                  840213 non-null  int64  
 3   elapsed_seconds         840213 non-null  float64
 4   home_has_possession     492053 non-null  float64
 5   event_id                193176 non-null  float64
 6   event_type              840213 non-null  object 
 7   event_setpiece_type     840213 non-null  object 
 8   event_player_id         185032 non-null  float64
 9   event_team_id           185074 non-null  float64
 10  event_start_frame       193176 non-null  float64
 11  event_end_frame         193176 non-null  float64
 12  possession_id           9934 non-null    float64
 13  possession_type         840213 non-null  object 
 14  possession_start_fra

In [9]:
# Report how many rows the full metadata table holds (one row counted as one frame).
total_frames_full = len(metadata_full_df)
print(f"Total number of frames available: {total_frames_full}")

Total number of frames available: 840213


In [10]:
# Take element 1 of every entry in `frames_loader.frames` and stack them into a
# single table with a fresh 0..n-1 index, then summarise columns and dtypes.
reduced_parts = [loaded_entry[1] for loaded_entry in frames_loader.frames]
metadata_reduced_df = pd.concat(reduced_parts, ignore_index=True)
metadata_reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191223 entries, 0 to 191222
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   match_id                191223 non-null  int64  
 1   frame_id                191223 non-null  int64  
 2   period                  191223 non-null  int64  
 3   elapsed_seconds         191223 non-null  float64
 4   home_has_possession     191191 non-null  float64
 5   event_id                190684 non-null  float64
 6   event_type              191223 non-null  object 
 7   event_setpiece_type     0 non-null       object 
 8   event_player_id         184527 non-null  float64
 9   event_team_id           184537 non-null  float64
 10  event_start_frame       190684 non-null  float64
 11  event_end_frame         190684 non-null  float64
 12  possession_id           9385 non-null    float64
 13  possession_type         191223 non-null  object 
 14  possession_start_fra

In [11]:
# Report how many rows the reduced metadata table holds (one row counted as one frame).
total_frames_reduced = len(metadata_reduced_df)
print(f"Total number of frames available: {total_frames_reduced}")

Total number of frames available: 191223


# Filter Invalid Frames

In [12]:
# Order the reduced metadata by match and then by frame, renumbering rows 0..n-1.
# `metadata_reduced_df` itself is left untouched.
sort_keys = ["match_id", "frame_id"]
df = metadata_reduced_df.sort_values(by=sort_keys)
df = df.reset_index(drop=True)

In [13]:
# Sanity check: select the event_id values of the rows where event_id is missing
# and list their distinct values (the recorded output shows only NaN).
missing_event_mask = df['event_id'].isna()
df.loc[missing_event_mask, 'event_id'].unique()

array([nan])

In [14]:
# Distinct values of event_setpiece_type in the reduced metadata; the recorded output
# shows only None, matching the "0 non-null" entry reported by .info() for this column.
metadata_reduced_df['event_setpiece_type'].unique()

array([None], dtype=object)

## Events

In [27]:
# Number of rows per possession_type among the rows whose 'sequence' value equals 5.
# NOTE(review): this cell carries execution count In [27] while its neighbours are
# In [14] and In [15], so it was run out of order. No visible cell creates 'sequence';
# if a later cell adds it, this line fails on Restart & Run All — confirm where the
# column comes from and move this cell after it.
metadata_reduced_df[metadata_reduced_df['sequence']==5].groupby('possession_type').size()

possession_type
BALL_CARRY      1
CHALLENGE       2
PASS           13
REBOUND         4
SHOT            2
nan           339
dtype: int64

In [15]:
# Distinct event_id values present in the reduced metadata.
metadata_reduced_df['event_id'].unique()

array([4435314., 4435318., 4435325., ..., 4475754., 4475758., 4475798.])

In [20]:
# Distinct event_type labels in the reduced metadata. The recorded output contains the
# string 'nan' as a label, so missing event types are stored as text, not as NaN.
# NOTE(review): execution count In [20] is out of sequence with the neighbouring cells.
metadata_reduced_df['event_type'].unique()


array(['ON_THE_BALL', 'VIDEO_MISSING', 'nan', 'PBC_IN_PLAY', 'PLAYER_ON'],
      dtype=object)

In [16]:
# Distinct possession_type labels in the reduced metadata. The recorded output includes
# the string 'nan', which the counting cell below filters out explicitly.
metadata_reduced_df['possession_type'].unique()


array(['nan', 'PASS', 'CHALLENGE', 'REBOUND', 'SHOT', 'BALL_CARRY',
       'CROSS', 'CLEARANCE'], dtype=object)

In [17]:
# Count distinct possession ids per possession type in the reduced metadata,
# ignoring rows labelled with the literal string 'nan', and show the largest first.
has_possession_label = metadata_reduced_df['possession_type'].ne('nan')
labelled_rows = metadata_reduced_df.loc[has_possession_label]
events_count = labelled_rows.groupby('possession_type')['possession_id'].nunique()
events_count.sort_values(ascending=False)

possession_type
PASS          4476
CHALLENGE      932
CLEARANCE      256
REBOUND        198
BALL_CARRY     159
CROSS          150
SHOT           114
Name: possession_id, dtype: int64

In [18]:
# Distinct possession_type labels in the full metadata, for comparison with the
# reduced table; the recorded output again includes the string 'nan'.
metadata_full_df['possession_type'].unique()

array(['PASS', 'nan', 'CHALLENGE', 'REBOUND', 'SHOT', 'BALL_CARRY',
       'CROSS', 'CLEARANCE'], dtype=object)

In [19]:
# Count distinct possession ids per possession type in the full metadata,
# ignoring rows labelled with the literal string 'nan', and show the largest first.
# Note: this rebinds `events_count`, replacing the reduced-table counts computed earlier.
has_possession_label_full = metadata_full_df['possession_type'].ne('nan')
labelled_rows_full = metadata_full_df.loc[has_possession_label_full]
events_count = labelled_rows_full.groupby('possession_type')['possession_id'].nunique()
events_count.sort_values(ascending=False)

possession_type
PASS          4887
CHALLENGE      934
CLEARANCE      259
CROSS          206
REBOUND        198
BALL_CARRY     159
SHOT           117
Name: possession_id, dtype: int64