In [1]:
import pandas as pd
import ast

In [2]:
# Read every CSV export used by this notebook (all live one directory up),
# in the same order as the names on the left-hand side.
(
    events_world_cup,
    teams,
    tags2name,
    playerrank,
    players,
    matches_world_cup,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../events_World_Cup.csv',
        '../teams.csv',
        '../tags2name.csv',
        '../playerank.csv',
        '../players.csv',
        '../matches_World_Cup.csv',
    )
)

Let's drop the columns we do not need.

In [3]:
# Keep only the columns that later cells read; everything else is dropped.
events_world_cup = events_world_cup.loc[
    :,
    [
        'subEventName',
        'tags',
        'playerId',
        'matchId',
        'eventName',
        'teamId',
        'eventSec',
        'matchPeriod',
    ],
]
players = players.loc[:, ['wyId', 'shortName']]
tags2name = tags2name.loc[:, ['Tag', 'Description']]
teams = teams.loc[:, ['wyId', 'officialName', 'type']]
playerrank = playerrank.loc[:, ['playerId', 'roleCluster']]


Start filtering the data

In [4]:
# Restrict the team table to national sides.
teams = teams.loc[teams['type'].eq('national')]

# Keep the matches whose label mentions France ...
matches_world_cup = matches_world_cup.loc[
    matches_world_cup['label'].str.contains("France")
]

# ... and then only the events recorded in those matches.
events_world_cup = events_world_cup.loc[
    events_world_cup['matchId'].isin(matches_world_cup['wyId'])
]

events_world_cup

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod
18813,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H
18814,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H
18815,Throw in,[{'id': 1801}],340646,2057966,Free Kick,4418,15.608867,1H
18816,Simple pass,[{'id': 1801}],209091,2057966,Pass,4418,16.385084,1H
18817,Launch,[{'id': 1802}],340646,2057966,Pass,4418,17.214485,1H
...,...,...,...,...,...,...,...,...
101751,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H
101752,Clearance,[{'id': 1802}],3309,2058017,Others on the ball,4418,2967.926784,2H
101753,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H
101754,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H


In [5]:
# Attach the team record (official name, type) to each event through its team id.
data = events_world_cup.merge(teams, left_on='teamId', right_on='wyId')
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId,officialName,type
0,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national
1,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H,8493,Australia,national
2,Head pass,[{'id': 1802}],61425,2057966,Pass,8493,19.920463,1H,8493,Australia,national
3,High pass,[{'id': 1801}],62389,2057966,Pass,8493,26.371362,1H,8493,Australia,national
4,Air duel,"[{'id': 701}, {'id': 1802}]",16151,2057966,Duel,8493,27.942092,1H,8493,Australia,national
...,...,...,...,...,...,...,...,...,...,...,...
10838,Touch,[],69396,2058017,Others on the ball,9598,2960.803153,2H,9598,Croatia,national
10839,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H,9598,Croatia,national
10840,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H,9598,Croatia,national
10841,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H,9598,Croatia,national


In [6]:
# Attach the player's short name to each event through its player id.
data = data.merge(players, left_on='playerId', right_on='wyId')
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId_x,officialName,type,wyId_y,shortName
0,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national,238055,A. Nabbout
1,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,318.452504,1H,8493,Australia,national,238055,A. Nabbout
2,Ground loose ball duel,"[{'id': 701}, {'id': 1802}]",238055,2057966,Duel,8493,622.800480,1H,8493,Australia,national,238055,A. Nabbout
3,Head pass,[{'id': 1801}],238055,2057966,Pass,8493,1089.881235,1H,8493,Australia,national,238055,A. Nabbout
4,Air duel,"[{'id': 701}, {'id': 1802}]",238055,2057966,Duel,8493,1310.158067,1H,8493,Australia,national,238055,A. Nabbout
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10838,Simple pass,[{'id': 1801}],69411,2058017,Pass,9598,2888.451480,2H,9598,Croatia,national,69411,A. Kramari\u0107
10839,Ground attacking duel,"[{'id': 502}, {'id': 703}, {'id': 1801}]",135810,2058017,Duel,9598,2398.932619,2H,9598,Croatia,national,135810,M. Pjaca
10840,Simple pass,[{'id': 1802}],135810,2058017,Pass,9598,2400.002448,2H,9598,Croatia,national,135810,M. Pjaca
10841,Simple pass,[{'id': 1801}],135810,2058017,Pass,9598,2567.478676,2H,9598,Croatia,national,135810,M. Pjaca


In [7]:
# Merge tags

def map_tags_to_desc(tag_list):
    """Translate a list of ``{'id': ...}`` tag dicts into tag descriptions.

    Each id is looked up in the ``tags2name`` table (columns ``Tag`` and
    ``Description``). The first matching description is used; ids with no
    match are skipped. The order of ``tag_list`` is preserved.
    """
    def _descriptions_for(tag_id):
        # All descriptions registered for this id (possibly none).
        return tags2name.loc[tags2name['Tag'] == tag_id, 'Description'].values

    matches_per_tag = (_descriptions_for(tag['id']) for tag in tag_list)
    return [matches[0] for matches in matches_per_tag if len(matches) > 0]

# The CSV stores each tag list as its Python literal text: parse it first,
# then replace the tag ids with their descriptions.
data['tags'] = data['tags'].map(ast.literal_eval).map(map_tags_to_desc)
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId_x,officialName,type,wyId_y,shortName
0,Simple pass,[Accurate],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national,238055,A. Nabbout
1,Simple pass,[Accurate],238055,2057966,Pass,8493,318.452504,1H,8493,Australia,national,238055,A. Nabbout
2,Ground loose ball duel,"[Lost, Not accurate]",238055,2057966,Duel,8493,622.800480,1H,8493,Australia,national,238055,A. Nabbout
3,Head pass,[Accurate],238055,2057966,Pass,8493,1089.881235,1H,8493,Australia,national,238055,A. Nabbout
4,Air duel,"[Lost, Not accurate]",238055,2057966,Duel,8493,1310.158067,1H,8493,Australia,national,238055,A. Nabbout
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10838,Simple pass,[Accurate],69411,2058017,Pass,9598,2888.451480,2H,9598,Croatia,national,69411,A. Kramari\u0107
10839,Ground attacking duel,"[Free space left, Won, Accurate]",135810,2058017,Duel,9598,2398.932619,2H,9598,Croatia,national,135810,M. Pjaca
10840,Simple pass,[Not accurate],135810,2058017,Pass,9598,2400.002448,2H,9598,Croatia,national,135810,M. Pjaca
10841,Simple pass,[Accurate],135810,2058017,Pass,9598,2567.478676,2H,9598,Croatia,national,135810,M. Pjaca


In [8]:
# Drop the id columns, swap the match id for the human-readable match label,
# and give the remaining columns presentation names.
data = (
    data
    .drop(columns=['playerId', 'teamId', 'type', 'wyId_y', 'wyId_x'])
    .merge(
        matches_world_cup[['wyId', 'label']],
        left_on='matchId',
        right_on='wyId',
        how='left',
    )
    # Overwrite matchId with the label so the column keeps its position.
    .assign(matchId=lambda frame: frame['label'])
    .drop(columns=['label', 'wyId'])
    .rename(columns={
        'officialName': 'Team',
        'shortName': 'Player',
        'matchId': 'Match',
        'eventName': 'Event',
        'tags': 'Tags',
        'matchPeriod': 'MatchPeriod',
        'eventSec': 'EventSec',
        'subEventName': 'SubEvent',
    })
)
data


Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player
0,Simple pass,[Accurate],"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout
1,Simple pass,[Accurate],"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout
2,Ground loose ball duel,"[Lost, Not accurate]","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout
3,Head pass,[Accurate],"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout
4,Air duel,"[Lost, Not accurate]","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout
...,...,...,...,...,...,...,...,...
10838,Simple pass,[Accurate],"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107
10839,Ground attacking duel,"[Free space left, Won, Accurate]","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca
10840,Simple pass,[Not accurate],"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca
10841,Simple pass,[Accurate],"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca


In [9]:
# Flatten each list of tag descriptions into one comma-separated string;
# values that are not lists are left as they are.
data['Tags'] = data['Tags'].map(
    lambda tags: tags if not isinstance(tags, list) else ', '.join(tags)
)
data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout
1,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout
2,Ground loose ball duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout
3,Head pass,Accurate,"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout
4,Air duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout
...,...,...,...,...,...,...,...,...
10838,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107
10839,Ground attacking duel,"Free space left, Won, Accurate","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca
10840,Simple pass,Not accurate,"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca
10841,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca


In [10]:
# Number of event rows per match.
grouped = (
    data
    .groupby('Match')
    .size()
    .rename('count')
    .reset_index()
)

grouped

Unnamed: 0,Match,count
0,"Denmark - France, 0 - 0",1581
1,"France - Argentina, 4 - 3",1426
2,"France - Australia, 2 - 1",1514
3,"France - Belgium, 1 - 0",1588
4,"France - Croatia, 4 - 2",1459
5,"France - Peru, 1 - 0",1668
6,"Uruguay - France, 0 - 2",1607


In [11]:
# Every player who has at least one event recorded for France, in order of first appearance.
french_players = data.loc[data['Team'].eq('France'), 'Player'].unique().tolist()
french_players

['B. Pavard',
 'C. Tolisso',
 'R. Varane',
 'H. Lloris',
 'P. Pogba',
 'S. Umtiti',
 'A. Griezmann',
 'N. Kant\\u00e9',
 'K. Mbapp\\u00e9',
 'L. Hern\\u00e1ndez',
 'O. Demb\\u00e9l\\u00e9',
 'N. Fekir',
 'O. Giroud',
 'B. Matuidi',
 "S. N'Zonzi",
 'D. Sidib\\u00e9',
 'P. Kimpembe',
 'T. Lemar',
 'S. Mandanda',
 'B. Mendy',
 'F. Thauvin']

In [12]:
# Hand-maintained squad positions, grouped by position and then inverted into a
# player -> position lookup. Names are spelled exactly as they appear in the
# `Player` column (accents show up there as literal \uXXXX text).
player_position_mapping = {
    player: position
    for position, squad in {
        'GKP': ['H. Lloris', 'S. Mandanda'],
        'RB': ['B. Pavard', 'D. Sidibé', 'D. Sidib\\u00e9'],
        'CB': ['R. Varane', 'S. Umtiti', 'P. Kimpembe'],
        'LB': ['L. Hern\\u00e1ndez', 'B. Mendy'],
        'CDM': ['N. Kant\\u00e9'],
        'CM': ['C. Tolisso', 'P. Pogba', "S. N'Zonzi"],
        'LM': ['B. Matuidi'],
        'AM': ['A. Griezmann', 'N. Fekir'],
        'LW': ['T. Lemar'],
        'RW': ['O. Demb\\u00e9l\\u00e9', 'F. Thauvin'],
        'CF': ['K. Mbapp\\u00e9', 'O. Giroud'],
    }.items()
    for player in squad
}

# Players missing from the lookup (every non-French player) get NaN.
data['Position'] = data['Player'].map(player_position_mapping)

data.loc[data['Team'].eq('France')]

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
700,Throw in,Accurate,"France - Australia, 2 - 1",Free Kick,15.608867,1H,France,B. Pavard,RB
701,Launch,Not accurate,"France - Australia, 2 - 1",Pass,17.214485,1H,France,B. Pavard,RB
702,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,32.884277,1H,France,B. Pavard,RB
703,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,54.866585,1H,France,B. Pavard,RB
704,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,75.441589,1H,France,B. Pavard,RB
...,...,...,...,...,...,...,...,...,...
6020,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,2848.941221,2H,France,B. Mendy,LB
6021,Cross,"Left foot, Blocked, Not accurate","Denmark - France, 0 - 0",Pass,2853.514985,2H,France,B. Mendy,LB
6022,Throw in,Accurate,"Denmark - France, 0 - 0",Free Kick,2859.488457,2H,France,B. Mendy,LB
6023,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,2613.230156,2H,France,F. Thauvin,RW


In [13]:
# Events without a sub-event get an empty string instead of NaN.
data['SubEvent'] = data['SubEvent'].where(data['SubEvent'].notna(), '')

In [14]:
# Score state from France's point of view at the time of each event.
#
# Every event starts as DRAW. The rules below are then applied in order; each rule
# overwrites the status of all events of the given match and period that happen
# more than `after_sec` seconds into that period (None = the whole period).
# Thresholds are whole minutes into the period, not exact goal timestamps.
#
# Two fixes compared with the previous hand-written masks:
#   * Uruguay - France: the first-half mask was built but never applied, so the
#     end of the first half stayed DRAW although France were already ahead.
#   * France - Peru: the second half was never marked LEADING although France
#     took the lead in the first half and the match ended 1 - 0.
data["Status"] = "DRAW"

STATUS_RULES = [
    # (match label fragment, period, after_sec, status)

    # France - Australia, 2 - 1
    ('France - Australia', '2H', 13 * 60, 'LEADING'),  # France score
    ('France - Australia', '2H', 17 * 60, 'DRAW'),     # France concede
    ('France - Australia', '2H', 36 * 60, 'LEADING'),  # France score again

    # Denmark - France, 0 - 0: stays DRAW for the whole match.

    # France - Peru, 1 - 0
    ('France - Peru', '1H', 35 * 60, 'LEADING'),       # France score
    ('France - Peru', '2H', None, 'LEADING'),          # lead held for the entire second half

    # France - Argentina, 4 - 3
    ('France - Argentina', '1H', 14 * 60, 'LEADING'),  # scored
    ('France - Argentina', '1H', 42 * 60, 'DRAW'),     # conceded
    ('France - Argentina', '2H', 4 * 60, 'LOSING'),    # conceded
    ('France - Argentina', '2H', 13 * 60, 'DRAW'),     # scored
    ('France - Argentina', '2H', 20 * 60, 'LEADING'),  # scored

    # Uruguay - France, 0 - 2
    ('Uruguay - France', '1H', 40 * 60, 'LEADING'),    # scored
    ('Uruguay - France', '2H', None, 'LEADING'),       # still ahead for the entire second half

    # France - Belgium, 1 - 0
    ('France - Belgium', '2H', 7 * 60, 'LEADING'),     # scored

    # France - Croatia, 4 - 2
    ('France - Croatia', '1H', 20 * 60, 'LEADING'),    # scored
    ('France - Croatia', '1H', 29 * 60, 'DRAW'),       # conceded
    ('France - Croatia', '1H', 39 * 60, 'LEADING'),    # scored
    ('France - Croatia', '2H', None, 'LEADING'),       # ahead for the entire second half
]


def _apply_status_rules(frame, rules):
    """Overwrite ``frame['Status']`` in place, applying ``rules`` in list order.

    Each rule is ``(match_fragment, period, after_sec, status)``. A rule selects
    the rows whose ``Match`` contains ``match_fragment``, whose ``MatchPeriod``
    equals ``period`` and, unless ``after_sec`` is None, whose ``EventSec`` is
    greater than ``after_sec``. Later rules win over earlier ones.
    """
    for match_fragment, period, after_sec, status in rules:
        mask = (
            frame['Match'].str.contains(match_fragment, na=False)
            & (frame['MatchPeriod'] == period)
        )
        if after_sec is not None:
            mask &= frame['EventSec'] > after_sec
        frame.loc[mask, 'Status'] = status


_apply_status_rules(data, STATUS_RULES)

### Events before goals by given minute

In [15]:
def events_before_goal_by_minutes(data, opponent_team, minutes=1):
    """Return the events in the ``minutes`` leading up to each goal scored by France.

    Parameters
    ----------
    data : pandas.DataFrame
        Event table with at least the columns ``Match``, ``Tags`` (comma-separated
        tag descriptions), ``Team``, ``EventSec`` and ``MatchPeriod``.
    opponent_team : str
        Pattern passed to ``Series.str.contains`` on the ``Match`` column to pick
        the match (for example ``'Argentina'``).
    minutes : int or float, default 1
        Length of the window before each goal, in minutes.

    Returns
    -------
    pandas.DataFrame
        All events (from both teams) of the selected match that fall, within the
        goal's own period, between ``goal time - minutes`` (clamped at 0) and the
        goal itself, inclusive. Sorted by ``MatchPeriod`` and ``EventSec``. Each
        event appears once even when the windows of two goals overlap. An empty
        frame with the columns of ``data`` is returned when France has no goal.

    Notes
    -----
    A goal is a France event whose tags include the exact tag ``Goal`` (anywhere
    in the list, so ``Position: Goal ...`` tags do not count) together with
    ``Accurate``.
    """
    # Loop-invariant: rows belonging to the requested match.
    in_match = data['Match'].str.contains(opponent_team, na=False)

    is_france_goal = (
        in_match
        # "Goal" as a whole tag: at the start or after a comma, followed by a comma or the end.
        & data['Tags'].str.contains(r'(?:^|,\s*)Goal(?:\s*,|$)', na=False)
        & data['Tags'].str.contains('Accurate', na=False)
        & (data['Team'] == 'France')
    )
    goal_times = data.loc[is_france_goal, ['EventSec', 'MatchPeriod']]

    if goal_times.empty:
        # Return an empty DataFrame if no goals are found
        return pd.DataFrame(columns=data.columns)

    # Rows already emitted for an earlier goal; prevents duplicates when windows overlap.
    already_taken = pd.Series(False, index=data.index)
    windows = []

    for goal_time, goal_period in zip(goal_times['EventSec'], goal_times['MatchPeriod']):
        in_window = (
            in_match
            & (data['MatchPeriod'] == goal_period)
            & (data['EventSec'] >= max(0, goal_time - minutes * 60))
            & (data['EventSec'] <= goal_time)  # the goal itself is included
            & ~already_taken
        )
        if in_window.any():
            windows.append(data[in_window])
            already_taken |= in_window

    if not windows:
        # Only reachable when no goal has a usable (non-NaN, non-negative) timestamp.
        return pd.DataFrame(columns=data.columns)

    events_before_goal_df = pd.concat(windows, ignore_index=True)
    return events_before_goal_df.sort_values(by=['MatchPeriod', 'EventSec'])

France - Denmark

In [16]:
# Denmark - France ended 0 - 0, so no goal windows are expected here.
events_denmark = events_before_goal_by_minutes(data, opponent_team='Denmark', minutes=2)
events_denmark

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status


France - Argentina

In [17]:
# Two-minute windows before each French goal against Argentina; show France's events only.
events_argentina = events_before_goal_by_minutes(data, opponent_team='Argentina', minutes=2)
events_argentina.loc[events_argentina['Team'].eq('France')]

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
37,Penalty,"Goal, Left foot, Position: Goal low center, Ac...","France - Argentina, 4 - 3",Free Kick,761.289657,1H,France,A. Griezmann,AM,DRAW
3,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,589.283334,2H,France,R. Varane,CB,LOSING
22,Simple pass,Not accurate,"France - Argentina, 4 - 3",Pass,593.816026,2H,France,K. Mbapp\u00e9,CF,LOSING
10,Ground defending duel,"Take on right, Anticipated, Won, Accurate","France - Argentina, 4 - 3",Duel,594.475207,2H,France,P. Pogba,CM,LOSING
0,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,597.481241,2H,France,B. Pavard,RB,LOSING
4,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,600.873199,2H,France,R. Varane,CB,LOSING
11,Touch,,"France - Argentina, 4 - 3",Others on the ball,602.492381,2H,France,S. Umtiti,CB,LOSING
12,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,607.552599,2H,France,S. Umtiti,CB,LOSING
26,Simple pass,Not accurate,"France - Argentina, 4 - 3",Pass,608.396092,2H,France,O. Giroud,CF,LOSING
15,Ground loose ball duel,"Neutral, Accurate","France - Argentina, 4 - 3",Duel,611.503109,2H,France,A. Griezmann,AM,LOSING


In [18]:
# France's own events within the Argentina goal windows.
events_argentina_france = events_argentina.loc[events_argentina['Team'].eq('France')]

Uruguay - France

In [19]:
# Two-minute windows before each French goal against Uruguay (both teams' events).
events_uruguay = events_before_goal_by_minutes(data, opponent_team='Uruguay', minutes=2)
events_uruguay

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
2,Simple pass,Accurate,"Uruguay - France, 0 - 2",Pass,2251.713452,1H,France,R. Varane,CB,DRAW
5,Simple pass,Accurate,"Uruguay - France, 0 - 2",Pass,2254.41937,1H,France,P. Pogba,CM,DRAW
0,Simple pass,Accurate,"Uruguay - France, 0 - 2",Pass,2256.005491,1H,France,B. Pavard,RB,DRAW
11,Ground defending duel,"Take on right, Won, Accurate","Uruguay - France, 0 - 2",Duel,2256.804754,1H,Uruguay,D. God\u00edn,,DRAW
7,Ground attacking duel,"Take on left, Lost, Not accurate","Uruguay - France, 0 - 2",Duel,2256.816574,1H,France,K. Mbapp\u00e9,CF,DRAW
9,Touch,,"Uruguay - France, 0 - 2",Others on the ball,2258.435456,1H,Uruguay,L. Torreira,,DRAW
10,High pass,Accurate,"Uruguay - France, 0 - 2",Pass,2261.590996,1H,Uruguay,L. Torreira,,DRAW
3,Air duel,"Won, Accurate","Uruguay - France, 0 - 2",Duel,2264.57205,1H,France,R. Varane,CB,DRAW
12,Air duel,"Lost, Not accurate","Uruguay - France, 0 - 2",Duel,2264.638311,1H,Uruguay,C. Stuani,,DRAW
8,Simple pass,"Interception, Accurate","Uruguay - France, 0 - 2",Pass,2265.718192,1H,Uruguay,L. Su\u00e1rez,,DRAW


In [20]:
# France's own events within the Uruguay goal windows.
events_uruguay_france = events_uruguay.loc[events_uruguay['Team'].eq('France')]

Belgium - France

In [21]:
# Two-minute windows before each French goal against Belgium (both teams' events).
events_belgium = events_before_goal_by_minutes(data, opponent_team='Belgium', minutes=2)
events_belgium

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
30,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,217.492584,2H,Belgium,V. Kompany,,DRAW
22,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,217.780138,2H,Belgium,K. De Bruyne,,DRAW
31,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,222.226767,2H,Belgium,V. Kompany,,DRAW
29,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,227.243636,2H,Belgium,N. Chadli,,DRAW
25,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,228.502728,2H,Belgium,A. Witsel,,DRAW
19,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,231.274477,2H,Belgium,T. Alderweireld,,DRAW
32,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,233.662077,2H,Belgium,V. Kompany,,DRAW
26,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,235.986662,2H,Belgium,A. Witsel,,DRAW
18,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,239.595786,2H,Belgium,J. Vertonghen,,DRAW
33,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,242.153579,2H,Belgium,V. Kompany,,DRAW


In [22]:
# France's own events within the Belgium goal windows.
events_belgium_france = events_belgium.loc[events_belgium['Team'].eq('France')]

France - Croatia

In [23]:
# Two-minute windows before each French goal against Croatia (both teams' events).
events_croatia = events_before_goal_by_minutes(data, opponent_team='Croatia', minutes=2)
events_croatia

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
42,Penalty,"Goal, Left foot, Position: Goal low left, Accu...","France - Croatia, 4 - 2",Free Kick,2278.895744,1H,France,A. Griezmann,AM,DRAW
20,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,696.186816,2H,Croatia,M. Mand\u017euki\u0107,,LEADING
27,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,702.212839,2H,Croatia,\u0160. Vrsaljko,,LEADING
38,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,704.050069,2H,Croatia,D. Lovren,,LEADING
22,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,707.191775,2H,Croatia,M. Brozovi\u0107,,LEADING
...,...,...,...,...,...,...,...,...,...,...
61,Ground attacking duel,"Take on left, Won, Accurate","France - Croatia, 4 - 2",Duel,1168.593984,2H,France,L. Hern\u00e1ndez,LB,LEADING
72,Ground defending duel,"Take on right, Sliding tackle, Lost, Not accurate","France - Croatia, 4 - 2",Duel,1168.593984,2H,Croatia,M. Mand\u017euki\u0107,,LEADING
62,Acceleration,Accurate,"France - Croatia, 4 - 2",Others on the ball,1170.985695,2H,France,L. Hern\u00e1ndez,LB,LEADING
63,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,1174.426589,2H,France,L. Hern\u00e1ndez,LB,LEADING


In [24]:
# France's own events within the Croatia goal windows.
events_croatia_france = events_croatia.loc[events_croatia['Team'].eq('France')]

France - Australia

In [25]:
# Two-minute windows before each French goal against Australia (both teams' events).
events_australia = events_before_goal_by_minutes(data, opponent_team='Australia', minutes=2)
events_australia

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
0,Penalty,"Goal, Left foot, Position: Goal center right, ...","France - Australia, 2 - 1",Free Kick,730.009243,2H,France,A. Griezmann,AM,DRAW


In [26]:
# France's own events within the Australia goal windows.
events_australia_france = events_australia.loc[events_australia['Team'].eq('France')]

France - Peru

In [27]:
# Two-minute windows before each French goal against Peru (both teams' events).
events_peru = events_before_goal_by_minutes(data, opponent_team='Peru', minutes=2)
events_peru

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
28,Simple pass,Accurate,"France - Peru, 1 - 0",Pass,1907.657794,1H,France,L. Hern\u00e1ndez,LB,DRAW
15,Simple pass,Accurate,"France - Peru, 1 - 0",Pass,1910.006757,1H,France,N. Kant\u00e9,CDM,DRAW
5,Simple pass,Accurate,"France - Peru, 1 - 0",Pass,1912.559975,1H,France,R. Varane,CB,DRAW
0,Simple pass,Accurate,"France - Peru, 1 - 0",Pass,1913.757697,1H,France,B. Pavard,RB,DRAW
25,Simple pass,Not accurate,"France - Peru, 1 - 0",Pass,1917.304816,1H,France,K. Mbapp\u00e9,CF,DRAW
...,...,...,...,...,...,...,...,...,...,...
52,Ground loose ball duel,"Sliding tackle, Lost, Not accurate","France - Peru, 1 - 0",Duel,2019.978058,1H,Peru,P. Guerrero,,DRAW
10,Smart pass,"Key pass, Through, Accurate","France - Peru, 1 - 0",Pass,2022.354070,1H,France,P. Pogba,CM,DRAW
32,Shot,"Left foot, Blocked, Opportunity, Not accurate","France - Peru, 1 - 0",Shot,2023.580214,1H,France,O. Giroud,CF,DRAW
46,Touch,Interception,"France - Peru, 1 - 0",Others on the ball,2024.690943,1H,Peru,A. Rodr\u00edguez,,DRAW


In [28]:
# France's own events within the Peru goal windows.
events_peru_france = events_peru.loc[events_peru['Team'].eq('France')]

In [29]:
# Stack the per-opponent goal windows into one frame with a fresh 0..n-1 index.
# Denmark is left out: France did not score in that match.
all_events_before_goals = pd.concat(
    [
        events_argentina,
        events_uruguay,
        events_croatia,
        events_australia,
        events_belgium,
        events_peru,
    ],
    ignore_index=True,
)
all_events_before_goals

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
0,Penalty,"Goal, Left foot, Position: Goal low center, Ac...","France - Argentina, 4 - 3",Free Kick,761.289657,1H,France,A. Griezmann,AM,DRAW
1,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,589.283334,2H,France,R. Varane,CB,LOSING
2,Simple pass,Not accurate,"France - Argentina, 4 - 3",Pass,593.816026,2H,France,K. Mbapp\u00e9,CF,LOSING
3,Ground defending duel,"Take on right, Anticipated, Won, Accurate","France - Argentina, 4 - 3",Duel,594.475207,2H,France,P. Pogba,CM,LOSING
4,Ground attacking duel,"Take on left, Anticipation, Lost, Not accurate","France - Argentina, 4 - 3",Duel,594.837089,2H,Argentina,N. Tagliafico,,LOSING
...,...,...,...,...,...,...,...,...,...,...
343,Ground loose ball duel,"Sliding tackle, Lost, Not accurate","France - Peru, 1 - 0",Duel,2019.978058,1H,Peru,P. Guerrero,,DRAW
344,Smart pass,"Key pass, Through, Accurate","France - Peru, 1 - 0",Pass,2022.354070,1H,France,P. Pogba,CM,DRAW
345,Shot,"Left foot, Blocked, Opportunity, Not accurate","France - Peru, 1 - 0",Shot,2023.580214,1H,France,O. Giroud,CF,DRAW
346,Touch,Interception,"France - Peru, 1 - 0",Others on the ball,2024.690943,1H,Peru,A. Rodr\u00edguez,,DRAW


In [30]:
# Drop duels whose tags contain "Lost"; every other event is kept.
filtered_events = all_events_before_goals.loc[
    ~(
        all_events_before_goals['Event'].eq('Duel')
        & all_events_before_goals['Tags'].str.contains("Lost")
    )
]

filtered_events

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position,Status
0,Penalty,"Goal, Left foot, Position: Goal low center, Ac...","France - Argentina, 4 - 3",Free Kick,761.289657,1H,France,A. Griezmann,AM,DRAW
1,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,589.283334,2H,France,R. Varane,CB,LOSING
2,Simple pass,Not accurate,"France - Argentina, 4 - 3",Pass,593.816026,2H,France,K. Mbapp\u00e9,CF,LOSING
3,Ground defending duel,"Take on right, Anticipated, Won, Accurate","France - Argentina, 4 - 3",Duel,594.475207,2H,France,P. Pogba,CM,LOSING
5,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,597.481241,2H,France,B. Pavard,RB,LOSING
...,...,...,...,...,...,...,...,...,...,...
342,Ground loose ball duel,"Won, Accurate","France - Peru, 1 - 0",Duel,2019.807640,1H,France,P. Pogba,CM,DRAW
344,Smart pass,"Key pass, Through, Accurate","France - Peru, 1 - 0",Pass,2022.354070,1H,France,P. Pogba,CM,DRAW
345,Shot,"Left foot, Blocked, Opportunity, Not accurate","France - Peru, 1 - 0",Shot,2023.580214,1H,France,O. Giroud,CF,DRAW
346,Touch,Interception,"France - Peru, 1 - 0",Others on the ball,2024.690943,1H,Peru,A. Rodr\u00edguez,,DRAW


In [31]:
# Matches that contribute at least one event to the goal windows.
pd.unique(all_events_before_goals['Match'])

array(['France - Argentina, 4 - 3', 'Uruguay - France, 0 - 2',
       'France - Croatia, 4 - 2', 'France - Australia, 2 - 1',
       'France - Belgium, 1 - 0', 'France - Peru, 1 - 0'], dtype=object)

## The sub-events shot, free-kick shot, cross, free-kick cross and corner are the most interesting ones for now.

In [32]:
def sort_by_match(data):
    """Return a copy of ``data`` ordered by match, then match period, then event time."""
    ordering = ['Match', 'MatchPeriod', 'EventSec']
    return data.sort_values(by=ordering)

# Sorted views of the full and the filtered event sets.
all_events_before_goals_sorted, filtered_events_sorted = (
    sort_by_match(frame)
    for frame in (all_events_before_goals, filtered_events)
)


We have finished our data preparation. Time to create a structured XES file.

In [33]:
from datetime import datetime, timedelta

def create_events_before_goal_xes_file(data, filename="output.xes", checkOn: str = "org:resource"):
    """Write France's possession sequences in ``data`` to an XES event log.

    The rows of ``data`` are read in their given order. Consecutive events of
    team ``'France'`` are collected into one trace. A trace is closed and written
    when

    * a France event carries the exact tag ``Goal`` (the goal is the last event
      of the trace and the trace's ``custom:endReason`` is that event's type), or
    * an event of any other team follows (``custom:endReason`` is the type of the
      last France event), or
    * the data ends while a trace is still open.

    ``custom:startReason`` is the type of the most recent non-France event seen
    before the trace was written (empty if there was none).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``Team``, ``Player``, ``Position``, ``Event``,
        ``EventSec``, ``SubEvent``, ``MatchPeriod``, ``Tags``, ``Match`` and
        ``Status``. ``Tags`` is expected to be a comma-separated string.
    filename : str, default "output.xes"
        Path of the file to create (overwritten if it exists).
    checkOn : str, default "org:resource"
        Attribute key(s) written into the ``Activity_Resource`` classifier.

    Notes
    -----
    * All attribute values are XML-escaped, so names or tags containing
      ``&``, ``<``, ``>`` or ``"`` no longer produce a malformed file.
    * The file is written as UTF-8, matching the encoding in the XML declaration.
    * Only the exact tag ``Goal`` ends a trace; ``Position: Goal ...`` tags on
      ordinary shots do not.
    * Timestamps are ``EventSec`` seconds added to an arbitrary base date; they
      restart in every period and match.
    """
    # Function-scope import keeps this definition self-contained.
    from xml.sax.saxutils import escape

    base_datetime = datetime(2020, 1, 1)  # Arbitrary base date for timestamps
    event_fields = (
        'Player', 'Position', 'Event', 'EventSec', 'SubEvent',
        'MatchPeriod', 'Tags', 'Match', 'Status',
    )

    def safe_str(value):
        """Safely convert value to string, handling NaN."""
        if pd.isna(value):
            return ""
        return str(value)

    def xml_attr(value):
        """Return ``value`` as text that is safe inside a double-quoted XML attribute."""
        return escape(safe_str(value), {'"': '&quot;'})

    def has_goal_tag(tags):
        """True when the comma-separated ``tags`` contain the exact tag ``Goal``."""
        return 'Goal' in (tag.strip() for tag in safe_str(tags).split(','))

    def write_trace(out, events, start_reason, end_reason):
        """Write one <trace> element holding ``events`` to the open file ``out``."""
        out.write('  <trace>\n')
        out.write(f'    <string key="custom:startReason" value="{xml_attr(start_reason)}" />\n')
        out.write(f'    <string key="custom:endReason" value="{xml_attr(end_reason)}" />\n')

        for event in events:
            event_datetime = base_datetime + timedelta(seconds=event["EventSec"])
            event_timestamp = event_datetime.isoformat()

            out.write('    <event>\n')
            out.write(f'      <string key="org:resource" value="{xml_attr(event["Position"])}" />\n')
            out.write(f'      <string key="concept:name" value="{xml_attr(event["Player"])}" />\n')
            out.write(f'      <string key="custom:action" value="{xml_attr(event["Event"])}" />\n')
            out.write(f'      <date key="time:timestamp" value="{event_timestamp}" />\n')
            out.write(f'      <string key="custom:subevent" value="{xml_attr(event["SubEvent"])}" />\n')
            out.write(f'      <string key="custom:matchperiod" value="{xml_attr(event["MatchPeriod"])}" />\n')
            out.write(f'      <string key="custom:tags" value="{xml_attr(event["Tags"])}" />\n')
            out.write(f'      <string key="custom:match" value="{xml_attr(event["Match"])}" />\n')
            out.write(f'      <string key="custom:status" value="{xml_attr(event["Status"])}" />\n')
            out.write('    </event>\n')

        out.write('  </trace>\n')

    with open(filename, 'w', encoding='utf-8') as f:
        # Write XML declaration and log opening tag
        f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        f.write('<log xes.version="1.0" xes.features="nested-attributes" openxes.version="1.0RC7" xmlns="http://www.xes-standard.org/">\n')
        f.write(f'<classifier name="Activity_Resource" keys="{xml_attr(checkOn)}"/>\n')
        f.write('<classifier name="Default" keys="concept:name"/>\n')

        current_trace = []  # France events of the trace being collected
        last_opponent_action = None  # Event type of the latest non-France event

        for _, row in data.iterrows():
            if row['Team'] == 'France':
                current_trace.append({field: row[field] for field in event_fields})

                # A goal closes the trace, with the goal as its final event.
                if has_goal_tag(row['Tags']):
                    write_trace(f, current_trace, last_opponent_action, row['Event'])
                    current_trace = []
            else:
                # Any non-France event closes the open trace, if there is one.
                if current_trace:
                    write_trace(f, current_trace, last_opponent_action, current_trace[-1]['Event'])
                    current_trace = []

                last_opponent_action = row['Event']

        # Write remaining trace if any
        if current_trace:
            write_trace(f, current_trace, last_opponent_action, current_trace[-1]['Event'])

        f.write('</log>\n')


In [34]:
# Export one XES log for all matches together and one per opponent.
# Every frame is passed through sort_by_match before it is written.
for xes_filename, events_to_export in (
    ("all_events_before_goal.xes", all_events_before_goals_sorted),
    ("belgium_events_before_goal.xes", events_belgium),
    ("argentina_events_before_goal.xes", events_argentina),
    ("uruguay_events_before_goal.xes", events_uruguay),
    ("croatia_events_before_goal.xes", events_croatia),
    ("australia_events_before_goal.xes", events_australia),
    ("peru_events_before_goal.xes", events_peru),
):
    create_events_before_goal_xes_file(sort_by_match(events_to_export), filename=xes_filename)