In [668]:
import pandas as pd
import ast

In [669]:
events_world_cup =pd.read_csv('../events_World_Cup.csv')
teams = pd.read_csv('../teams.csv')
tags2name = pd.read_csv('../tags2name.csv')
playerrank = pd.read_csv('../playerank.csv')
players = pd.read_csv('../players.csv')
matches_world_cup = pd.read_csv('../matches_World_Cup.csv')

Lets filter the unrequired columns

In [670]:
events_world_cup = events_world_cup[['subEventName', 'tags', 'playerId', 'matchId', 'eventName', 'teamId', 'eventSec', 'matchPeriod']]
players = players[['wyId', 'shortName']]
tags2name = tags2name[['Tag', 'Description']]
teams = teams[['wyId', 'officialName', 'type']]
playerrank = playerrank[['playerId', 'roleCluster']]


Start filtering the data

In [671]:
# Only national teams
teams = teams[teams['type'] == 'national']

# Only matches where France has played.
matches_world_cup = matches_world_cup[matches_world_cup['label'].str.contains("France")]

# Only events from the matches where France has played.
events_world_cup = events_world_cup[events_world_cup['matchId'].isin(matches_world_cup['wyId'])]

events_world_cup

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod
18813,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H
18814,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H
18815,Throw in,[{'id': 1801}],340646,2057966,Free Kick,4418,15.608867,1H
18816,Simple pass,[{'id': 1801}],209091,2057966,Pass,4418,16.385084,1H
18817,Launch,[{'id': 1802}],340646,2057966,Pass,4418,17.214485,1H
...,...,...,...,...,...,...,...,...
101751,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H
101752,Clearance,[{'id': 1802}],3309,2058017,Others on the ball,4418,2967.926784,2H
101753,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H
101754,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H


In [672]:
# Merge match name
data = pd.merge(events_world_cup, teams, left_on='teamId', right_on='wyId')
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId,officialName,type
0,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national
1,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H,8493,Australia,national
2,Throw in,[{'id': 1801}],340646,2057966,Free Kick,4418,15.608867,1H,4418,France,national
3,Simple pass,[{'id': 1801}],209091,2057966,Pass,4418,16.385084,1H,4418,France,national
4,Launch,[{'id': 1802}],340646,2057966,Pass,4418,17.214485,1H,4418,France,national
...,...,...,...,...,...,...,...,...,...,...,...
10838,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H,9598,Croatia,national
10839,Clearance,[{'id': 1802}],3309,2058017,Others on the ball,4418,2967.926784,2H,4418,France,national
10840,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H,9598,Croatia,national
10841,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H,9598,Croatia,national


In [673]:
# Merge player name
data = pd.merge(data, players, left_on='playerId', right_on='wyId')
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId_x,officialName,type,wyId_y,shortName
0,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national,238055,A. Nabbout
1,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H,8493,Australia,national,61395,T. Sainsbury
2,Throw in,[{'id': 1801}],340646,2057966,Free Kick,4418,15.608867,1H,4418,France,national,340646,B. Pavard
3,Simple pass,[{'id': 1801}],209091,2057966,Pass,4418,16.385084,1H,4418,France,national,209091,C. Tolisso
4,Launch,[{'id': 1802}],340646,2057966,Pass,4418,17.214485,1H,4418,France,national,340646,B. Pavard
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10838,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H,9598,Croatia,national,69396,D. Vida
10839,Clearance,[{'id': 1802}],3309,2058017,Others on the ball,4418,2967.926784,2H,4418,France,national,3309,R. Varane
10840,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H,9598,Croatia,national,69968,M. Brozovi\u0107
10841,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H,9598,Croatia,national,3476,I. Rakiti\u0107


In [674]:
# Merge tags

def map_tags_to_desc(tag_list):
    descriptions = []
    for tag in tag_list:
        tag_id = tag['id']
        description = tags2name.loc[tags2name['Tag'] == tag_id, 'Description'].values
        if len(description) > 0:
            descriptions.append(description[0])
    return descriptions

data['tags'] = data['tags'].apply(lambda x: map_tags_to_desc(ast.literal_eval(x)))
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId_x,officialName,type,wyId_y,shortName
0,Simple pass,[Accurate],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national,238055,A. Nabbout
1,High pass,[Not accurate],61395,2057966,Pass,8493,3.978396,1H,8493,Australia,national,61395,T. Sainsbury
2,Throw in,[Accurate],340646,2057966,Free Kick,4418,15.608867,1H,4418,France,national,340646,B. Pavard
3,Simple pass,[Accurate],209091,2057966,Pass,4418,16.385084,1H,4418,France,national,209091,C. Tolisso
4,Launch,[Not accurate],340646,2057966,Pass,4418,17.214485,1H,4418,France,national,340646,B. Pavard
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10838,High pass,[Not accurate],69396,2058017,Pass,9598,2964.715715,2H,9598,Croatia,national,69396,D. Vida
10839,Clearance,[Not accurate],3309,2058017,Others on the ball,4418,2967.926784,2H,4418,France,national,3309,R. Varane
10840,Throw in,[Accurate],69968,2058017,Free Kick,9598,2972.985039,2H,9598,Croatia,national,69968,M. Brozovi\u0107
10841,Simple pass,[Accurate],3476,2058017,Pass,9598,2978.301867,2H,9598,Croatia,national,3476,I. Rakiti\u0107


In [675]:
data = data.drop(columns=['playerId', 'teamId', 'type', 'wyId_y', 'wyId_x'])

# Replace matchId with match label
data = data.merge(matches_world_cup[['wyId', 'label']], left_on='matchId', right_on='wyId', how='left')

data['matchId'] = data['label']
data = data.drop(columns=['label', 'wyId'])

data = data.rename(columns=
                   {'officialName': 'Team',
                    'shortName': 'Player',
                    'matchId' : 'Match',
                    'eventName': 'Event',
                    'tags': 'Tags', 
                    'matchPeriod': 'MatchPeriod',
                    'eventSec': 'EventSec',
                    'subEventName': 'SubEvent'}
                    )
data


Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player
0,Simple pass,[Accurate],"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout
1,High pass,[Not accurate],"France - Australia, 2 - 1",Pass,3.978396,1H,Australia,T. Sainsbury
2,Throw in,[Accurate],"France - Australia, 2 - 1",Free Kick,15.608867,1H,France,B. Pavard
3,Simple pass,[Accurate],"France - Australia, 2 - 1",Pass,16.385084,1H,France,C. Tolisso
4,Launch,[Not accurate],"France - Australia, 2 - 1",Pass,17.214485,1H,France,B. Pavard
...,...,...,...,...,...,...,...,...
10838,High pass,[Not accurate],"France - Croatia, 4 - 2",Pass,2964.715715,2H,Croatia,D. Vida
10839,Clearance,[Not accurate],"France - Croatia, 4 - 2",Others on the ball,2967.926784,2H,France,R. Varane
10840,Throw in,[Accurate],"France - Croatia, 4 - 2",Free Kick,2972.985039,2H,Croatia,M. Brozovi\u0107
10841,Simple pass,[Accurate],"France - Croatia, 4 - 2",Pass,2978.301867,2H,Croatia,I. Rakiti\u0107


In [676]:
data['Tags'] = data['Tags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout
1,High pass,Not accurate,"France - Australia, 2 - 1",Pass,3.978396,1H,Australia,T. Sainsbury
2,Throw in,Accurate,"France - Australia, 2 - 1",Free Kick,15.608867,1H,France,B. Pavard
3,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,16.385084,1H,France,C. Tolisso
4,Launch,Not accurate,"France - Australia, 2 - 1",Pass,17.214485,1H,France,B. Pavard
...,...,...,...,...,...,...,...,...
10838,High pass,Not accurate,"France - Croatia, 4 - 2",Pass,2964.715715,2H,Croatia,D. Vida
10839,Clearance,Not accurate,"France - Croatia, 4 - 2",Others on the ball,2967.926784,2H,France,R. Varane
10840,Throw in,Accurate,"France - Croatia, 4 - 2",Free Kick,2972.985039,2H,Croatia,M. Brozovi\u0107
10841,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2978.301867,2H,Croatia,I. Rakiti\u0107


In [677]:
grouped  = data.groupby('Match').size().reset_index(name='count')

# Data per match
grouped

Unnamed: 0,Match,count
0,"Denmark - France, 0 - 0",1581
1,"France - Argentina, 4 - 3",1426
2,"France - Australia, 2 - 1",1514
3,"France - Belgium, 1 - 0",1588
4,"France - Croatia, 4 - 2",1459
5,"France - Peru, 1 - 0",1668
6,"Uruguay - France, 0 - 2",1607


In [678]:
# All relevant french players
french_players = data[data['Team'] == 'France']['Player'].unique().tolist()
french_players

['B. Pavard',
 'C. Tolisso',
 'R. Varane',
 'H. Lloris',
 'P. Pogba',
 'S. Umtiti',
 'A. Griezmann',
 'N. Kant\\u00e9',
 'K. Mbapp\\u00e9',
 'L. Hern\\u00e1ndez',
 'O. Demb\\u00e9l\\u00e9',
 'N. Fekir',
 'O. Giroud',
 'B. Matuidi',
 "S. N'Zonzi",
 'D. Sidib\\u00e9',
 'P. Kimpembe',
 'T. Lemar',
 'S. Mandanda',
 'B. Mendy',
 'F. Thauvin']

In [679]:
# Manually map the players to their respective postions
player_position_mapping = {
    'B. Pavard': 'RB',
    'C. Tolisso': 'CM',
    'R. Varane': 'CB',
    'H. Lloris': 'GKP',
    'P. Pogba': 'CM',
    'S. Umtiti': 'CB',
    'A. Griezmann': 'FW',
    'N. Kant\\u00e9': 'CDM',
    'K. Mbapp\\u00e9': 'FW',
    'L. Hern\\u00e1ndez': 'LB',
    'O. Demb\\u00e9l\\u00e9': 'RW',
    'N. Fekir': 'AM',
    'O. Giroud': 'FW',
    'B. Matuidi': 'LM',
    "S. N'Zonzi": 'CM',
    'D. Sidibé': 'RB',
    'P. Kimpembe': 'CB',
    'T. Lemar': 'LW',
    'S. Mandanda': 'GKP',
    'B. Mendy': 'LB',
    'F. Thauvin': 'RW',
}

# Map the Position based on Player names
data['Position'] = data['Player'].map(player_position_mapping)

data[data['Team'] == 'France']

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
2,Throw in,Accurate,"France - Australia, 2 - 1",Free Kick,15.608867,1H,France,B. Pavard,RB
3,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,16.385084,1H,France,C. Tolisso,CM
4,Launch,Not accurate,"France - Australia, 2 - 1",Pass,17.214485,1H,France,B. Pavard,RB
6,Simple pass,"Interception, Accurate","France - Australia, 2 - 1",Pass,22.249307,1H,France,C. Tolisso,CM
7,Simple pass,Not accurate,"France - Australia, 2 - 1",Pass,24.397477,1H,France,R. Varane,CB
...,...,...,...,...,...,...,...,...,...
10831,Foul,,"France - Croatia, 4 - 2",Foul,2906.751747,2H,France,N. Fekir,AM
10832,Ground attacking duel,"Neutral, Accurate","France - Croatia, 4 - 2",Duel,2919.788786,2H,France,P. Pogba,CM
10834,Goal kick,,"France - Croatia, 4 - 2",Free Kick,2955.453225,2H,France,H. Lloris,GKP
10835,Air duel,"Won, Accurate","France - Croatia, 4 - 2",Duel,2957.224360,2H,France,N. Fekir,AM


In [680]:
def sort_data(data):
    data = data.copy()
    
    # Define custom sorting order for matchPeriod
    custom_order = {'1H': 0, '2H': 1}
    
    # Map custom order to matchPeriod column to create a temporary sorting column
    data.loc[:, 'matchPeriodSort'] = data['MatchPeriod'].map(custom_order)
    
    # Sort the DataFrame by matchPeriodSort and then by eventSec
    sorted_data = data.sort_values(by=['matchPeriodSort', 'EventSec'], ascending=[True, True])
    
    # Drop the temporary sorting column
    sorted_data.drop(columns=['matchPeriodSort'], inplace=True)
    
    # Reset index for better readability (optional)
    sorted_data.reset_index(drop=True, inplace=True)
    
    return sorted_data

In [681]:
vs_aus = data[data['Match'].str.contains('France - Australia')]
vs_aus = sort_data(vs_aus)
vs_aus

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout,
1,High pass,Not accurate,"France - Australia, 2 - 1",Pass,3.978396,1H,Australia,T. Sainsbury,
2,Throw in,Accurate,"France - Australia, 2 - 1",Free Kick,15.608867,1H,France,B. Pavard,RB
3,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,16.385084,1H,France,C. Tolisso,CM
4,Launch,Not accurate,"France - Australia, 2 - 1",Pass,17.214485,1H,France,B. Pavard,RB
...,...,...,...,...,...,...,...,...,...
1509,Ground attacking duel,"Won, Accurate","France - Australia, 2 - 1",Duel,2990.323961,2H,France,P. Pogba,CM
1510,Foul,,"France - Australia, 2 - 1",Foul,2992.024230,2H,Australia,M. Leckie,
1511,Free Kick,Accurate,"France - Australia, 2 - 1",Free Kick,3026.263514,2H,France,R. Varane,CB
1512,Air duel,"Neutral, Accurate","France - Australia, 2 - 1",Duel,3027.983947,2H,France,O. Giroud,FW


In [682]:
vs_peru = sort_data(data[data['Match'].str.contains('France - Peru')])
vs_peru

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Peru, 1 - 0",Pass,2.527078,1H,France,A. Griezmann,FW
1,High pass,Accurate,"France - Peru, 1 - 0",Pass,6.227078,1H,France,R. Varane,CB
2,Air duel,"Won, Accurate","France - Peru, 1 - 0",Duel,9.205856,1H,France,O. Giroud,FW
3,Air duel,"Lost, Not accurate","France - Peru, 1 - 0",Duel,9.228730,1H,Peru,Y. Yot\u00fan,
4,Clearance,Not accurate,"France - Peru, 1 - 0",Others on the ball,10.407986,1H,Peru,P. Aquino,
...,...,...,...,...,...,...,...,...,...
1663,Shot,"Left foot, Blocked, Opportunity, Not accurate","France - Peru, 1 - 0",Shot,2929.314293,2H,France,O. Giroud,FW
1664,Touch,Interception,"France - Peru, 1 - 0",Others on the ball,2930.841420,2H,Peru,M. Trauco,
1665,Corner,Accurate,"France - Peru, 1 - 0",Free Kick,2964.372190,2H,France,O. Demb\u00e9l\u00e9,RW
1666,Cross,"Left foot, Blocked, Not accurate","France - Peru, 1 - 0",Pass,2965.524523,2H,France,N. Fekir,AM


In [683]:
vs_Denmark = sort_data(data[data['Match'].str.contains('Denmark - France')])
vs_Denmark

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,1.246874,1H,France,A. Griezmann,FW
1,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,3.455642,1H,France,S. N'Zonzi,CM
2,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,6.026132,1H,France,R. Varane,CB
3,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,7.114949,1H,France,D. Sidib\u00e9,
4,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,9.901039,1H,France,R. Varane,CB
...,...,...,...,...,...,...,...,...,...
1576,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,2863.904251,2H,France,T. Lemar,LW
1577,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,2866.041182,2H,France,P. Kimpembe,CB
1578,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,2867.072029,2H,France,R. Varane,CB
1579,Simple pass,Not accurate,"Denmark - France, 0 - 0",Pass,2871.569952,2H,France,D. Sidib\u00e9,


In [684]:
vs_Argentina = sort_data(data[data['Match'].str.contains('France - Argentina')])
vs_Argentina

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,1.302651,1H,France,A. Griezmann,FW
1,High pass,Accurate,"France - Argentina, 4 - 3",Pass,5.377012,1H,France,R. Varane,CB
2,Air duel,"Lost, Not accurate","France - Argentina, 4 - 3",Duel,7.507633,1H,France,O. Giroud,FW
3,Air duel,"Won, Accurate","France - Argentina, 4 - 3",Duel,8.162743,1H,Argentina,N. Tagliafico,
4,Foul,,"France - Argentina, 4 - 3",Foul,8.823650,1H,France,O. Giroud,FW
...,...,...,...,...,...,...,...,...,...
1421,High pass,Accurate,"France - Argentina, 4 - 3",Pass,2988.340412,2H,Argentina,\u00c1. di Mar\u00eda,
1422,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,2990.990089,2H,Argentina,S. Ag\u00fcero,
1423,Cross,"Left foot, High, Accurate","France - Argentina, 4 - 3",Pass,2993.544963,2H,Argentina,M. Meza,
1424,Shot,"Right foot, Opportunity, Position: Out low rig...","France - Argentina, 4 - 3",Shot,2994.625835,2H,Argentina,\u00c1. di Mar\u00eda,


In [685]:
vs_Uruguay = sort_data(data[data['Match'].str.contains('Uruguay - France')])
# For some reason the data is not available for this match
vs_Uruguay

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"Uruguay - France, 0 - 2",Pass,1.967609,1H,Uruguay,L. Su\u00e1rez,
1,Simple pass,Accurate,"Uruguay - France, 0 - 2",Pass,3.135021,1H,Uruguay,L. Torreira,
2,Simple pass,Accurate,"Uruguay - France, 0 - 2",Pass,5.843974,1H,Uruguay,D. God\u00edn,
3,Ground attacking duel,"Neutral, Accurate","Uruguay - France, 0 - 2",Duel,8.283414,1H,Uruguay,C. Stuani,
4,Ground defending duel,"Neutral, Accurate","Uruguay - France, 0 - 2",Duel,9.620708,1H,France,B. Pavard,RB
...,...,...,...,...,...,...,...,...,...
1602,Ground defending duel,"Free space left, Lost, Not accurate","Uruguay - France, 0 - 2",Duel,2978.227815,2H,France,S. N'Zonzi,CM
1603,Cross,"Right foot, High, Not accurate","Uruguay - France, 0 - 2",Pass,2979.019737,2H,Uruguay,M. Vecino,
1604,Goalkeeper leaving line,,"Uruguay - France, 0 - 2",Goalkeeper leaving line,2980.125441,2H,France,H. Lloris,GKP
1605,Hand pass,Accurate,"Uruguay - France, 0 - 2",Pass,2991.058941,2H,France,H. Lloris,GKP


In [686]:
vs_Belgium = sort_data((data[data['Match'].str.contains('France - Belgium')]))
vs_Belgium

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,1.782790,1H,France,A. Griezmann,FW
1,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,4.838240,1H,France,R. Varane,CB
2,Simple pass,Accurate,"France - Belgium, 1 - 0",Pass,6.994144,1H,France,B. Pavard,RB
3,Ground attacking duel,"Take on right, Won, Accurate","France - Belgium, 1 - 0",Duel,8.982080,1H,France,K. Mbapp\u00e9,FW
4,Ground defending duel,"Take on left, Lost, Not accurate","France - Belgium, 1 - 0",Duel,9.282822,1H,Belgium,J. Vertonghen,
...,...,...,...,...,...,...,...,...,...
1583,Reflexes,"Position: Goal low right, Accurate","France - Belgium, 1 - 0",Save attempt,3019.159244,2H,Belgium,T. Courtois,
1584,Corner,Accurate,"France - Belgium, 1 - 0",Free Kick,3055.990566,2H,France,K. Mbapp\u00e9,FW
1585,Ground attacking duel,"Lost, Not accurate","France - Belgium, 1 - 0",Duel,3058.502171,2H,France,P. Pogba,CM
1586,Ground defending duel,"Won, Accurate","France - Belgium, 1 - 0",Duel,3059.808444,2H,Belgium,Y. Carrasco,


In [687]:
vs_Croatia = sort_data(data[data['Match'].str.contains('France - Croatia')])
vs_Croatia

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,,"France - Croatia, 4 - 2",Pass,1.892339,1H,Croatia,M. Mand\u017euki\u0107,
1,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,3.889375,1H,Croatia,M. Brozovi\u0107,
2,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,6.140946,1H,Croatia,L. Modri\u0107,
3,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,9.226570,1H,Croatia,\u0160. Vrsaljko,
4,Launch,Accurate,"France - Croatia, 4 - 2",Pass,12.658969,1H,Croatia,D. Suba\u0161i\u0107,
...,...,...,...,...,...,...,...,...,...
1454,High pass,Not accurate,"France - Croatia, 4 - 2",Pass,2964.715715,2H,Croatia,D. Vida,
1455,Clearance,Not accurate,"France - Croatia, 4 - 2",Others on the ball,2967.926784,2H,France,R. Varane,CB
1456,Throw in,Accurate,"France - Croatia, 4 - 2",Free Kick,2972.985039,2H,Croatia,M. Brozovi\u0107,
1457,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2978.301867,2H,Croatia,I. Rakiti\u0107,


We have finished our data preparation. Time to create an structured XES file

In [688]:
from datetime import datetime, timedelta

def create_xes_file(data, filename="output.xes", shouldFilter=True):

    base_datetime = datetime(2020, 1, 1) # Arbitrary date for the timestamp

    # Open file to write
    with open(filename, 'w') as f:
        # Write XML declaration and log opening tag
        f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        f.write('<log xes.version="1.0" xes.features="nested-attributes" openxes.version="1.0RC7" xmlns="http://www.xes-standard.org/">\n')
        
        # Initialize variables for tracking traces
        current_trace = []
        end_reason = None
        in_possession = False
        last_opponent_action = None  # To keep track of the last action by the opponent
        
        # Iterate over each row in the data
        for idx, row in data.iterrows():
            # Check if the team is "France"
            if row['Team'] == 'France':
                # Start a new trace if we're not in possession
                if not in_possession:
                    # If there was a previous trace, write it to the file
                    if current_trace:
                        # Apply filtering if shouldFilter is true
                        if not shouldFilter or len(current_trace) > 1:
                            start_reason = last_opponent_action  # Set start reason to last opponent action
                            end_reason = current_trace[-1]['Event']
                            
                            if end_reason == 'Shot':
                                event = current_trace[-1]
                                # print(current_trace[-1])
                                if 'Goal' in event['Tags'] and not 'not accurate' in event['Tags'] and 'GKP' not in event['Position']:
                                    current_trace = []
                                    in_possession = True
                                    print('lolcat')
                                    continue
                            
                            # Write trace to file
                            f.write('  <trace>\n')
                            f.write('    <event>\n')
                            f.write(f'      <string key="concept:name" value="startcause {start_reason}" />\n')
                            f.write('    </event>\n')
                              

                            
                            # Write each event in the trace
                            for event in current_trace:
                                event_datetime = base_datetime + timedelta(seconds=event["EventSec"])
                                event_timestamp = event_datetime.isoformat()  # ISO format for timesta

                                f.write('    <event>\n')
                                f.write(f'      <string key="Position" value="{event["Position"]}" />\n')
                                f.write(f'      <string key="concept:name" value="{event["Player"]}" />\n')
                                f.write(f'      <string key="Action" value="{event["Event"]}" />\n')
                                f.write(f'      <string key="time:timestamp" value="{event_timestamp}" />\n')
                                f.write(f'      <string key="SubEvent" value="{event["SubEvent"]}" />\n')
                                f.write(f'      <string key="MatchPeriod" value="{event["MatchPeriod"]}" />\n')
                                f.write(f'      <string key="Tags" value="{event["Tags"]}" />\n')
                                f.write('    </event>\n')
                            f.write('    <event>\n')
                            f.write(f'      <string key="concept:name" value="endcause {end_reason}" />\n')
                            f.write('    </event>\n')
                            # Close trace
                            f.write('  </trace>\n')
                    
                    # Reset trace variables
                    current_trace = []
                    in_possession = True
                
                # Add the current event to the trace
                current_trace.append({
                    'Player': row['Player'],
                    'Position': row['Position'],
                    'Event': row['Event'],
                    'EventSec': row['EventSec'],
                    'SubEvent': row['SubEvent'],
                    'MatchPeriod': row['MatchPeriod'],
                    'Tags': row['Tags']
                })
                
            else:
                # If team is not France, complete the current trace
                if in_possession:
                    # Apply filtering if shouldFilter is true
                    if not shouldFilter or len(current_trace) > 1:
                        end_reason = current_trace[-1]['Event']

                        if end_reason == 'Shot':
                            event = current_trace[-1]
                            # print(current_trace[-1])
                            if 'Goal' in event['Tags'] and not 'not accurate' in event['Tags'] and 'GKP' not in event['Position']:
                                print(event)
                                # current_trace = []
                                # in_possession = False
                                # last_opponent_action = row['Event']
                                # continue

                            #HERE ignore every trace that ends with france scoring goals
                        
                        # Write trace to file
                        f.write('  <trace>\n')
                        f.write('    <event>\n')
                        f.write(f'      <string key="concept:name" value="startcause {last_opponent_action}" />\n')
                        f.write('    </event>\n')
                        
                        
                        # Write each event in the trace
                        for event in current_trace:
                            event_datetime = base_datetime + timedelta(seconds=event["EventSec"])
                            event_timestamp = event_datetime.isoformat()  # ISO format for timestamp

                            f.write('    <event>\n')
                            f.write(f'      <string key="concept:name" value="{event["Player"]}" />\n')
                            f.write(f'      <string key="Position" value="{event["Position"]}" />\n')
                            f.write(f'      <string key="Action" value="{event["Event"]}" />\n')
                            f.write(f'      <string key="time:timeStamp" value="{event_timestamp}" />\n')
                            f.write(f'      <string key="SubEvent" value="{event["SubEvent"]}" />\n')
                            f.write(f'      <string key="MatchPeriod" value="{event["MatchPeriod"]}" />\n')
                            f.write(f'      <string key="Tags" value="{event["Tags"]}" />\n')
                            f.write('    </event>\n')
                        
                        f.write('    <event>\n')
                        f.write(f'      <string key="concept:name" value="endcause {end_reason}" />\n')
                        f.write('    </event>\n')
                        # Close trace
                        f.write('  </trace>\n')
                    
                    # Reset trace variables
                    current_trace = []
                    in_possession = False
                
                # Keep track of the last action of the opponent
                last_opponent_action = row['Event']  # Update the last opponent action
        
        # Close the remaining trace if still in possession
        if current_trace:
            # Apply filtering if shouldFilter is true
            if not shouldFilter or len(current_trace) > 1:
                start_reason = last_opponent_action  # Use last opponent action as start reason
                end_reason = current_trace[-1]['Event']


                if end_reason == 'Shot':
                    event = current_trace[-1]
                    # print(current_trace[-1])
                    if 'Goal' in event['Tags'] and not 'not accurate' in event['Tags'] and 'GKP' not in event['Position']:
                        print("lolcat")
                        return
                f.write('  <trace>\n')

                f.write('    <event>\n')
                f.write(f'      <string key="concept:name" value="startcause {start_reason}" />\n')
                f.write('    </event>\n')
                
                # Write each event in the trace
                for event in current_trace:
                    event_datetime = base_datetime + timedelta(seconds=event["EventSec"])
                    event_timestamp = event_datetime.isoformat() 
                    f.write('    <event>\n')
                    f.write(f'      <string key="Position" value="{event["Position"]}" />\n')
                    f.write(f'      <string key="concept:name" value="{event["Player"]}" />\n')
                    f.write(f'      <string key="Action" value="{event["Event"]}" />\n')
                    f.write(f'      <string key="time:timeStamp" value="{event_timestamp}" />\n')
                    f.write(f'      <string key="SubEvent" value="{event["SubEvent"]}" />\n')
                    f.write(f'      <string key="MatchPeriod" value="{event["MatchPeriod"]}" />\n')
                    f.write(f'      <string key="Tags" value="{event["Tags"]}" />\n')
                    f.write('    </event>\n')
                
                f.write('    <event>\n')
                f.write(f'      <string key="concept:name" value="endcause {end_reason}" />\n')
                f.write('    </event>\n')

                f.write('  </trace>\n')
        
        # Close log
        f.write('</log>\n')


In [689]:
file_path_prefix = 'data/extraEvents/'

In [690]:
# File for match vs australia
vs_aus_first_half = vs_aus[vs_aus['MatchPeriod'] == '1H']
vs_aus_second_half = vs_aus[vs_aus['MatchPeriod'] == '2H']

create_xes_file(vs_aus_first_half, filename=file_path_prefix + 'vs_Australia_1h.xes')
create_xes_file(vs_aus_second_half, filename=file_path_prefix + 'vs_Australia_2h.xes')

{'Player': 'K. Mbapp\\u00e9', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 91.06392000000004, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Right foot, Opportunity, Position: Goal center right, Accurate'}
{'Player': 'A. Griezmann', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 336.94078, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Right foot, Opportunity, Position: Goal low right, Accurate'}


In [691]:
vs_Argentina_first_half = vs_Argentina[vs_Argentina['MatchPeriod'] == '1H']
vs_Argentina_second_half = vs_Argentina[vs_Argentina['MatchPeriod'] == '2H']

create_xes_file(vs_Argentina_first_half, filename=file_path_prefix + 'vs_Argentina_1h.xes', shouldFilter=True)
create_xes_file(vs_Argentina_second_half, filename=file_path_prefix + 'vs_Argentina_2h.xes', shouldFilter=True)

{'Player': 'K. Mbapp\\u00e9', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 1369.3453680000002, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Goal, Right foot, Opportunity, Position: Goal low left, Accurate'}


In [692]:
vs_Denmark_first_half = vs_Denmark[vs_Denmark['MatchPeriod'] == '1H']
vs_Denmark_second_half = vs_Denmark[vs_Denmark['MatchPeriod'] == '2H']

create_xes_file(vs_Denmark_first_half, filename=file_path_prefix + 'vs_Denmark_1h.xes')
create_xes_file(vs_Denmark_second_half, filename=file_path_prefix + 'vs_Denmark_2h.xes')


{'Player': 'O. Giroud', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 862.21922, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Left foot, Opportunity, Position: Goal high left, Accurate'}
{'Player': 'A. Griezmann', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 2331.711533, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Left foot, Opportunity, Position: Goal center, Accurate'}
{'Player': 'A. Griezmann', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 339.2113479999998, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Left foot, Opportunity, Position: Goal low right, Accurate'}
{'Player': 'N. Fekir', 'Position': 'AM', 'Event': 'Shot', 'EventSec': 2212.6605550000004, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Left foot, Opportunity, Position: Goal low left, Accurate'}


In [693]:
vs_Peru_first_half = vs_peru[vs_peru['MatchPeriod'] == '1H']
vs_Peru_second_half = vs_peru[vs_peru['MatchPeriod'] == '2H']

create_xes_file(vs_Peru_first_half, filename=file_path_prefix + 'vs_Peru_1h.xes')
create_xes_file(vs_Peru_second_half, filename=file_path_prefix + 'vs_Peru_2h.xes')

{'Player': 'K. Mbapp\\u00e9', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 1978.651572, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Right foot, Opportunity, Position: Goal center, Accurate'}
{'Player': 'L. Hern\\u00e1ndez', 'Position': 'LB', 'Event': 'Shot', 'EventSec': 2549.927555, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Left foot, Opportunity, Position: Goal high center, Accurate'}


In [694]:
vs_Uruguay_first_half = vs_Uruguay[vs_Uruguay['MatchPeriod'] == '1H']
vs_Uruguay_second_half = vs_Uruguay[vs_Uruguay['MatchPeriod'] == '2H']

create_xes_file(vs_Uruguay_first_half, filename=file_path_prefix + 'vs_Uruguay_1h.xes')
create_xes_file(vs_Uruguay_second_half, filename=file_path_prefix + 'vs_Uruguay_2h.xes')

{'Player': 'A. Griezmann', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 947.2511969999996, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Goal, Counter attack, Left foot, Opportunity, Position: Goal center, Accurate'}


In [695]:
vs_Belgium_first_half = vs_Belgium[vs_Belgium['MatchPeriod'] == '1H']
vs_Belgium_second_half = vs_Belgium[vs_Belgium['MatchPeriod'] == '2H']

create_xes_file(vs_Belgium_first_half, filename=file_path_prefix + 'vs_Belgium_1h.xes')
create_xes_file(vs_Belgium_second_half, filename=file_path_prefix + 'vs_Belgium_2h.xes')

{'Player': 'B. Pavard', 'Position': 'RB', 'Event': 'Shot', 'EventSec': 2338.335249, 'SubEvent': 'Shot', 'MatchPeriod': '1H', 'Tags': 'Right foot, Opportunity, Position: Goal low center, Accurate'}
{'Player': 'S. Umtiti', 'Position': 'CB', 'Event': 'Shot', 'EventSec': 336.2704319999998, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Goal, Head/body, Opportunity, Position: Goal center right, Accurate'}
{'Player': 'A. Griezmann', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 2843.443873, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Right foot, Opportunity, Position: Goal low left, Accurate'}
{'Player': 'C. Tolisso', 'Position': 'CM', 'Event': 'Shot', 'EventSec': 3016.680937, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Left foot, Opportunity, Position: Goal low right, Accurate'}


In [696]:
vs_Croatia_first_half = vs_Croatia[vs_Croatia['MatchPeriod'] == '1H']
vs_Croatia_second_half = vs_Croatia[vs_Croatia['MatchPeriod'] == '2H']

create_xes_file(vs_Croatia_first_half, filename=file_path_prefix + 'vs_Croatia_1h.xes')
create_xes_file(vs_Croatia_second_half, filename=file_path_prefix + 'vs_Croatia_2h.xes')

{'Player': 'A. Griezmann', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 70.34117400000014, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Left foot, Opportunity, Position: Goal center, Accurate'}
{'Player': 'K. Mbapp\\u00e9', 'Position': 'FW', 'Event': 'Shot', 'EventSec': 1177.0585709999996, 'SubEvent': 'Shot', 'MatchPeriod': '2H', 'Tags': 'Goal, Right foot, Opportunity, Position: Goal low left, Accurate'}


In [697]:
vs_Argentina_first_half

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,1.302651,1H,France,A. Griezmann,FW
1,High pass,Accurate,"France - Argentina, 4 - 3",Pass,5.377012,1H,France,R. Varane,CB
2,Air duel,"Lost, Not accurate","France - Argentina, 4 - 3",Duel,7.507633,1H,France,O. Giroud,FW
3,Air duel,"Won, Accurate","France - Argentina, 4 - 3",Duel,8.162743,1H,Argentina,N. Tagliafico,
4,Foul,,"France - Argentina, 4 - 3",Foul,8.823650,1H,France,O. Giroud,FW
...,...,...,...,...,...,...,...,...,...
716,Touch,Interception,"France - Argentina, 4 - 3",Others on the ball,2794.683838,1H,Argentina,N. Tagliafico,
717,Throw in,Accurate,"France - Argentina, 4 - 3",Free Kick,2810.611296,1H,France,B. Pavard,RB
718,Ground defending duel,"Neutral, Accurate","France - Argentina, 4 - 3",Duel,2811.385867,1H,Argentina,\u00c9. Banega,
719,Ground attacking duel,"Neutral, Accurate","France - Argentina, 4 - 3",Duel,2811.475579,1H,France,P. Pogba,CM


In [698]:
len(data['Tags'].unique())

228

In [699]:
unique =vs_Argentina_first_half[vs_Argentina_first_half["Tags"].str.contains("Goal")]
unique

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
180,Penalty,"Goal, Left foot, Position: Goal low center, Ac...","France - Argentina, 4 - 3",Free Kick,761.289657,1H,France,A. Griezmann,FW
181,Reflexes,"Goal, Position: Goal low center, Not accurate","France - Argentina, 4 - 3",Save attempt,762.916099,1H,Argentina,F. Armani,
624,Shot,"Goal, Left foot, Opportunity, Position: Goal c...","France - Argentina, 4 - 3",Shot,2442.184087,1H,Argentina,\u00c1. di Mar\u00eda,
625,Save attempt,"Goal, Position: Goal center right, Not accurate","France - Argentina, 4 - 3",Save attempt,2443.727372,1H,France,H. Lloris,GKP


In [700]:
unique =vs_Argentina_second_half[vs_Argentina_second_half["Tags"].str.contains("Goal")]
unique

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
755,Shot,"Goal, Left foot, Interception, Opportunity, Po...","France - Argentina, 4 - 3",Shot,156.249344,2H,Argentina,G. Mercado,
756,Reflexes,"Goal, Position: Goal low center, Not accurate","France - Argentina, 4 - 3",Save attempt,156.591053,2H,France,H. Lloris,GKP
862,Shot,"Goal, Right foot, Opportunity, Position: Goal ...","France - Argentina, 4 - 3",Shot,708.161917,2H,France,B. Pavard,RB
863,Save attempt,"Goal, Position: Goal high left, Not accurate","France - Argentina, 4 - 3",Save attempt,710.308541,2H,Argentina,F. Armani,
943,Shot,"Goal, Left foot, Opportunity, Position: Goal l...","France - Argentina, 4 - 3",Shot,1088.431408,2H,France,K. Mbapp\u00e9,FW
944,Reflexes,"Goal, Position: Goal low center, Not accurate","France - Argentina, 4 - 3",Save attempt,1090.703787,2H,Argentina,F. Armani,
989,Shot,"Goal, Right foot, Opportunity, Position: Goal ...","France - Argentina, 4 - 3",Shot,1369.345368,2H,France,K. Mbapp\u00e9,FW
990,Reflexes,"Goal, Position: Goal low left, Not accurate","France - Argentina, 4 - 3",Save attempt,1371.299339,2H,Argentina,F. Armani,
1281,Shot,"Right foot, Opportunity, Position: Goal low ce...","France - Argentina, 4 - 3",Shot,2358.319979,2H,Argentina,L. Messi,
1282,Save attempt,"Position: Goal low center, Accurate","France - Argentina, 4 - 3",Save attempt,2358.748329,2H,France,H. Lloris,GKP
