In [129]:
import ast
from datetime import datetime, timedelta
from xml.sax.saxutils import escape

import pandas as pd

In [130]:
# Load the raw CSV inputs; every path is relative to the directory above the notebook.
events_world_cup =pd.read_csv('../events_World_Cup.csv')
teams = pd.read_csv('../teams.csv')
tags2name = pd.read_csv('../tags2name.csv')
# NOTE(review): playerrank is only column-filtered in the next cell and is not referenced
# again in the visible part of the notebook — confirm it is still needed.
playerrank = pd.read_csv('../playerank.csv')
players = pd.read_csv('../players.csv')
matches_world_cup = pd.read_csv('../matches_World_Cup.csv')

Let's drop the columns we don't need.

In [131]:
# Trim every table down to the columns used later in the notebook.
events_world_cup = events_world_cup.loc[
    :,
    ['subEventName', 'tags', 'playerId', 'matchId', 'eventName', 'teamId', 'eventSec', 'matchPeriod'],
]
players = players.loc[:, ['wyId', 'shortName']]
tags2name = tags2name.loc[:, ['Tag', 'Description']]
teams = teams.loc[:, ['wyId', 'officialName', 'type']]
playerrank = playerrank.loc[:, ['playerId', 'roleCluster']]


Start filtering the data

In [132]:
# Keep the rows of the team table whose type is 'national'.
teams = teams.loc[teams['type'].eq('national')]

# Matches France took part in: their label contains the team name.
matches_world_cup = matches_world_cup.loc[matches_world_cup['label'].str.contains("France")]

# Events that belong to one of those matches.
events_world_cup = events_world_cup.loc[events_world_cup['matchId'].isin(matches_world_cup['wyId'])]

events_world_cup

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod
18813,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H
18814,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H
18815,Throw in,[{'id': 1801}],340646,2057966,Free Kick,4418,15.608867,1H
18816,Simple pass,[{'id': 1801}],209091,2057966,Pass,4418,16.385084,1H
18817,Launch,[{'id': 1802}],340646,2057966,Pass,4418,17.214485,1H
...,...,...,...,...,...,...,...,...
101751,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H
101752,Clearance,[{'id': 1802}],3309,2058017,Others on the ball,4418,2967.926784,2H
101753,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H
101754,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H


In [133]:
# Attach the team's official name (and type) to every event via the team id.
data = events_world_cup.merge(teams, left_on='teamId', right_on='wyId')
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId,officialName,type
0,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national
1,High pass,[{'id': 1802}],61395,2057966,Pass,8493,3.978396,1H,8493,Australia,national
2,Head pass,[{'id': 1802}],61425,2057966,Pass,8493,19.920463,1H,8493,Australia,national
3,High pass,[{'id': 1801}],62389,2057966,Pass,8493,26.371362,1H,8493,Australia,national
4,Air duel,"[{'id': 701}, {'id': 1802}]",16151,2057966,Duel,8493,27.942092,1H,8493,Australia,national
...,...,...,...,...,...,...,...,...,...,...,...
10838,Touch,[],69396,2058017,Others on the ball,9598,2960.803153,2H,9598,Croatia,national
10839,High pass,[{'id': 1802}],69396,2058017,Pass,9598,2964.715715,2H,9598,Croatia,national
10840,Throw in,[{'id': 1801}],69968,2058017,Free Kick,9598,2972.985039,2H,9598,Croatia,national
10841,Simple pass,[{'id': 1801}],3476,2058017,Pass,9598,2978.301867,2H,9598,Croatia,national


In [134]:
# Attach the player's short name to every event via the player id.
data = data.merge(players, left_on='playerId', right_on='wyId')
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId_x,officialName,type,wyId_y,shortName
0,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national,238055,A. Nabbout
1,Simple pass,[{'id': 1801}],238055,2057966,Pass,8493,318.452504,1H,8493,Australia,national,238055,A. Nabbout
2,Ground loose ball duel,"[{'id': 701}, {'id': 1802}]",238055,2057966,Duel,8493,622.800480,1H,8493,Australia,national,238055,A. Nabbout
3,Head pass,[{'id': 1801}],238055,2057966,Pass,8493,1089.881235,1H,8493,Australia,national,238055,A. Nabbout
4,Air duel,"[{'id': 701}, {'id': 1802}]",238055,2057966,Duel,8493,1310.158067,1H,8493,Australia,national,238055,A. Nabbout
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10838,Simple pass,[{'id': 1801}],69411,2058017,Pass,9598,2888.451480,2H,9598,Croatia,national,69411,A. Kramari\u0107
10839,Ground attacking duel,"[{'id': 502}, {'id': 703}, {'id': 1801}]",135810,2058017,Duel,9598,2398.932619,2H,9598,Croatia,national,135810,M. Pjaca
10840,Simple pass,[{'id': 1802}],135810,2058017,Pass,9598,2400.002448,2H,9598,Croatia,national,135810,M. Pjaca
10841,Simple pass,[{'id': 1801}],135810,2058017,Pass,9598,2567.478676,2H,9598,Croatia,national,135810,M. Pjaca


In [135]:
# Merge tags

def map_tags_to_desc(tag_list):
    """Translate a list of ``{'id': ...}`` tag dicts into tag descriptions.

    Each id is looked up in the ``Tag`` column of the module-level
    ``tags2name`` table. Ids without a matching row are skipped; when several
    rows match, the first one is used. The input order is preserved.
    """
    # One array of matching descriptions per tag, evaluated lazily in input order.
    matches_per_tag = (
        tags2name.loc[tags2name['Tag'] == entry['id'], 'Description'].values
        for entry in tag_list
    )
    return [matches[0] for matches in matches_per_tag if len(matches) > 0]

# Parse each stringified tag list, then swap the tag ids for their descriptions.
data['tags'] = data['tags'].map(ast.literal_eval).map(map_tags_to_desc)
data

Unnamed: 0,subEventName,tags,playerId,matchId,eventName,teamId,eventSec,matchPeriod,wyId_x,officialName,type,wyId_y,shortName
0,Simple pass,[Accurate],238055,2057966,Pass,8493,1.435354,1H,8493,Australia,national,238055,A. Nabbout
1,Simple pass,[Accurate],238055,2057966,Pass,8493,318.452504,1H,8493,Australia,national,238055,A. Nabbout
2,Ground loose ball duel,"[Lost, Not accurate]",238055,2057966,Duel,8493,622.800480,1H,8493,Australia,national,238055,A. Nabbout
3,Head pass,[Accurate],238055,2057966,Pass,8493,1089.881235,1H,8493,Australia,national,238055,A. Nabbout
4,Air duel,"[Lost, Not accurate]",238055,2057966,Duel,8493,1310.158067,1H,8493,Australia,national,238055,A. Nabbout
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10838,Simple pass,[Accurate],69411,2058017,Pass,9598,2888.451480,2H,9598,Croatia,national,69411,A. Kramari\u0107
10839,Ground attacking duel,"[Free space left, Won, Accurate]",135810,2058017,Duel,9598,2398.932619,2H,9598,Croatia,national,135810,M. Pjaca
10840,Simple pass,[Not accurate],135810,2058017,Pass,9598,2400.002448,2H,9598,Croatia,national,135810,M. Pjaca
10841,Simple pass,[Accurate],135810,2058017,Pass,9598,2567.478676,2H,9598,Croatia,national,135810,M. Pjaca


In [136]:
# Drop the id/helper columns, replace the match id with the match label and
# give the remaining columns their presentation names.
data = (
    data
    .drop(columns=['playerId', 'teamId', 'type', 'wyId_y', 'wyId_x'])
    .merge(matches_world_cup[['wyId', 'label']], left_on='matchId', right_on='wyId', how='left')
    # Overwrite matchId in place so the column keeps its position.
    .assign(matchId=lambda frame: frame['label'])
    .drop(columns=['label', 'wyId'])
    .rename(columns={
        'officialName': 'Team',
        'shortName': 'Player',
        'matchId': 'Match',
        'eventName': 'Event',
        'tags': 'Tags',
        'matchPeriod': 'MatchPeriod',
        'eventSec': 'EventSec',
        'subEventName': 'SubEvent',
    })
)
data


Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player
0,Simple pass,[Accurate],"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout
1,Simple pass,[Accurate],"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout
2,Ground loose ball duel,"[Lost, Not accurate]","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout
3,Head pass,[Accurate],"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout
4,Air duel,"[Lost, Not accurate]","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout
...,...,...,...,...,...,...,...,...
10838,Simple pass,[Accurate],"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107
10839,Ground attacking duel,"[Free space left, Won, Accurate]","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca
10840,Simple pass,[Not accurate],"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca
10841,Simple pass,[Accurate],"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca


In [137]:
# Flatten every tag list into one comma-separated string; values that are not lists pass through unchanged.
data['Tags'] = [
    ', '.join(tags) if isinstance(tags, list) else tags
    for tags in data['Tags']
]
data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout
1,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout
2,Ground loose ball duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout
3,Head pass,Accurate,"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout
4,Air duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout
...,...,...,...,...,...,...,...,...
10838,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107
10839,Ground attacking duel,"Free space left, Won, Accurate","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca
10840,Simple pass,Not accurate,"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca
10841,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca


In [138]:
# Number of events recorded for each match.
grouped = data.groupby('Match').size().rename('count').reset_index()

grouped

Unnamed: 0,Match,count
0,"Denmark - France, 0 - 0",1581
1,"France - Argentina, 4 - 3",1426
2,"France - Australia, 2 - 1",1514
3,"France - Belgium, 1 - 0",1588
4,"France - Croatia, 4 - 2",1459
5,"France - Peru, 1 - 0",1668
6,"Uruguay - France, 0 - 2",1607


In [139]:
# Every player with at least one event recorded for France, in order of first appearance.
french_players = data.loc[data['Team'].eq('France'), 'Player'].unique().tolist()
french_players

['B. Pavard',
 'C. Tolisso',
 'R. Varane',
 'H. Lloris',
 'P. Pogba',
 'S. Umtiti',
 'A. Griezmann',
 'N. Kant\\u00e9',
 'K. Mbapp\\u00e9',
 'L. Hern\\u00e1ndez',
 'O. Demb\\u00e9l\\u00e9',
 'N. Fekir',
 'O. Giroud',
 'B. Matuidi',
 "S. N'Zonzi",
 'D. Sidib\\u00e9',
 'P. Kimpembe',
 'T. Lemar',
 'S. Mandanda',
 'B. Mendy',
 'F. Thauvin']

In [140]:
# Manually map the players to their respective positions.
# Several keys contain a literal backslash sequence (e.g. \u00e9 written out as text)
# instead of the accented character; they are spelled that way to match the names
# exactly as they are listed in `french_players` above.
player_position_mapping = {
    'B. Pavard': 'RB',
    'C. Tolisso': 'CM',
    'R. Varane': 'CB',
    'H. Lloris': 'GKP',
    'P. Pogba': 'CM',
    'S. Umtiti': 'CB',
    'A. Griezmann': 'FW',
    'N. Kant\\u00e9': 'CDM',
    'K. Mbapp\\u00e9': 'FW',
    'L. Hern\\u00e1ndez': 'LB',
    'O. Demb\\u00e9l\\u00e9': 'RW',
    'N. Fekir': 'AM',
    'O. Giroud': 'FW',
    'B. Matuidi': 'LM',
    "S. N'Zonzi": 'CM',
    # Accented spelling; the escaped spelling of the same name is the last entry.
    'D. Sidibé': 'RB',
    'P. Kimpembe': 'CB',
    'T. Lemar': 'LW',
    'S. Mandanda': 'GKP',
    'B. Mendy': 'LB',
    'F. Thauvin': 'RW',
    'D. Sidib\\u00e9': 'RB',
}

# Map the Position based on Player names; players missing from the mapping get NaN.
data['Position'] = data['Player'].map(player_position_mapping)

# Show only France's events to check the assigned positions.
data[data['Team'] == 'France']

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
700,Throw in,Accurate,"France - Australia, 2 - 1",Free Kick,15.608867,1H,France,B. Pavard,RB
701,Launch,Not accurate,"France - Australia, 2 - 1",Pass,17.214485,1H,France,B. Pavard,RB
702,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,32.884277,1H,France,B. Pavard,RB
703,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,54.866585,1H,France,B. Pavard,RB
704,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,75.441589,1H,France,B. Pavard,RB
...,...,...,...,...,...,...,...,...,...
6020,Simple pass,Accurate,"Denmark - France, 0 - 0",Pass,2848.941221,2H,France,B. Mendy,LB
6021,Cross,"Left foot, Blocked, Not accurate","Denmark - France, 0 - 0",Pass,2853.514985,2H,France,B. Mendy,LB
6022,Throw in,Accurate,"Denmark - France, 0 - 0",Free Kick,2859.488457,2H,France,B. Mendy,LB
6023,Simple pass,Accurate,"France - Argentina, 4 - 3",Pass,2613.230156,2H,France,F. Thauvin,RW


In [141]:
# Full table: rows of players that are not in the position mapping have an empty Position.
data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout,
1,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout,
2,Ground loose ball duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout,
3,Head pass,Accurate,"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout,
4,Air duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout,
...,...,...,...,...,...,...,...,...,...
10838,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107,
10839,Ground attacking duel,"Free space left, Won, Accurate","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca,
10840,Simple pass,Not accurate,"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca,
10841,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca,


In [142]:
# Events of the first half ("1H") of every match.
first_half_data = data.loc[data["MatchPeriod"].eq("1H")]
first_half_data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout,
1,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout,
2,Ground loose ball duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout,
3,Head pass,Accurate,"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout,
4,Air duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout,
...,...,...,...,...,...,...,...,...,...
10800,Ground attacking duel,"Take on right, Neutral, Accurate","France - Croatia, 4 - 2",Duel,2630.282855,1H,Croatia,A. Rebi\u0107,
10801,Ground loose ball duel,"Neutral, Accurate","France - Croatia, 4 - 2",Duel,2631.917907,1H,Croatia,A. Rebi\u0107,
10802,Ground loose ball duel,"Neutral, Accurate","France - Croatia, 4 - 2",Duel,2764.799038,1H,Croatia,A. Rebi\u0107,
10803,Ground loose ball duel,"Lost, Not accurate","France - Croatia, 4 - 2",Duel,2765.454163,1H,Croatia,A. Rebi\u0107,


In [143]:
# Events of the second half ("2H") of every match.
second_half_data = data.loc[data["MatchPeriod"].eq("2H")]
second_half_data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
11,Ground attacking duel,"Won, Accurate","France - Australia, 2 - 1",Duel,149.149446,2H,Australia,A. Nabbout,
12,Air duel,"Won, Accurate","France - Australia, 2 - 1",Duel,375.901094,2H,Australia,A. Nabbout,
13,Ground loose ball duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,437.526830,2H,Australia,A. Nabbout,
64,Free Kick,Accurate,"France - Australia, 2 - 1",Free Kick,175.327988,2H,Australia,T. Sainsbury,
65,Air duel,"Won, Accurate","France - Australia, 2 - 1",Duel,242.532037,2H,Australia,T. Sainsbury,
...,...,...,...,...,...,...,...,...,...
10838,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107,
10839,Ground attacking duel,"Free space left, Won, Accurate","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca,
10840,Simple pass,Not accurate,"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca,
10841,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca,


In [144]:
# List the distinct sub-event names present in the data (missing values show up as nan).
data["SubEvent"].unique()

array(['Simple pass', 'Ground loose ball duel', 'Head pass', 'Air duel',
       'Ground attacking duel', 'Ground defending duel', 'Foul',
       'High pass', 'Touch', 'Free Kick', 'Launch', 'Clearance',
       'Free kick cross', 'Corner', 'Acceleration', 'Smart pass', 'Cross',
       'Throw in', 'Penalty', 'Reflexes', 'Save attempt', 'Hand pass',
       'Goal kick', 'Goalkeeper leaving line', 'Shot', 'Hand foul',
       'Free kick shot', nan, 'Time lost foul', 'Out of game foul'],
      dtype=object)

## The sub-events `Shot`, `Free kick shot`, `Cross`, `Free kick cross` and `Corner` are the most interesting ones for now.

In [145]:
# Replace missing sub-event names with an empty string so later string handling never sees NaN.
data['SubEvent'] = data['SubEvent'].fillna('')

# first_half_data / second_half_data were sliced from `data` in earlier cells, so they hold
# their own copy of the column and would otherwise keep the NaN values. Patch them here too;
# `assign` builds new frames, so no chained-assignment warning is triggered.
first_half_data = first_half_data.assign(SubEvent=first_half_data['SubEvent'].fillna(''))
second_half_data = second_half_data.assign(SubEvent=second_half_data['SubEvent'].fillna(''))
data

Unnamed: 0,SubEvent,Tags,Match,Event,EventSec,MatchPeriod,Team,Player,Position
0,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,1.435354,1H,Australia,A. Nabbout,
1,Simple pass,Accurate,"France - Australia, 2 - 1",Pass,318.452504,1H,Australia,A. Nabbout,
2,Ground loose ball duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,622.800480,1H,Australia,A. Nabbout,
3,Head pass,Accurate,"France - Australia, 2 - 1",Pass,1089.881235,1H,Australia,A. Nabbout,
4,Air duel,"Lost, Not accurate","France - Australia, 2 - 1",Duel,1310.158067,1H,Australia,A. Nabbout,
...,...,...,...,...,...,...,...,...,...
10838,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2888.451480,2H,Croatia,A. Kramari\u0107,
10839,Ground attacking duel,"Free space left, Won, Accurate","France - Croatia, 4 - 2",Duel,2398.932619,2H,Croatia,M. Pjaca,
10840,Simple pass,Not accurate,"France - Croatia, 4 - 2",Pass,2400.002448,2H,Croatia,M. Pjaca,
10841,Simple pass,Accurate,"France - Croatia, 4 - 2",Pass,2567.478676,2H,Croatia,M. Pjaca,


In [146]:
def sort_by_match(data):
    """Return a copy of ``data`` ordered by match label, then by event time within each match."""
    ordering = ['Match', 'EventSec']
    return data.sort_values(by=ordering)

# Order each half by match and event time; the XES writers below read the rows sequentially.
first_half_data = sort_by_match(first_half_data)
second_half_data = sort_by_match(second_half_data)

We have finished our data preparation. Time to create a structured XES file.

In [147]:
from datetime import datetime, timedelta

def create_xes_file_with_everything(data, filename="output.xes", should_filter=True):
    """Export France's possession sequences from an event table as an XES log.

    Rows are read in the order given. Every uninterrupted run of rows whose
    ``Team`` is ``'France'`` becomes one ``<trace>``. A run ends at the next
    row of another team, when the ``Match`` value changes (a possession cannot
    continue into another match), or at the end of the table.

    Each trace carries ``custom:startReason`` (the ``Event`` of the last
    opponent row seen before the run in the same match, or the text ``None``
    if there was none) and ``custom:endReason`` (the ``Event`` of the run's
    last row). Each event is written with ``concept:name`` (player),
    ``org:resource`` (position), ``time:timestamp`` and the custom attributes
    action, subevent, matchperiod, tags and match.

    Args:
        data: DataFrame with the columns ``Team``, ``Player``, ``Position``,
            ``Event``, ``EventSec``, ``SubEvent``, ``MatchPeriod``, ``Tags``
            and ``Match``; expected to be sorted by match and event time.
        filename: Path of the XES file to create or overwrite.
        should_filter: When true (the default, matching the previous
            hard-coded flag), runs consisting of a single event are skipped.

    Returns:
        None. The log is written to ``filename`` as UTF-8.
    """
    # Arbitrary anchor date; EventSec is added to it as an offset in seconds.
    base_datetime = datetime(2020, 1, 1)
    min_events = 2 if should_filter else 1

    def attr(value):
        # Escape &, <, > and double quotes so the text is safe inside a "..." XML attribute.
        return escape(str(value), {'"': '&quot;'})

    def write_trace(out, events, start_reason):
        # Single place that serialises a trace, so every trace uses the same keys.
        if len(events) < min_events:
            return
        out.write('  <trace>\n')
        out.write(f'    <string key="custom:startReason" value="{attr(start_reason)}" />\n')
        out.write(f'    <string key="custom:endReason" value="{attr(events[-1]["Event"])}" />\n')
        for event in events:
            moment = base_datetime + timedelta(seconds=float(event["EventSec"]))
            out.write('    <event>\n')
            out.write(f'      <string key="concept:name" value="{attr(event["Player"])}" />\n')
            out.write(f'      <string key="org:resource" value="{attr(event["Position"])}" />\n')
            out.write(f'      <string key="custom:action" value="{attr(event["Event"])}" />\n')
            # The XES time extension key is all lower case: time:timestamp.
            out.write(f'      <date key="time:timestamp" value="{moment.isoformat()}" />\n')
            out.write(f'      <string key="custom:subevent" value="{attr(event["SubEvent"])}" />\n')
            out.write(f'      <string key="custom:matchperiod" value="{attr(event["MatchPeriod"])}" />\n')
            out.write(f'      <string key="custom:tags" value="{attr(event["Tags"])}" />\n')
            out.write(f'      <string key="custom:match" value="{attr(event["Match"])}" />\n')
            out.write('    </event>\n')
        out.write('  </trace>\n')

    # The XML declaration promises UTF-8, so write UTF-8 regardless of the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        f.write('<log xes.version="1.0" xes.features="nested-attributes" openxes.version="1.0RC7" xmlns="http://www.xes-standard.org/">\n')
        f.write('<classifier name="Activity_Resource" keys="concept:name org:resource"/>\n')
        f.write('<classifier name="Default" keys="concept:name"/>\n')

        current_trace = []            # France's events of the possession in progress
        start_reason = None           # opponent action that preceded the possession in progress
        last_opponent_action = None   # most recent opponent Event in the current match
        current_match = object()      # sentinel that never equals a real match label

        for _, row in data.iterrows():
            if row['Match'] != current_match:
                # New match: close whatever is open and forget the previous match's context.
                write_trace(f, current_trace, start_reason)
                current_trace = []
                last_opponent_action = None
                current_match = row['Match']

            if row['Team'] == 'France':
                if not current_trace:
                    start_reason = last_opponent_action
                current_trace.append(row)
            else:
                # Any opponent event ends France's possession.
                write_trace(f, current_trace, start_reason)
                current_trace = []
                last_opponent_action = row['Event']

        # Possession still open when the table ends.
        write_trace(f, current_trace, start_reason)

        f.write('</log>\n')


In [None]:
from datetime import datetime, timedelta
import pandas as pd  # For NaN checking

def create_filtered_xes_file(data, filename="output.xes", allowed_subevents=None):
    """Export France's possessions that contain an attacking sub-event as an XES log.

    Rows are read in the order given. Every uninterrupted run of rows whose
    ``Team`` is ``'France'`` forms one candidate trace. A run ends at the next
    row of another team, when the ``Match`` value changes (a possession cannot
    continue into another match), or at the end of the table. A candidate is
    written only if at least one of its events has a ``SubEvent`` that is in
    ``allowed_subevents``.

    Each trace carries ``custom:startReason`` (the ``Event`` of the last
    opponent row seen before the run in the same match, empty if there was
    none) and ``custom:endReason`` (the ``Event`` of the run's last row).
    Missing values (None/NaN) are written as empty strings.

    Args:
        data: DataFrame with the columns ``Team``, ``Player``, ``Position``,
            ``Event``, ``EventSec``, ``SubEvent``, ``MatchPeriod``, ``Tags``
            and ``Match``; expected to be sorted by match and event time.
        filename: Path of the XES file to create or overwrite.
        allowed_subevents: Collection of sub-event names that make a trace
            worth keeping. Defaults to Shot, Free kick shot, Cross,
            Free kick cross and Corner.

    Returns:
        None. The log is written to ``filename`` as UTF-8.
    """
    if allowed_subevents is None:
        allowed_subevents = {
            "Shot",
            "Free kick shot",
            "Cross",
            "Free kick cross",
            "Corner",
        }
    allowed = set(allowed_subevents)

    # Arbitrary anchor date; EventSec is added to it as an offset in seconds.
    base_datetime = datetime(2020, 1, 1)

    def has_allowed_subevent(trace):
        # Compare the whole sub-event name: splitting it into words would make
        # multi-word names such as "Free kick shot" impossible to match.
        return any(
            isinstance(event['SubEvent'], str) and event['SubEvent'] in allowed
            for event in trace
        )

    def safe_str(value):
        # Missing scalars become "", everything else is stringified and XML-escaped
        # so it is safe inside a "..." attribute.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return escape(str(value), {'"': '&quot;'})

    def write_trace(out, events, start_reason):
        # Single place that serialises a trace, so every trace uses the same keys.
        if not events or not has_allowed_subevent(events):
            return
        out.write('  <trace>\n')
        out.write(f'    <string key="custom:startReason" value="{safe_str(start_reason)}" />\n')
        out.write(f'    <string key="custom:endReason" value="{safe_str(events[-1]["Event"])}" />\n')
        for event in events:
            moment = base_datetime + timedelta(seconds=float(event["EventSec"]))
            out.write('    <event>\n')
            out.write(f'      <string key="concept:name" value="{safe_str(event["Player"])}" />\n')
            out.write(f'      <string key="org:resource" value="{safe_str(event["Position"])}" />\n')
            out.write(f'      <string key="custom:action" value="{safe_str(event["Event"])}" />\n')
            # The XES time extension key is all lower case: time:timestamp.
            out.write(f'      <date key="time:timestamp" value="{moment.isoformat()}" />\n')
            out.write(f'      <string key="custom:subevent" value="{safe_str(event["SubEvent"])}" />\n')
            out.write(f'      <string key="custom:matchperiod" value="{safe_str(event["MatchPeriod"])}" />\n')
            out.write(f'      <string key="custom:tags" value="{safe_str(event["Tags"])}" />\n')
            out.write(f'      <string key="custom:match" value="{safe_str(event["Match"])}" />\n')
            out.write('    </event>\n')
        out.write('  </trace>\n')

    # The XML declaration promises UTF-8, so write UTF-8 regardless of the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        f.write('<log xes.version="1.0" xes.features="nested-attributes" openxes.version="1.0RC7" xmlns="http://www.xes-standard.org/">\n')
        f.write('<classifier name="Activity_Resource" keys="concept:name org:resource"/>\n')
        f.write('<classifier name="Default" keys="concept:name"/>\n')

        current_trace = []            # France's events of the possession in progress
        start_reason = None           # opponent action that preceded the possession in progress
        last_opponent_action = None   # most recent opponent Event in the current match
        current_match = object()      # sentinel that never equals a real match label

        for _, row in data.iterrows():
            if row['Match'] != current_match:
                # New match: close whatever is open and forget the previous match's context.
                write_trace(f, current_trace, start_reason)
                current_trace = []
                last_opponent_action = None
                current_match = row['Match']

            if row['Team'] == 'France':
                if not current_trace:
                    start_reason = last_opponent_action
                current_trace.append(row)
            else:
                # Any opponent event ends France's possession.
                write_trace(f, current_trace, start_reason)
                current_trace = []
                last_opponent_action = row['Event']

        # Possession still open when the table ends.
        write_trace(f, current_trace, start_reason)

        f.write('</log>\n')

In [149]:
# Export one filtered XES log per match half.
# NOTE(review): the cell defining create_filtered_xes_file has no execution count while this
# cell does, so this call may have run against an older definition — re-run the definition
# cell (or Restart & Run All) before trusting the recorded output of this cell.
create_filtered_xes_file(first_half_data, "first_half_filtered.xes")
create_filtered_xes_file(second_half_data, "second_half_filtered.xes")

AttributeError: 'float' object has no attribute 'split'