# Code for adding xT to the possession chains and calculating xT values in the two-minute sliding time windows used to represent the flow of momentum

In [None]:
#import the functions from the ball-event dataframe and the possession chains
import import_ipynb
import Event_Dataframe
from Event_Dataframe import * 

In [None]:
#load xT grid. calculated using code from the GitHub page of KU Leuven. https://github.com/ML-KULeuven/socceraction/blob/master/socceraction/xthreat.py
#the csv is read without a header row, so every line of the file becomes one row of grid values
#NOTE(review): pd is not imported in this cell; presumably it is provided by the star import from Event_Dataframe - confirm
xT_grid = pd.read_csv('xT.csv', header = None)

In [None]:
#Definition for finding the xT at a specific location
import numpy as np
def xT_at_location(location_xy, xT_grid, field_dimensions):
    """Return the xT value of the grid cell that contains an event location.

    Parameters
    ----------
    location_xy : sequence of two numbers
        (x, y) of the event. The conversion below treats both values as
        percentages (0-100) of the pitch length and the pitch width.
    xT_grid : pandas.DataFrame
        Grid of xT values. Rows are indexed by the y position, columns by the
        x position.
    field_dimensions : sequence of two numbers
        (length, width) of the pitch in the units of the tracking data.

    Returns
    -------
    float
        0.0 when the location lies off the pitch, otherwise the value of the
        grid cell the location falls in.
    """
    length = field_dimensions[0]
    width = field_dimensions[1]
    x, y = location_xy
    #rescale the 0-100 coordinates to pitch units with the origin on the centre spot
    #(for a 10500 x 6800 pitch this is exactly x * 105 - 5250 and y * 68 - 3400)
    x = x * (length / 100.) - length / 2.
    y = y * (width / 100.) - width / 2.
    if abs(x) > length / 2. or abs(y) > width / 2.:
        return 0.0 # Position is off the field, xT is zero
    n_rows, n_cols = xT_grid.shape
    cell_length = length / float(n_cols)
    cell_width = width / float(n_rows)
    #the small offset keeps a location exactly on the far touchline or goal line inside the last cell
    col = int((x + length / 2. - 0.0001) / cell_length)
    row = int((y + width / 2. - 0.0001) / cell_width)
    #purely positional lookup, independent of the row and column labels of the grid
    return xT_grid.iloc[row, col]
    

In [None]:
#function that returns the added xT of an action in a possession chain
def calculate_added_xT( half, pass_in_chain, chain_number, df_all_possessions, df_possession_chains, xT_grid):
    """Return the xT gained (or lost) by one action of a possession chain.

    Parameters
    ----------
    half : unused; kept so existing callers keep working.
    pass_in_chain : position of the action inside the chain.
    chain_number : label of the chain in df_possession_chains.
    df_all_possessions : event dataframe with the columns 'x_origin',
        'y_origin', 'x_destination', 'y_destination' and 'type_id'.
    df_possession_chains : chain dataframe whose 'Indices' column holds, per
        chain, the labels of its events in df_all_possessions.
    xT_grid : grid handed on to xT_at_location.

    Returns
    -------
    xT at the target location minus xT at the start location, both looked up
    on a 10500 x 6800 pitch.
    """
    pitch = (10500, 6800)
    #label of this action in the event dataframe
    event_idx = df_possession_chains['Indices'][chain_number][pass_in_chain]

    #where the action starts
    origin = np.array([
        float(df_all_possessions['x_origin'][event_idx]),
        float(df_all_possessions['y_origin'][event_idx]),
    ])

    #where the action ends
    if df_all_possessions['type_id'][event_idx] == '1':
        #type '1' rows carry their own destination
        target = np.array([
            float(df_all_possessions['x_destination'][event_idx]),
            float(df_all_possessions['y_destination'][event_idx]),
        ])
    else:
        #every other type ends where the event with the next label starts
        following_idx = event_idx + 1
        target = np.array([
            float(df_all_possessions['x_origin'][following_idx]),
            float(df_all_possessions['y_origin'][following_idx]),
        ])

    xT_start = xT_at_location(origin, xT_grid, field_dimensions = pitch)
    xT_target = xT_at_location(target, xT_grid, field_dimensions = pitch)
    return xT_target - xT_start

In [None]:
#Calculate added xT for all possession chains in the first half
#Only for successful passes, clearances and dribbles
#No calculations are made for goals and shots, since we want to know what effect key moments have on the flow of momentum.
#Including the key moments themselves in the xT values would therefore make no sense
def calculate_xT_first_half(df_fh, possession_fh, xT_grid):
    """Add the per-action and per-chain xT columns to the first-half chains.

    Parameters
    ----------
    df_fh : chain dataframe with a 'Type_ids' column (one list of type ids per
        chain); it is looked up with the labels 0 .. len(df_fh) - 1.
    possession_fh : event dataframe handed on to calculate_added_xT.
    xT_grid : grid handed on to calculate_added_xT.

    Returns
    -------
    df_fh itself, extended in place with
    'xT'       - list with the added xT of every action in the chain
    'Added xT' - sum of that list
    """
    #only these type ids get an xT value, every other action counts as 0
    valued_types = ('1', '3', '12')

    per_chain = []
    for chain in range(len(df_fh)):
        type_ids = df_fh['Type_ids'][chain]
        per_chain.append([
            calculate_added_xT(1, position, chain, possession_fh, df_fh, xT_grid)
            if type_ids[position] in valued_types
            else 0.
            for position in range(len(type_ids))
        ])

    #add the new columns to the dataframe
    df_fh['xT'] = per_chain
    df_fh['Added xT'] = [sum(action_values) for action_values in per_chain]
    return df_fh

In [None]:
#Same as the previous calculations but for the second half
def calculate_xT_second_half(df_sh, possession_sh, xT_grid):
    """Add the per-action and per-chain xT columns to the second-half chains.

    Parameters
    ----------
    df_sh : chain dataframe with a 'Type_ids' column (one list of type ids per
        chain); it is looked up with the labels 0 .. len(df_sh) - 1.
    possession_sh : event dataframe handed on to calculate_added_xT.
    xT_grid : grid handed on to calculate_added_xT.

    Returns
    -------
    df_sh itself, extended in place with
    'xT'       - list with the added xT of every action in the chain
    'Added xT' - sum of that list
    """
    #only these type ids get an xT value, every other action counts as 0
    valued_types = ('1', '3', '12')

    xT_whole_chain = []
    for chain in range(len(df_sh)):
        type_ids = df_sh['Type_ids'][chain]
        xT_actions = []
        for position, type_id in enumerate(type_ids):
            if type_id in valued_types:
                #this is the second half, so pass 2 as the half
                xT_actions.append(calculate_added_xT(2, position, chain, possession_sh, df_sh, xT_grid))
            else:
                xT_actions.append(0.)
        xT_whole_chain.append(xT_actions)

    #add the new columns to the dataframe
    df_sh['xT'] = xT_whole_chain
    df_sh['Added xT'] = [sum(xT_actions) for xT_actions in xT_whole_chain]
    return df_sh

In [None]:
#import relevant packages
import pandas as pd
import numpy as np
import scipy.signal as signal
import pandas as pd

In [None]:
#function for checking whether a red card is given in a match
def check_red_card(f24_file):
    """Tell whether an f24 event file contains a sending-off qualifier.

    The file is parsed as XML and walked three levels deep below the root
    (game -> event -> qualifier).

    Parameters
    ----------
    f24_file : path or file object of the f24 XML file.

    Returns
    -------
    The string 'True' as soon as a qualifier with qualifier_id "32" or "33" is
    found, otherwise None. Callers compare the result with the string 'True',
    so this contract is kept as it is.
    """
    #imported here because this file never binds the name `et` itself
    import xml.etree.ElementTree as et

    #NOTE(review): presumably the second-yellow (32) and straight-red (33) qualifiers - confirm against the f24 specification
    sending_off_ids = ("32", "33")

    game_file = et.ElementTree(file = f24_file).getroot()
    for game in game_file:
        #Iterate through each event
        for event in game:
            for qualifier in event:
                if qualifier.attrib.get("qualifier_id") in sending_off_ids:
                    return 'True'
    return None

In [None]:
#creating a file with the possession chains and added xT for all matches
from progressbar import ProgressBar
import os
import xml.etree.ElementTree as ET
import pickle
final_df = pd.DataFrame()
path = 'C:\\Users\\s153035\\Dropbox\\Lars Toonen\\Data'

#seasons in which the f24 and metadata files sit directly in the match folder
FLAT_SEASONS = ('Seizoen 1819', 'Seizoen 2021')
#folder Seizoen 1920 was structured differently: the metadata file sits in a '<match>_TracDAT' sub-folder
NESTED_SEASONS = ('Seizoen 1920',)


def _find_match_files(match_dir, match_name, nested):
    """Return (event_file, metadata_file) for one match folder; a file that is not found is None."""
    event_file = None
    metadata_file = None
    if nested:
        for entry in os.listdir(match_dir):
            if 'f24' in entry:
                event_file = os.path.join(match_dir, entry)
        tracking_dir = os.path.join(match_dir, str(match_name) + '_TracDAT')
        if os.path.isdir(tracking_dir):
            for entry in os.listdir(tracking_dir):
                if entry.endswith('metadata.xml'):
                    metadata_file = os.path.join(tracking_dir, entry)
    else:
        for entry in os.listdir(match_dir):
            if entry.endswith('.dat'):
                #tracking data, not needed for the xT calculation
                continue
            if entry.endswith('metadata.xml'):
                metadata_file = os.path.join(match_dir, entry)
            elif 'f24' in entry:
                event_file = os.path.join(match_dir, entry)
    return event_file, metadata_file


def _match_xT_dataframe(event_file, metadata_file, match_name, xT_grid):
    """Build the possession chains of both halves of one match, including the xT columns."""
    final_table = Event_Dataframe.create_event_dataframe(event_file, metadata_file)
    df_fh, df_sh, possession_fh, possession_sh = Event_Dataframe.create_first_and_second_half_possession_chains(final_table)
    df_fh = calculate_xT_first_half(df_fh, possession_fh, xT_grid)
    df_sh = calculate_xT_second_half(df_sh, possession_sh, xT_grid)
    df_fh['Half'] = 1
    df_sh['Half'] = 2
    df_xT = pd.concat([df_fh, df_sh])
    df_xT['Match'] = match_name
    return df_xT


#the grid is the same for every match, so it is read once instead of once per match
xT_grid = pd.read_csv('xT.csv', header = None)

#iterate over all files in the data
for folder1 in os.listdir(path):
    print(folder1)
    if folder1 not in FLAT_SEASONS and folder1 not in NESTED_SEASONS:
        #not a season folder: nothing to process and no pickle to write
        continue
    nested = folder1 in NESTED_SEASONS
    season_dir = os.path.join(path, str(folder1))
    season_frames = []
    for folder2 in os.listdir(season_dir):
        round_dir = os.path.join(season_dir, str(folder2))
        pbar = ProgressBar()
        for filename in pbar(os.listdir(round_dir)):
            match = filename
            match_dir = os.path.join(round_dir, str(filename))
            if not os.path.isdir(match_dir):
                continue
            #the paths are looked up again for every match, so a missing file can never be
            #replaced silently by the file of the previous match
            event_file, metadata_file = _find_match_files(match_dir, filename, nested)
            if event_file is None or metadata_file is None:
                print('Skipping ' + str(match) + ': f24 or metadata file not found')
                continue
            #matches with a red card are left out
            if check_red_card(event_file) == 'True':
                continue
            #create dataframe for each match and add to the complete dataframe
            try:
                season_frames.append(_match_xT_dataframe(event_file, metadata_file, match, xT_grid))
            except Exception as error:
                #best effort: a match that cannot be processed is skipped, but no longer silently
                print('Skipping ' + str(match) + ': ' + repr(error))

    #create pickle file for the complete dataframe per season
    final_df = pd.concat(season_frames) if season_frames else pd.DataFrame()
    with open('all_event_xT' + str(folder1) + '.p', 'wb') as outfile:
        pickle.dump(final_df, outfile)
                        


In [None]:
#Sum the added xT per half minute for all the matches
import pickle
from progressbar import ProgressBar
import pandas as pd


def _half_minute_xT(chains, n_windows, first_minute):
    """Sum the xT of all actions of one half into half-minute windows.

    Window k covers minute first_minute + k // 2; even k holds the seconds
    0-29 and odd k the seconds 30-59. An action of the home team adds its xT to
    the home total and subtracts it from the away total, an action of the away
    team does the opposite. Actions outside the n_windows windows are ignored.

    Returns (home_totals, away_totals), two lists of length n_windows.
    """
    home_totals = [0] * n_windows
    away_totals = [0] * n_windows
    for team, chain_minutes, chain_seconds, chain_xT in zip(chains['Team'], chains['Minutes'], chains['Second'], chains['xT']):
        for x in range(len(chain_minutes)):
            window = 2 * (int(chain_minutes[x]) - first_minute)
            if int(chain_seconds[x]) >= 30:
                window += 1
            if not 0 <= window < n_windows:
                continue
            if team == 'Home':
                home_totals[window] += chain_xT[x]
                away_totals[window] += -chain_xT[x]
            else:
                home_totals[window] += -chain_xT[x]
                away_totals[window] += chain_xT[x]
    return home_totals, away_totals


#read the files per season
files = ['all_event_xTSeizoen 1819.p', 'all_event_xTSeizoen 1920.p', 'all_event_xTSeizoen 2021.p']
match_frames = []
for file in files:
    #the pickles are produced by the previous cell; never load pickles from an untrusted source
    with open(file, 'rb') as infile:
        new_dict = pickle.load(infile)
    matches = new_dict.Match.unique()
    pbar = ProgressBar()
    #iterate over each match in the data
    for game in pbar(matches):
        match = new_dict.loc[new_dict['Match'] == game]
        #create first and second half dataframes
        first_half = match.loc[match['Half'] == 1]
        second_half = match.loc[match['Half'] == 2]
        #find the final minutes of both halves (compared as numbers, also when stored as text)
        final_minute_fh = max(int(minute) for minute in first_half.iloc[-1]['Minutes'])
        final_minute_sh = max(int(minute) for minute in second_half.iloc[-1]['Minutes'])

        #two windows per minute, up to and including the first window of the final minute
        #NOTE(review): the window for seconds 30-59 of the final minute is not produced - confirm that this is intended
        n_windows_fh = max(0, 2 * final_minute_fh + 1)
        #the minutes of the second half are absolute (they start at 45), so the offset has to be
        #removed before counting the windows
        n_windows_sh = max(0, 2 * (final_minute_sh - 45) + 1)

        home_fh, away_fh = _half_minute_xT(first_half, n_windows_fh, 0)
        home_sh, away_sh = _half_minute_xT(second_half, n_windows_sh, 45)

        #create dataframe for the added xt of both teams in each half minute of the match
        data = {'Match': [game] * (n_windows_fh + n_windows_sh),
                'Half': [1] * n_windows_fh + [2] * n_windows_sh,
                'Minute': [i/2 for i in range(n_windows_fh)] + [45 + i/2 for i in range(n_windows_sh)],
                'Added_xT_home_team': home_fh + home_sh,
                'Added_xT_away_team': away_fh + away_sh}
        match_frames.append(pd.DataFrame(data))

final_df = pd.concat(match_frames) if match_frames else pd.DataFrame()

#Create a pickle file for the added xt per half minute for the home and away team for all matches
with open('xT per minute (home and away) for all matches.p', 'wb') as outfile:
    pickle.dump(final_df, outfile)
        


In [None]:
#Create two-minute sliding windows for xt used for representing the flow of momentum
#import packages
import pickle
import pandas as pd
from progressbar import ProgressBar

#a two-minute window consists of four half-minute rows
window_rows = 4

with open('xT per minute (home and away) for all matches.p','rb') as infile:
    new_dict = pickle.load(infile)
matches = new_dict.Match.unique()
match_frames = []
pbar = ProgressBar()
#iterate over each match
for game in pbar(matches):
    #copy the rows of the match so the new columns are added to an independent dataframe
    #and not to a slice of new_dict
    match = new_dict.loc[new_dict['Match'] == game].copy()
    xT_prev_2min_home = []
    xT_prev_2min_away = []
    #for each row: average the (at most) four rows directly before it.
    #the first row has no preceding rows, so its window is empty and its mean is NaN
    for i in range(len(match)):
        df = match.iloc[max(0, i - window_rows):i]
        xT_prev_2min_home.append(df['Added_xT_home_team'].mean())
        xT_prev_2min_away.append(df['Added_xT_away_team'].mean())

    match['xT_prev_2min_home'] = xT_prev_2min_home
    match['xT_prev_2min_away'] = xT_prev_2min_away
    match_frames.append(match)

final_df = pd.concat(match_frames) if match_frames else pd.DataFrame()

#create final pickle file with the xT files for all sliding windows
with open('all matches sliding windows xT.p', 'wb') as outfile:
    pickle.dump(final_df, outfile)