In [1]:
#AUTHOR: TJAN ENG GER, KEVIN
#LAST UPDATED: 21st OCT 2023
import pandas as pd
#pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from joblib import Parallel, delayed
from dateutil.relativedelta import relativedelta
import warnings
warnings.simplefilter("ignore")

################################################################################################################################################################
# METHODS BELOW ARE FOR GENERATING PCSP
################################################################################################################################################################

# generate pcsp file
def generate_pcsp(params, date, ply1_name, ply2_name, hand1, hand2):
    '''
    Generates the PCSP file.

    The output is the concatenation of three parts, in this order:
    the contents of 'var.txt', one '#define p<i> <value>;' line per entry of
    <params>, and the contents of '<hand1>_<hand2>.txt'.

    Parameters
    ----------
    params: Python List
        List of parameters to define in the PCSP file. Each entry is written
        with the '%d' format, i.e. as an integer.

    date: string
        Date of the match to simulate with, using the PCSP file

    ply1_name: string
        Player 1's name

    ply2_name: string
        Player 2's name

    hand1: string
        Player 1's handness (LH - Left handed, RH - Right handed)

    hand2: string
        Player 2's handness (LH - Left handed, RH - Right handed)

    Returns
    -------
    Nothing. The file is written to
    'pcsp2/<hand1>_<hand2>_<date>_<ply1_name>_<ply2_name>.pcsp', relative to the
    current working directory (spaces in the names are replaced by '-').
    The 'pcsp2' directory is created when it does not exist yet.

    Notes
    -----
    The parameters here are in truth, the exact counts for each event happening under specific circumstances.
    These counts will be normalised into probabilities when passed into PAT.
    '''
    import os  # local import so the block does not depend on a file-level import

    var_path = 'var.txt'
    hand_path = '%s_%s.txt' % (hand1, hand2)
    file_name = 'pcsp2/%s_%s_' % (hand1, hand2)
    file_name += '%s_%s_%s.pcsp' % (date, ply1_name.replace(' ', '-'), ply2_name.replace(' ', '-'))

    # Read both templates before touching the output location, so a missing
    # template does not leave an empty directory or file behind.
    with open(var_path) as f:
        header_lines = f.readlines()
    define_lines = ['#define p%d %d;\n' % (i, p) for i, p in enumerate(params)]
    with open(hand_path) as f:
        model_lines = f.readlines()

    # exist_ok=True keeps this safe when several workers create the directory at once.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'w') as f:
        f.writelines(header_lines + define_lines + model_lines)


def _split_by_previous_point(frame):
    '''Returns [frame, its rows with ply1_previous_point True, its rows with ply1_previous_point False].'''
    return [frame,
            frame.query('ply1_previous_point==True'),
            frame.query('ply1_previous_point==False')]


def _win_and_error_counts(frame):
    '''Returns [rows with shot_outcome in (1, 5, 6), rows with shot_outcome in (2, 3, 4)].'''
    return [len(frame.query('shot_outcome in [1, 5, 6]')),
            len(frame.query('shot_outcome in [2, 3, 4]'))]


def get_params(df, hand):
    '''
    Obtains a list of parameters to define for the PCSP file.
    This is a list of parameters with respect to player 1.

    Parameters
    ----------
    df: pandas.DataFrame
        The data to obtain parameters from. The columns read are shot_type,
        from_which_court, prev_shot_from_which_court, shot, direction,
        to_which_court, shot_outcome and ply1_previous_point.

    hand: string
        Player's handness ('RH' selects the right-handed direction tables,
        any other value selects the left-handed ones)

    Returns
    -------
    A list of 33 lists of integer counts (198 integers once flattened).

    Notes
    -----
    Every situation below contributes three consecutive lists: all rows, the
    rows where ply1_previous_point is True ("_w") and the rows where it is
    False ("_l"). Each list ends with two totals for the whole situation:
    rows with shot_outcome in (1, 5, 6), then rows with shot_outcome in (2, 3, 4).
    The leading entries count rows with shot_outcome 7, split by direction.

    Serves (4 situations x 3 lists x 5 integers = 60):
    De_Serve, De_Serve_2nd, Ad_Serve, Ad_Serve_2nd.
    The three direction counts are for direction 6, 5 and 4 (T, B, W).

    Returns (4 situations x 3 lists = 66 integers):
    De_ForeHandR, Ad_ForeHandR, De_BackHandR, Ad_BackHandR.
    A situation has either 3 or 4 direction counts (5 or 6 integers per list);
    which situations have 4 depends on <hand>:
    RH -> list lengths 5, 6, 6, 5;  LH -> list lengths 6, 5, 5, 6.

    Strokes (3 situations x 3 lists x 8 integers = 72):
    De_Stroke, Mid_Stroke, Ad_Stroke.
    Three forehand (shot <= 20) direction counts, then three backhand
    (20 < shot <= 40) direction counts.
    '''
    by_shot_type = {code: df[df['shot_type'] == code] for code in (1, 2, 3, 4)}
    results = []

    # ---- Serve: shot_type 1 is the first serve, 2 the second serve ----
    serve_situations = [
        by_shot_type[1].query('from_which_court==1'),  # De_Serve
        by_shot_type[2].query('from_which_court==1'),  # De_Serve_2nd
        by_shot_type[1].query('from_which_court==3'),  # Ad_Serve
        by_shot_type[2].query('from_which_court==3'),  # Ad_Serve_2nd
    ]
    serve_directions = (6, 5, 4)  # T, B, W
    for situation in serve_situations:
        for serve in _split_by_previous_point(situation):
            serve_in = []
            for serve_dir in serve_directions:
                serve_in.append(len(serve.query('direction==@serve_dir and shot_outcome==7')))
            results.append(serve_in + _win_and_error_counts(serve))

    # ---- Return: shot_type 3; shot <= 20 is a forehand, 20 < shot <= 40 a backhand ----
    returns = by_shot_type[3]
    return_situations = [
        returns.query('prev_shot_from_which_court==1 and shot<=20'),               # De_ForeHandR
        returns.query('prev_shot_from_which_court==3 and shot<=20'),               # Ad_ForeHandR
        returns.query('prev_shot_from_which_court==1 and shot<=40 and shot>20'),   # De_BackHandR
        returns.query('prev_shot_from_which_court==3 and shot<=40 and shot>20'),   # Ad_BackHandR
    ]
    # One entry per situation; each route is (allowed from_which_court, allowed to_which_court).
    if hand == 'RH':  # RH
        return_routes = [
            [([1], [1]), ([1], [3]), ([1], [2])],                    # FH_[CC, DL, DM]
            [([2, 3], [3]), ([3], [1]), ([2], [1]), ([2, 3], [2])],  # FH_[IO, II, CC, DM]
            [([2], [3]), ([1], [3]), ([1, 2], [1]), ([1, 2], [2])],  # BH_[CC, II, IO, DM]
            [([3], [3]), ([3], [1]), ([3], [2])],                    # BH_[CC, DL, DM]
        ]
    else:  # LH
        return_routes = [
            [([1, 2], [1]), ([1], [3]), ([2], [3]), ([1, 2], [2])],  # FH_[IO, II, CC, DM]
            [([3], [3]), ([3], [1]), ([3], [2])],                    # FH_[CC, DL, DM]
            [([1], [1]), ([1], [3]), ([1], [2])],                    # BH_[CC, DL, DM]
            [([2], [1]), ([3], [1]), ([2, 3], [3]), ([2, 3], [2])],  # BH_[CC, II, IO, DM]
        ]
    for situation, routes in zip(return_situations, return_routes):
        for ret in _split_by_previous_point(situation):
            return_in = []
            for src, dst in routes:
                return_in.append(len(ret.query(
                    'from_which_court in @src and to_which_court in @dst and shot_outcome==7')))
            results.append(return_in + _win_and_error_counts(ret))

    # ---- Rally: shot_type 4 ----
    strokes = by_shot_type[4]
    stroke_situations = [
        strokes.query('from_which_court==1'),  # De_Stroke
        strokes.query('from_which_court==2'),  # Mid_Stroke
        strokes.query('from_which_court==3'),  # Ad_Stroke
    ]
    # One entry per situation: (forehand to_which_court order, backhand to_which_court order).
    if hand == 'RH':  # RH
        stroke_targets = [
            ([1, 3, 2], [3, 1, 2]),  # de - FHCC, FHDL, FHDM, BHII, BHIO, BHDM
            ([3, 1, 2], [1, 3, 2]),  # mid - FHIO, FHCC, FHDM, BHIO, BHCC, BHDM
            ([3, 1, 2], [3, 1, 2]),  # ad - FHIO, FHII, FHDM, BHCC, BHDL, BHDM
        ]
    else:  # LH
        stroke_targets = [
            ([1, 3, 2], [1, 3, 2]),  # de - FHIO, FHII, FHDM, BHCC, BHDL, BHDM
            ([1, 3, 2], [3, 1, 2]),  # mid - FHIO, FHCC, FHDM, BHIO, BHCC, BHDM
            ([3, 1, 2], [1, 3, 2]),  # ad - FHCC, FHDL, FHDM, BHII, BHIO, BHDM
        ]
    for situation, (fh_targets, bh_targets) in zip(stroke_situations, stroke_targets):
        for stroke in _split_by_previous_point(situation):
            forehands = stroke.query('shot<=20')
            backhands = stroke.query('shot<=40 and shot>20')
            stroke_in = []
            for side, targets in ((forehands, fh_targets), (backhands, bh_targets)):
                for target in targets:
                    stroke_in.append(len(side.query('to_which_court==@target and shot_outcome==7')))
            results.append(stroke_in + _win_and_error_counts(stroke))

    return results


def generate_transition_probs(data, date, ply1_name, ply2_name, ply1_hand, ply2_hand, i=0):
    '''
    Generates the list of event counts for each player's responses, and then generates a PCSP file using them.

    Parameters
    ----------
    data: pandas.DataFrame
        The data to generate the PCSP file with.

    date: string
        Date of the match. Only rows dated within the two years before this
        date (exclusive of the date itself) are analysed.

    ply1_name: string
        Player 1's name.

    ply2_name: string
        Player 2's name.

    ply1_hand: string
        Player 1's handness (LH - Left handed, RH - Right handed)

    ply2_hand: string
        Player 2's handness (LH - Left handed, RH - Right handed)

    i: int
        Identifier of the request; only used in the progress messages.

    Returns
    -------
    Nothing. On success the PCSP file is written by generate_pcsp.
    No file is written when either player has 10 or fewer opponents of the
    relevant handedness in <data>, or when either player has no usable rows
    inside the two-year window.
    '''
    prev_date = (pd.to_datetime(date) - relativedelta(years=2)).strftime('%Y-%m-%d')

    # Each player is analysed against every opponent that shares the handedness
    # of the player they are about to face.
    p1_opponents = all_matches(data, ply1_name, ply2_hand)
    p2_opponents = all_matches(data, ply2_name, ply1_hand)

    # number of opponents (counted over all of <data>, not only the two-year window)
    num_ply1_prev_n = len(p1_opponents)
    num_ply2_prev_n = len(p2_opponents)
    if num_ply1_prev_n <= 10 or num_ply2_prev_n <= 10:
        print("Not enough data to generate PCSP file! Rejecting file", i)
        return

    data_ply1 = data.query('date>=@prev_date and date<@date and (ply1_name==@ply1_name or ply2_name==@ply1_name)')
    data_ply2 = data.query('date>=@prev_date and date<@date and (ply1_name==@ply2_name or ply2_name==@ply2_name)')

    ply1_params = analyse_player_behaviour(data_ply1, ply1_name, ply1_hand, p1_opponents)
    print(ply1_name, "done, File", i)
    ply2_params = analyse_player_behaviour(data_ply2, ply2_name, ply2_hand, p2_opponents)
    print(ply2_name, "done, File", i)

    # The opponent lists come from the whole dataset, so a player can pass the
    # threshold above and still have no rows inside the window. In that case
    # analyse_player_behaviour returns an empty list; writing the file anyway
    # would leave '#define p<i>' entries missing and shift the other player's indices.
    if not ply1_params or not ply2_params:
        print("Not enough data to generate PCSP file! Rejecting file", i)
        return

    params = ply1_params + ply2_params
    print("File", i, '# P1 matches:', num_ply1_prev_n, '# P2 matches:', num_ply2_prev_n)

    generate_pcsp(params, date, ply1_name, ply2_name, ply1_hand, ply2_hand)

################################################################################################################################################################
# METHODS BELOW ARE FOR ANALYSING THE DATA
################################################################################################################################################################

def filter_for_two_players(df, p1name, p2name):
    '''
    Keeps only the rows whose ply1_name / ply2_name pair is exactly these two players, in either order.

    Parameters
    ----------
    df: pd.DataFrame
        The dataset. Must have 'ply1_name' and 'ply2_name' columns.

    p1name: string
        Player 1's name

    p2name: string
        Player 2's name

    Returns
    -------
    The matching rows of <df>, with their original index labels.
    '''
    first_listed = df['ply1_name']
    second_listed = df['ply2_name']
    listed_in_order = (first_listed == p1name) & (second_listed == p2name)
    listed_swapped = (second_listed == p1name) & (first_listed == p2name)
    return df.loc[listed_in_order | listed_swapped]
            
def standardize_p1_p2(df, p1name, p2name):
    '''
    Marks, for every row, whether the players are listed the "wrong" way round.
    WARNING: THIS SHOULD BE ONLY USED AS A TEMPORARY MEASURE IN TRYING TO ANALYSE RELATIONSHIPS BETWEEN ROUNDS. DO NOT USE OUTSIDE OF THAT PURPOSE.

    No names or scores are swapped; the rows are only flagged.

    Parameters
    ----------
    df: pd.DataFrame
        Regular match data. It is not modified.

    p1name: string
        Player 1's name

    p2name: string
        Player 2's name

    Returns
    -------
    A deep copy of <df> with an extra column 'Is incorrect orientation':
    False where ply1_name is <p1name>, True where ply1_name is <p2name>.
    Rows whose ply1_name is neither name are not assigned a value.
    '''
    flagged = df.copy(deep=True)
    first_listed = df['ply1_name']
    # Order matters when both names are equal: the later assignment (True) wins.
    for name, is_swapped in ((p1name, False), (p2name, True)):
        flagged.loc[first_listed == name, 'Is incorrect orientation'] = is_swapped
    return flagged
    
def did_players_win(df):
    '''
    Identifies, for each row after the first, who just won a point/game/set
    by comparing the score on that row with the score on the row before it.

    Parameters
    ----------
    df: pd.DataFrame
        Match rows in playing order. The columns read are
        'Is incorrect orientation' (truthy when the row lists player 2 in the
        ply1_* columns), ply1_points, ply2_points, ply1_games, ply2_games,
        ply1_sets, ply2_sets and url.

    Returns
    -------
    A list with len(df) - 1 entries (empty for fewer than two rows); entry k
    describes the change between row k and row k + 1:
    [0, 0] - No change detected, or the two rows belong to different matches
    [1, 0] - Player 1 won last point/set/game
    [0, 1] - Player 2 won last point/set/game

    Notes
    -----
    A score counts as "won" when it is exactly 1 higher than on the previous
    row. Points are checked first, then games, then sets; within a level
    player 1 is checked before player 2.
    Rows with different 'url' values are treated as different matches and are
    never compared with each other (two missing urls count as the same match).
    '''
    def _same_match(earlier, later):
        before, after = earlier['url'], later['url']
        return before == after or (pd.isna(before) and pd.isna(after))

    def _score_prefixes(row):
        # Column prefixes holding (player 1's scores, player 2's scores) on this row.
        if row['Is incorrect orientation']:
            return 'ply2', 'ply1'
        return 'ply1', 'ply2'

    def _winner(earlier, later):
        if not _same_match(earlier, later):
            # Scores of two separate matches say nothing about who won a point.
            return [0, 0]
        p1_now, p2_now = _score_prefixes(later)
        p1_before, p2_before = _score_prefixes(earlier)
        for level in ('points', 'games', 'sets'):
            if later['%s_%s' % (p1_now, level)] - earlier['%s_%s' % (p1_before, level)] == 1:
                return [1, 0]
            if later['%s_%s' % (p2_now, level)] - earlier['%s_%s' % (p2_before, level)] == 1:
                return [0, 1]
        return [0, 0]

    win_tracker = []
    prev_row = None
    for _, row in df.iterrows():
        if prev_row is not None:
            win_tracker.append(_winner(prev_row, row))
        prev_row = row
    return win_tracker
    
def search_players(df, p1name, p2name):
    '''
    Searches the dataframe <df> to look for all matchups of <p1name> vs <p2name>.
    Also adds 2 boolean columns to track who among the 2 of them won the previous point.

    Parameters
    ----------
    df: pd.DataFrame
        The potentially imbalanced dataset.

    p1name: string
        Player 1's name

    p2name: string
        Player 2's name

    Returns
    -------
    An empty dataframe when <df> holds no row for this pair of players.
    Otherwise a copy of the rows of <p1name> vs <p2name> (either listing order)
    with two extra boolean columns:
    'ply1_previous_point' - True if the player in this row's ply1_name column won the last decided point
    'ply2_previous_point' - True if the player in this row's ply2_name column won the last decided point
    Both are False until a first winner is known.

    Notes
    -----
    The last known winner is carried forward over rows where the score did not
    change. It is forgotten whenever the 'url' value changes, i.e. when a new
    match starts, so the outcome of one match never labels rows of the next.
    '''
    df_vs = filter_for_two_players(df, p1name, p2name).copy(deep=True)
    if df_vs.empty:
        # The opponent list may come from outside the period covered by <df>,
        # so finding nothing is a normal outcome that the caller skips.
        return pd.DataFrame()

    df_vs_bools = standardize_p1_p2(df_vs, p1name, p2name)
    changes = did_players_win(df_vs_bools)
    urls = df_vs_bools['url'].to_list()

    # 0 = unknown, 1 = <p1name> won the last decided point, 2 = <p2name> did.
    last_winner = 0
    last_winner_per_row = [0]  # nothing is known before the first row
    for position, change in enumerate(changes, start=1):
        before, after = urls[position - 1], urls[position]
        same_match = before == after or (pd.isna(before) and pd.isna(after))
        if not same_match:
            last_winner = 0
        elif change == [1, 0]:
            last_winner = 1
        elif change == [0, 1]:
            last_winner = 2
        last_winner_per_row.append(last_winner)

    # Translate "<p1name> / <p2name> won" back to the row's own ply1 / ply2 columns.
    swapped_per_row = df_vs_bools['Is incorrect orientation'].to_list()
    ply1_previous_point = []
    ply2_previous_point = []
    for is_swapped, winner in zip(swapped_per_row, last_winner_per_row):
        if is_swapped:
            ply1_previous_point.append(winner == 2)
            ply2_previous_point.append(winner == 1)
        else:
            ply1_previous_point.append(winner == 1)
            ply2_previous_point.append(winner == 2)

    df_vs['ply1_previous_point'] = ply1_previous_point
    df_vs['ply2_previous_point'] = ply2_previous_point
    return df_vs
    
def response_probabilities_of_one_player(player_name, shot_type, df):
    '''
    Computes how often <player_name> sends a rally shot to each court area,
    overall and split by whether player 1 won the previous point.

    Only rows where 'ply1_name' is <player_name>, 'From which court' equals
    <shot_type> and 'Shot Type' equals 4 are considered.

    Parameters
    ----------
    player_name: string
        The name of the player to analyse.

    shot_type: int
        The 'From which court' value to select.

    df: pd.DataFrame
        The matchup data. Must have the columns 'ply1_name', 'From which court',
        'Shot Type', 'To which court' and 'Did player 1 win previous point'.

    Returns
    -------
    Two pandas Series, both expressed as a fraction of all selected rows.
    The first is indexed by 'To which court'.
    The second is indexed by ('Did player 1 win previous point', 'To which court').
    '''
    by_player = df['ply1_name'] == player_name
    from_requested_court = df['From which court'] == shot_type
    is_rally_shot = df['Shot Type'] == 4
    relevant = df.loc[by_player & from_requested_court & is_rally_shot]
    total = len(relevant)

    rating_probs = relevant.groupby('To which court').size().div(total)
    split_keys = ['Did player 1 win previous point', 'To which court']
    rating_probs_w_win_loss = relevant.groupby(split_keys).size().div(total)
    return rating_probs, rating_probs_w_win_loss

def all_matches(df, player_name, opp_hand):
    '''
    Searches df for all rows involving player_name against an opponent of the given handedness.

    Parameters
    ----------
    df: pd.DataFrame
        The dataset. Must have the columns ply1_name, ply2_name, ply1_hand and ply2_hand.

    player_name: string
        The player whose opponents are wanted.

    opp_hand: string
        Required handness of the opponent (LH - Left handed, RH - Right handed)

    Returns
    -------
    A numpy array with each opponent's name once, in order of first appearance.
    <player_name> itself is never included.
    '''
    listed_first = (df['ply1_name'] == player_name) & (df['ply2_hand'] == opp_hand)
    listed_second = (df['ply2_name'] == player_name) & (df['ply1_hand'] == opp_hand)
    df_match = df.loc[listed_first | listed_second]

    # The opponent sits in whichever name column does not hold the player, so
    # both columns have to be consulted; reading ply2_name alone would miss
    # opponents that only ever appear with the player listed second.
    opponent_per_row = np.where(df_match['ply1_name'] == player_name,
                                df_match['ply2_name'],
                                df_match['ply1_name'])
    names = pd.unique(opponent_per_row)
    return names[names != player_name]

def analyse_player_behaviour(df, player_name, hand, opponent_list):
    '''
    Adds up the get_params counts of <player_name> over all the given opponents.

    Parameters
    ----------
    df: pd.DataFrame
        The data to search for each matchup.

    player_name: string
        The player to analyse.

    hand: string
        The player's handness, passed on to get_params.

    opponent_list: sequence of strings
        The opponents to look up.

    Returns
    -------
    A flat list of counts: the element-wise sum of the flattened get_params
    result of every matchup that has data. An empty list if no matchup has data.
    '''
    total = len(opponent_list)
    print(player_name, "opponents:", total)

    params = []
    for position, opponent in enumerate(opponent_list):
        # Progress message; only reached when the number of opponents is even.
        if position == total / 2:
            print(player_name, "halfway:", (position + 1), "of", total)

        data = search_players(df, player_name, opponent)
        if data.empty:
            continue

        behaviours = [count for group in get_params(data, hand) for count in group]
        if not params:
            params = behaviours
        else:
            params = [running + new for running, new in zip(params, behaviours)]
    return params

In [2]:
# Load the shot-by-shot data. The column names are supplied here, so pandas
# treats every line of the file (including the first) as data.
file = './tennisabstract-v2-combined.csv'
init_data = pd.read_csv(
    file,
    names=[
        # who played, and the score when the shot was hit
        'ply1_name', 'ply2_name', 'ply1_hand', 'ply2_hand',
        'ply1_points', 'ply2_points', 'ply1_games', 'ply2_games', 'ply1_sets', 'ply2_sets',
        'date', 'tournament_name',
        # the shot itself
        'shot_type', 'from_which_court', 'shot', 'direction', 'to_which_court', 'depth',
        'touched_net', 'hit_at_depth', 'approach_shot', 'shot_outcome', 'fault_type',
        # the shot before it
        'prev_shot_type', 'prev_shot_from_which_court', 'prev_shot', 'prev_shot_direction',
        'prev_shot_to_which_court', 'prev_shot_depth', 'prev_shot_touched_net',
        'prev_shot_hit_at_depth', 'prev_shot_approach_shot', 'prev_shot_outcome',
        'prev_shot_fault_type',
        # the shot two before it
        'prev_prev_shot_type', 'prev_prev_shot_from_which_court', 'prev_prev_shot',
        'prev_prev_shot_direction', 'prev_prev_shot_to_which_court', 'prev_prev_shot_depth',
        'prev_prev_shot_touched_net', 'prev_prev_shot_hit_at_depth',
        'prev_prev_shot_approach_shot', 'prev_prev_shot_outcome', 'prev_prev_shot_fault_type',
        # match identification
        'url', 'description',
    ],
)
# Work on an independent copy restricted to 2016-2018; this only keeps the
# dataframe that is handed around small.
dp = init_data.query('date>="2016-01-01" and date < "2019-01-01"').copy(deep=True)

In [3]:
# Request numbers that have been processed so far (see process_row).
files_generated = []
def process_row(num, row):
    '''
    Handles one requested match: logs it, runs generate_transition_probs on the
    global dataframe <dp> and records the request number.

    Parameters
    ----------
    num: int
        Identifier of the request, used in the log messages.

    row: mapping
        Must provide 'P1Name', 'P2Name', 'P1Hand', 'P2Hand' and 'date'.

    Notes
    -----
    <num> is appended to files_generated after generate_transition_probs
    returns, whether or not a file was actually written (that function returns
    None in both cases).
    NOTE(review): when this runs in separate worker processes, each worker
    presumably appends to its own copy of files_generated - confirm.
    '''
    ply1_name, ply2_name = row['P1Name'], row['P2Name']
    ply1_hand, ply2_hand = row['P1Hand'], row['P2Hand']
    date = row['date']

    print("File:", num, ply1_name, ply2_name, ply1_hand, ply2_hand, date)
    generate_transition_probs(dp, date, ply1_name, ply2_name, ply1_hand, ply2_hand, i=num)
    files_generated.append(num)
    print(files_generated)

In [4]:
# One row per match to generate a PCSP file for.
matched_records = "MatchedRecords.csv"
matched_df = pd.read_csv(matched_records)
#TODO: Change the number of processes to run in parallel in n_jobs. 1 if sequential is desired.-2 for max processes possible with your computer - 1.
# A negative n_jobs is counted back from the number of CPUs, so -4 leaves three CPUs free.
Parallel(n_jobs=-4)(
    delayed(process_row)(i, row)
    for i, row in matched_df.iterrows()
)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,