In [6]:
import pandas as pd 
import numpy as np
import os

In [7]:
# Locate the raw input workbooks.
rawData_dir = r"../Data"

# os.listdir returns entries in arbitrary order, while the feature loop further
# down accumulates per-player statistics row by row. Sorting makes the load
# order deterministic.
# NOTE(review): this only yields chronological order if the file names sort
# chronologically (e.g. one file per year named after the year) - confirm.
# Hidden entries (".DS_Store", ".ipynb_checkpoints"), Excel lock files ("~$...")
# and sub-directories are skipped because pd.read_excel cannot read them.
rawData_paths = [
    os.path.join(rawData_dir, entry)
    for entry in sorted(os.listdir(rawData_dir))
    if not entry.startswith((".", "~$"))
    and os.path.isfile(os.path.join(rawData_dir, entry))
]

In [85]:
# Read every workbook listed in rawData_paths and stack the resulting frames
# into a single table, in the order the paths are listed. The original row
# labels of each file are kept (the index is not renumbered here).
yearly_data = [pd.read_excel(workbook_path) for workbook_path in rawData_paths]

overall_data = pd.concat(yearly_data)

In [86]:
# Columns that none of the later cells read; they are removed before the
# feature loop runs.
irrelevant_cols = [
    'ATP', 'Location', 'Series', 'Court', 'Round', 'Comment',
    'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL', 'SJW', 'SJL',
    'MaxW', 'MaxL', 'AvgW', 'AvgL',
]
overall_data = overall_data.drop(columns=irrelevant_cols)

In [147]:
# Initialize player metrics
#
# Every entry starts as its own empty dict and is filled in by the feature
# loop, keyed by player name:
#   career_* / tb_* / decider_*          -> running count per player
#   surface_* / bestOf_* / tournament_*  -> per player, a dict of counts keyed
#                                           by surface / "Best of" / tournament
#   prev_match_date / prev_match_length  -> values stored for the player's most
#                                           recently processed match
player_metrics = {
    metric_name: {}
    for metric_name in (
        "career_wins",
        "career_loses",
        "surface_wins",
        "surface_loses",
        "bestOf_wins",
        "bestOf_loses",
        "tournament_wins",
        "tournament_loses",
        "tb_wins",
        "tb_loses",
        "decider_wins",
        "decider_loses",
        "prev_match_date",
        "prev_match_length",
    )
}

# Head-to-head record: h2h[a][b] is the number of processed matches in which
# player a beat player b.
h2h = dict()

In [148]:
# Build the feature table: one row per match in `overall_data`.
#
# For both players the row holds the statistics accumulated from EARLIER rows
# only (each value is read before the current match is counted), followed by
# the head-to-head counts and the `Upset` label. "Player 1" is the winner
# unless the match is an upset, in which case the loser comes first.
#
# Rows are processed in the order they appear in `overall_data`, so that order
# is assumed to be chronological. `player_metrics` and `h2h` are mutated in
# place: re-run the cell that initializes them before re-running this cell,
# otherwise every match is counted twice.

_PLAYER_COLUMNS = [
    "Name", "Rank", "Wins", "Losses", "surfaceWins", "surfaceLosses",
    "bestOfWins", "bestOfLosses", "tournamentWins", "tournamentLosses",
    "Gap", "Length", "tbWins", "tbLoses", "deciderWins", "deciderLosses",
    "Odds",
]
_FEATURE_COLUMNS = (
    [column + "1" for column in _PLAYER_COLUMNS]
    + [column + "2" for column in _PLAYER_COLUMNS]
    + ["h2h1", "h2h2", "Upset"]
)


def _count_result(wins_by_key, loses_by_key, key, won):
    """Record one result under ``key`` and return the tally held before it.

    Args:
        wins_by_key: dict mapping key -> number of wins so far.
        loses_by_key: dict mapping key -> number of losses so far.
        key: player name, surface, "Best of" value or tournament.
        won: True to count a win, False to count a loss.

    Returns:
        ``(wins, loses)`` as they were before this result; a key seen for the
        first time yields ``(0, 0)``.
    """
    wins_before = wins_by_key.get(key, 0)
    loses_before = loses_by_key.get(key, 0)
    wins_by_key[key] = wins_before + (1 if won else 0)
    loses_by_key[key] = loses_before + (0 if won else 1)
    return wins_before, loses_before


def _player_features(metrics, name, rank, odds, won, surface, best_of,
                     tournament, date, length, own_games, opp_games,
                     decider_played):
    """Return one player's pre-match feature list and fold the match into ``metrics``.

    Args:
        metrics: the ``player_metrics`` dict of running statistics (mutated).
        name, rank, odds: copied into the feature list unchanged.
        won: True if this player won the current match.
        surface, best_of, tournament: keys for the per-category tallies.
        date: date of the current match; used for the gap since the player's
            previous match and then stored as the new previous date.
        length: value stored as this player's "previous match length".
        own_games, opp_games: games per set for this player / the opponent.
        decider_played: True if the current match counts as a decider.

    Returns:
        List of 17 values in the order of ``_PLAYER_COLUMNS``.
    """
    wins, loses = _count_result(
        metrics["career_wins"], metrics["career_loses"], name, won)
    surface_wins, surface_loses = _count_result(
        metrics["surface_wins"].setdefault(name, {}),
        metrics["surface_loses"].setdefault(name, {}),
        surface, won)
    best_of_wins, best_of_loses = _count_result(
        metrics["bestOf_wins"].setdefault(name, {}),
        metrics["bestOf_loses"].setdefault(name, {}),
        best_of, won)
    tournament_wins, tournament_loses = _count_result(
        metrics["tournament_wins"].setdefault(name, {}),
        metrics["tournament_loses"].setdefault(name, {}),
        tournament, won)

    # Days since, and stored length of, the player's previous match.
    # A player without a previous match gets a gap of 1e3 and a length of 0.
    if name in metrics["prev_match_date"]:
        gap = (date - metrics["prev_match_date"][name]).days
        prev_length = metrics["prev_match_length"][name]
    else:
        gap = 1e3
        prev_length = 0
    metrics["prev_match_date"][name] = date
    metrics["prev_match_length"][name] = length

    # Tie-breaks: a set decided by a tie-break ends 7-6. Checking only for
    # "7 games" would also count 7-5 sets, and would credit the loser of a
    # long 7-9 set with a tie-break win.
    tb_wins = metrics["tb_wins"].setdefault(name, 0)
    tb_loses = metrics["tb_loses"].setdefault(name, 0)
    for own, opp in zip(own_games, opp_games):
        if own == 7 and opp == 6:
            metrics["tb_wins"][name] += 1
        elif own == 6 and opp == 7:
            metrics["tb_loses"][name] += 1

    # Deciders: each player's own counter is updated (the loser's losses used
    # to be computed from the winner's counter).
    decider_wins = metrics["decider_wins"].setdefault(name, 0)
    decider_loses = metrics["decider_loses"].setdefault(name, 0)
    if decider_played:
        if won:
            metrics["decider_wins"][name] = decider_wins + 1
        else:
            metrics["decider_loses"][name] = decider_loses + 1

    return [
        name, rank, wins, loses, surface_wins, surface_loses,
        best_of_wins, best_of_loses, tournament_wins, tournament_loses,
        gap, prev_length, tb_wins, tb_loses, decider_wins, decider_loses,
        odds,
    ]


num_matches = overall_data.shape[0]
match_rows = []  # collected here and turned into a DataFrame once, after the loop
for _, match_series in overall_data.iterrows():
    curr_match = dict(match_series)

    surface = curr_match["Surface"]
    bestOf = curr_match["Best of"]
    tournament = curr_match["Tournament"]
    date = curr_match["Date"]
    # NOTE(review): this sum is stored as the "length" of the match. If the
    # data follows the tennis-data.co.uk layout, WPts/LPts are ranking points
    # rather than points played - confirm this is the intended feature.
    length = curr_match["WPts"] + curr_match["LPts"]

    winner_games = [curr_match["W" + str(set_no)] for set_no in range(1, 6)]
    loser_games = [curr_match["L" + str(set_no)] for set_no in range(1, 6)]
    # NOTE(review): only a fifth set counts as a decider, so the deciding
    # third set of a best-of-3 match is never counted - confirm intent.
    # A missing (NaN) W5 compares False.
    decider_played = curr_match["W5"] > 0

    w_name = curr_match["Winner"]
    w_rank = curr_match["WRank"]
    l_name = curr_match["Loser"]
    l_rank = curr_match["LRank"]

    winner_features = _player_features(
        player_metrics, w_name, w_rank, curr_match["B365W"], True,
        surface, bestOf, tournament, date, length,
        winner_games, loser_games, decider_played)
    loser_features = _player_features(
        player_metrics, l_name, l_rank, curr_match["B365L"], False,
        surface, bestOf, tournament, date, length,
        loser_games, winner_games, decider_played)

    # Head-to-head wins before this match, then credit the winner.
    w_h2h_wins = h2h.setdefault(w_name, {}).get(l_name, 0)
    l_h2h_wins = h2h.setdefault(l_name, {}).get(w_name, 0)
    h2h[w_name][l_name] = w_h2h_wins + 1

    # Upset: the winner's rank number is larger than the loser's.
    # Missing ranks compare False, i.e. "no upset".
    upset = 1.0 * (w_rank > l_rank)

    if upset:
        next_row = loser_features + winner_features + [l_h2h_wins, w_h2h_wins]
    else:
        next_row = winner_features + loser_features + [w_h2h_wins, l_h2h_wins]
    next_row.append(upset)
    match_rows.append(next_row)

final_data = pd.DataFrame(match_rows, columns=_FEATURE_COLUMNS)
assert len(final_data) == num_matches, "expected exactly one feature row per match"

In [167]:
# Keep everything from row position 7500 onwards and renumber the rows from 0,
# discarding the old row labels instead of keeping them as an "index" column.
# NOTE(review): presumably the first 7500 rows are dropped because players
# have little accumulated history that early - confirm the cut-off.
final_data_v2 = final_data.iloc[7500:].reset_index(drop=True)

In [169]:
# Write the preprocessed table to disk without the row index column.
final_data_v2.to_csv("../Results/preprocessed_data.csv", index=False)