In [1]:
import json
import pandas as pd

In [2]:
livesport = json.load(open('../data/raw_data/match_details_2024.json'))
matches = pd.read_csv("../data/Merged_year_data/matches.csv")
matches2023 = matches[matches["match_id"].str.split('_').str[1] == '2023']
bets2024 = pd.read_excel("../data/raw_data/2024.xlsx")

In [3]:
github_2023 = pd.read_csv("../data/Processed_data/github2023.csv")
github_2022 = pd.read_csv("../data/Processed_data/github2022.csv")
github_2021 = pd.read_csv("../data/Processed_data/github2021.csv")
github_2020 = pd.read_csv("../data/Processed_data/github2020.csv")
github_2019 = pd.read_csv("../data/Processed_data/github2019.csv")
github_2018 = pd.read_csv("../data/Processed_data/github2018.csv")
github_df = pd.concat([github_2023, github_2022, github_2021, github_2020, github_2019, github_2018], ignore_index=True)  

In [4]:
manual_mapping = {
    'Varillas J. P.': 'Varillas J.P.',
    'Tseng C. H.': 'Tseng C.H.',
}
bets2024['Winner'] = bets2024['Winner'].replace(manual_mapping)
bets2024['Loser'] = bets2024['Loser'].replace(manual_mapping)

In [8]:
livesport

{'https://www.livesport.com/en/match/hrB0HLYB/#/match-summary': {'match_summary': {'tournament_info': {'tournament_name': 'BRISBANE',
    'location': 'AUSTRALIA',
    'surface': 'HARD',
    'court': 'outdoor',
    'round_name': 'FINAL'},
   'time_and_date': {'date': '07.01.2024', 'time': '07:55'},
   'comment': 'FINISHED',
   'player1': {'name': 'Rune H.',
    'rank': '13',
    'is_winner': False,
    'seed': '1'},
   'player2': {'name': 'Dimitrov G.',
    'rank': '10',
    'is_winner': True,
    'seed': '2'},
   'sets_score': {'Wsets': 0, 'Lsets': 2},
   'scores_by_set': {'player1': {'Set 1': '6\n5', 'Set 2': '4'},
    'player2': {'Set 1': '7\n7', 'Set 2': '6'}},
   'duration': '2:19'},
  'match_statistics': {'Aces': ['9', '8'],
   'Double Faults': ['3', '2'],
   '1st Serve Percentage': ['61%', '70%'],
   '1st Serve Points Won': ['76% (44/58)', '77% (40/52)'],
   '2nd Serve Points Won': ['43% (16/37)', '59% (13/22)'],
   'Break Points Saved': ['89% (8/9)', '100% (3/3)'],
   '1st Retur

In [9]:
def transform_data_to_dataframe(data):
    rows = []
    for url, match_data in data.items():
        summary = match_data['match_summary']
        print(summary)
        if summary["tournament_info"]["tournament_name"] != "OLYMPIC GAMES":
            stats = match_data.get('match_statistics', {})
            
            # Determine winner and loser based on is_winner flag
            if summary['player1']['is_winner']:
                winner, loser = summary['player1'], summary['player2']
            else:
                winner, loser = summary['player2'], summary['player1']
            
            # Base row for all cases
            row = {
                "url": url,
                "tournament_name": summary["tournament_info"]["tournament_name"],
                "location": summary["tournament_info"]["location"],
                "surface": summary["tournament_info"]["surface"],
                "round_name": summary["tournament_info"]["round_name"],
                "date": summary["time_and_date"]["date"],
                "time": summary["time_and_date"]["time"],
                "comment": summary["comment"],
                "winner_name": winner["name"],
                "winner_rank": winner["rank"],
                "winner_seed": winner.get("seed", ""),
                "loser_name": loser["name"],
                "loser_rank": loser["rank"],
                "loser_seed": loser.get("seed", "")
            }
            
            # For WALKOVER, skip additional stats and set others to empty
            if summary["comment"] == "WALKOVER" or summary["tournament_info"]["tournament_name"] == "OLYMPIC GAMES":
                # Set remaining fields to empty
                row.update({
                    "winner_sets": "",
                    "loser_sets": "",
                    "duration": ""
                })
            else:
                # Add regular match-specific fields
                row.update({
                    "winner_sets": summary["sets_score"]["Wsets"],
                    "loser_sets": summary["sets_score"]["Lsets"],
                    "duration": summary["duration"]
                })
                
                # Add set scores
                winner_scores = summary["scores_by_set"].get("player1" if summary['player1']['is_winner'] else "player2", {})
                loser_scores = summary["scores_by_set"].get("player2" if summary['player1']['is_winner'] else "player1", {})
                
                for set_num, score in winner_scores.items():
                    row[f"winner_{set_num}"] = score
                for set_num, score in loser_scores.items():
                    row[f"loser_{set_num}"] = score
                
                # Add match statistics
                if summary['player1']['is_winner']:
                    for stat_name, values in stats.items():
                        row[f"winner_{stat_name}"] = values[0]
                        row[f"loser_{stat_name}"] = values[1]
                else:
                    for stat_name, values in stats.items():
                        row[f"winner_{stat_name}"] = values[1]
                        row[f"loser_{stat_name}"] = values[0]
            
            rows.append(row)
    
    # Create a DataFrame
    return pd.DataFrame(rows)

# Example usage
df = transform_data_to_dataframe(livesport)
df

{'tournament_info': {'tournament_name': 'BRISBANE', 'location': 'AUSTRALIA', 'surface': 'HARD', 'court': 'outdoor', 'round_name': 'FINAL'}, 'time_and_date': {'date': '07.01.2024', 'time': '07:55'}, 'comment': 'FINISHED', 'player1': {'name': 'Rune H.', 'rank': '13', 'is_winner': False, 'seed': '1'}, 'player2': {'name': 'Dimitrov G.', 'rank': '10', 'is_winner': True, 'seed': '2'}, 'sets_score': {'Wsets': 0, 'Lsets': 2}, 'scores_by_set': {'player1': {'Set 1': '6\n5', 'Set 2': '4'}, 'player2': {'Set 1': '7\n7', 'Set 2': '6'}}, 'duration': '2:19'}
{'tournament_info': {'tournament_name': 'BRISBANE', 'location': 'AUSTRALIA', 'surface': 'HARD', 'court': 'outdoor', 'round_name': 'SEMI-FINALS'}, 'time_and_date': {'date': '06.01.2024', 'time': '11:40'}, 'comment': 'FINISHED', 'player1': {'name': 'Thompson J.', 'rank': '26', 'is_winner': False, 'seed': ''}, 'player2': {'name': 'Dimitrov G.', 'rank': '10', 'is_winner': True, 'seed': '2'}, 'sets_score': {'Wsets': 0, 'Lsets': 2}, 'scores_by_set': {'p

KeyError: 'tournament_name'

In [None]:
 # Convert column values to sets
set1 = set(matches["tournament_location"].str.lower())
set2 = set(df["tournament_name"].str.lower())
print(set2)
# Find unique values
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

In [6]:
# Define a mapping for mismatched values
df['tournament_name'] = df['tournament_name'].str.lower()

mapping = {
    'hertogenbosch': "'s-hertogenbosch",
    'paris': 'paris 2',
    'london': 'queens club',
    'australian open': 'melbourne',
    'french open': 'paris',
    'wimbledon': 'london',
    'us open': 'new york',
}
df['tournament_name'] = df['tournament_name'].replace(mapping)

NameError: name 'df' is not defined

In [7]:
 # Convert column values to sets
set1 = set(matches["tournament_location"].str.lower())
set2 = set(matches2023["tournament_location"].str.lower())
set3 = set(df["tournament_name"].str.lower())
print(set2)
# Find unique values
unique_to_df1 = set2 - set3
unique_to_df2 = set3 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

NameError: name 'df' is not defined

In [9]:
df['winner_name'] = df['winner_name'].str.lower()
df['loser_name'] = df['loser_name'].str.lower()

 # Convert column values to sets
set1 = set(bets2024["Winner"].str.lower())
set2 = set(df["winner_name"].str.lower())
set3 = set(bets2024["Loser"].str.lower())
set4 = set(df["loser_name"].str.lower())

set1.update(set3)
set2.update(set4)


# Find unique values
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
{'tseng c.h.', 'kwon s.w.', 'huesler m.a.', 'carreno busta p.', 'bailly g.', 'hong s.', 'mpetshi g.', 'bautista agut r.', 'herbert p.h.', 'burruchaga r.', 'blanch d.', 'schwaerzler j.', 'etcheverry t.', 'rehberg m.', 'galan d.e.', 'barrios m.', 'o connell c.', 'struff j.l.', 'gomez f.', 'tirante t.a.', 'zhang zh.', 'varillas j.p.', 'cerundolo j.m.', 'ramos-vinolas a.'}

Values in only in df2:
{'cerundolo j. m.', 'barrios vera t.', 'tseng c. h.', 'kwon s.', 'bautista-agut r.', 'ramos a.', 'hong s. c.', 'rehberg m. h.', 'mpetshi perricard g.', 'zhang z.', 'carreno-busta p.', 'huesler m.', 'struff j-l.', 'burruchaga r. a.', 'galan d. e.', 'bailly g. a.', 'etcheverry t. m.', 'schwaerzler j. j.', 'gomez f. a.', 'tirante t. a.', 'blanch dar.', 'varillas j. p.', "o'connell c.", 'herbert p.'}


In [10]:
from fuzzywuzzy import process
fixed_names = {}
for name in set2:
    closest_match = process.extractOne(name, set1)
    if closest_match[0] != name and closest_match[1] > 90:  # Threshold for similarity
        print(name, closest_match[0])
        fixed_names[name] = closest_match[0]
    else:
        fixed_names[name] = name  # Keep original if no good match is found



cerundolo j. m. cerundolo j.m.
tseng c. h. tseng c.h.
carreno-busta p. carreno busta p.
galan d. e. galan d.e.
schwaerzler j. j. schwaerzler j.
gomez f. a. gomez f.
rehberg m. h. rehberg m.
huesler m. huesler m.a.
struff j-l. struff j.l.
o'connell c. o connell c.
burruchaga r. a. burruchaga r.
tirante t. a. tirante t.a.
bautista-agut r. bautista agut r.
zhang z. zhang zh.
bailly g. a. bailly g.
etcheverry t. m. etcheverry t.
herbert p. herbert p.h.
kwon s. kwon s.w.
varillas j. p. varillas j.p.


In [11]:

# Find unique values
unique_to_df1 = set1 - set(fixed_names.values())
unique_to_df2 = set(fixed_names.values()) - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
{'blanch d.', 'barrios m.', 'hong s.', 'mpetshi g.', 'ramos-vinolas a.'}

Values in only in df2:
{'mpetshi perricard g.', 'barrios vera t.', 'blanch dar.', 'ramos a.', 'hong s. c.'}


In [12]:
manual_mapping = {
    'hong s. c.': 'hong s.',
    'ramos a.': 'ramos-vinolas a.',
    'blanch dar.': 'blanch d.',
    'mpetshi perricard g.': 'mpetshi g.',
}
print(sorted(set(df['winner_name'].unique())))
df['winner_name'] = df['winner_name'].replace(fixed_names)
df['loser_name'] = df['loser_name'].replace(fixed_names)

df['winner_name'] = df['winner_name'].replace(manual_mapping)
df['loser_name'] = df['loser_name'].replace(manual_mapping)
manual_mapping = {
    'Barrios M.': 'Barrios Vera T.'
}
bets2024['Winner'] = bets2024['Winner'].replace(manual_mapping)
bets2024['Loser'] = bets2024['Loser'].replace(manual_mapping)


['ajdukovic d.', 'albot r.', 'alcaraz c.', 'altmaier d.', 'arnaldi m.', 'atmane t.', 'auger-aliassime f.', 'baez s.', 'bagnis f.', 'barrere g.', 'barrios vera t.', 'bautista-agut r.', 'bellucci m.', 'bergs z.', 'berrettini m.', 'blanchet u.', 'bolt a.', 'bonzi b.', 'borges n.', 'broady l.', 'brouwer g.', 'bu y.', 'bublik a.', 'burruchaga r. a.', 'cachin p.', 'carballes baena r.', 'carreno-busta p.', 'cazaux a.', 'cerundolo f.', 'cerundolo j. m.', 'choinski j.', 'cilic m.', 'cobolli f.', 'comesana f.', 'coria f.', 'coric b.', 'cressy m.', 'damm m.', 'daniel t.', 'darderi l.', 'davidovich fokina a.', 'de jong j.', 'de minaur a.', 'dellien h.', 'diallo g.', 'diaz acosta f.', 'dimitrov g.', 'djere l.', 'djokovic n.', 'dodig m.', 'draper j.', 'duckworth j.', 'engel j.', 'etcheverry t. m.', 'eubanks c.', 'evans d.', 'fearnley j.', 'fils a.', 'fognini f.', 'fonseca j.', 'fritz t.', 'fucsovics m.', 'galan d. e.', 'garin c.', 'gasquet r.', 'gaston h.', 'gigante m.', 'giron m.', 'goffin d.', 'go

In [13]:
 # Convert column values to sets
set1 = set(bets2024["Winner"].str.lower())
set2 = set(df["winner_name"].str.lower())
set3 = set(bets2024["Loser"].str.lower())
set4 = set(df["loser_name"].str.lower())
set1.update(set3)
set2.update(set4)


# Find unique values
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
set()
Values in only in df2:
set()


In [14]:
 # Convert column values to sets
set1 = set(bets2024["Location"].str.lower())
set2 = set(df["tournament_name"].str.lower())
# Find unique values
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
{'turin', 'belgrade', 'dubai '}
Values in only in df2:
{'belgrade 2', 'paris 2', 'dubai'}


In [15]:
bets2024.loc[bets2024['Tournament']== 'BNP Paribas Masters', 'Location'] ='Paris 2'
bets2024.loc[bets2024['Tournament']== 'Belgrade Open', 'Location'] ='Belgrade 2'
bets2024.loc[bets2024['Location']== 'Dubai ', 'Location'] ='Dubai'

In [16]:
bets2024 = bets2024[~bets2024['Tournament'].isin(['Masters Cup'])]

In [17]:
 # Convert column values to sets
set1 = set(bets2024["Location"].str.lower())
set2 = set(df["tournament_name"].str.lower())
# Find unique values
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
set()
Values in only in df2:
set()


In [18]:
import random

# Create the initial mapping for losers and winners
map_loser_name_id = github_df.groupby('shortened_loser_name', as_index=False)[['shortened_loser_name', 'loser_id']].first()
map_winner_name_id = github_df.groupby('shortened_winner_name', as_index=False)[['shortened_winner_name', 'winner_id']].first()

# Combine the mappings for both losers and winners
mapping = {**dict(zip(map_loser_name_id['shortened_loser_name'], map_loser_name_id['loser_id'])),
           **dict(zip(map_winner_name_id['shortened_winner_name'], map_winner_name_id['winner_id']))}

# Define a function to generate unique 6-digit IDs
existing_ids = set(mapping.values())  # Track existing IDs to avoid duplicates

def generate_or_get_id(player_name):
    # If the player already has an ID, return it
    if player_name in mapping:
        return mapping[player_name]
    # Otherwise, generate a new unique ID
    new_id = str(random.randint(100000, 999999))
    while new_id in existing_ids:
        new_id = str(random.randint(100000, 999999))
    # Save the new ID in the mapping and mark it as used
    mapping[player_name] = new_id
    existing_ids.add(new_id)
    return new_id

# Map IDs for both loser and winner columns
bets2024['loser_id'] = bets2024['Loser'].apply(generate_or_get_id)
bets2024['winner_id'] = bets2024['Winner'].apply(generate_or_get_id)

In [19]:
print(bets2024.loc[bets2024['loser_id'].isna(), 'Loser'].unique())

[]


In [20]:
name_mapping = {name.lower(): name for name in bets2024['Loser']}
df['loser_name'] = df['loser_name'].str.lower().map(name_mapping).fillna(df['loser_name'])
name_mapping = {name.lower(): name for name in bets2024['Winner']}
df['winner_name'] = df['winner_name'].str.lower().map(name_mapping).fillna(df['winner_name'])
tournament_mapping = {tournament.lower(): tournament for tournament in bets2024['Location']}
df['tournament_name'] = df['tournament_name'].str.lower().map(tournament_mapping).fillna(df['tournament_name'])

In [21]:
map_player_name_player_id = bets2024.groupby('Loser', as_index=False)[['Loser', 'loser_id']].first()
mapping = dict(zip(map_player_name_player_id['Loser'], map_player_name_player_id['loser_id']))
df['loser_id'] = df['loser_name'].map(mapping)

In [22]:
print(df.loc[df['loser_id'].isna(), 'loser_name'].unique())

[]


In [23]:
map_player_name_player_id = bets2024.groupby('Winner', as_index=False)[['Winner', 'winner_id']].first()
mapping = dict(zip(map_player_name_player_id['Winner'], map_player_name_player_id['winner_id']))
df['winner_id'] = df['winner_name'].map(mapping)

In [24]:
print(df.loc[df['winner_id'].isna(), 'winner_name'].unique())

[]


In [25]:
bets2024['match_id'] = bets2024['Location'].astype(str) + '_' + '2024' + '_' +  bets2024['winner_id'].astype(str) + '_' + bets2024['loser_id'].astype(str)
df['match_id'] = df['tournament_name'].astype(str) + '_' + '2024' + '_' + df['winner_id'].astype(str) + '_' + df['loser_id'].astype(str)

In [26]:
print(bets2024['match_id'].nunique() == len(bets2024))
print(df['match_id'].nunique() == len(df))

True
True


In [27]:
print(bets2024[~bets2024['match_id'].isin(df['match_id'])]['match_id'].unique())
print(df[~df['match_id'].isin(bets2024['match_id'])]['match_id'].unique())

[]
[]


In [28]:
import json
with open(f"../data/raw_data/player_details.json") as f:
        player_details = json.load(f)
print(player_details)

[{'Name': 'Hamza Reguig Samir', 'Country': 'Algeria', 'Date_of_birth': '2002-01-19', 'Plays': 'right'}, {'Name': 'Makhlouf Mohamed Nazim', 'Country': 'Algeria', 'Date_of_birth': '1996-09-12', 'Plays': 'right'}, {'Name': 'Sahtali Toufik', 'Country': 'Algeria', 'Date_of_birth': '1999-01-07', 'Plays': 'right'}, {'Name': 'Domingos Daniel', 'Country': 'Angola', 'Plays': 'right'}, {'Name': 'Maginley Herbert Jody', 'Country': 'Antigua and Barbuda', 'Height': '193 cm', 'Date_of_birth': '1995-06-07', 'Plays': 'right'}, {'Name': 'Baez Sebastian', 'Country': 'Argentina', 'Height': '170 cm', 'Date_of_birth': '2000-12-28', 'Plays': 'right'}, {'Name': 'Cerundolo Francisco', 'Country': 'Argentina', 'Height': '185 cm', 'Date_of_birth': '1998-08-13', 'Plays': 'right'}, {'Name': 'Etcheverry Tomas Martin', 'Country': 'Argentina', 'Height': '196 cm', 'Date_of_birth': '1999-07-18', 'Plays': 'right'}, {'Name': 'Navone Mariano', 'Country': 'Argentina', 'Date_of_birth': '2001-02-27', 'Plays': 'right'}, {'Name

In [29]:
def manual_name_normalization(full_name):
    if full_name == "McCabe James":
        return "Mccabe J."
    if full_name == "Rehberg Max Hans":
        return "Rehberg M."
    if full_name == "Bailly Gilles Arnaud":
        return "Bailly G."
    if full_name == "Barrios Vera Marcelo Tomas":
        return "Barrios Vera T."
    if full_name == "O'Connell Christopher":
        return "O Connell C."
    if full_name == "Etcheverry Tomas Martin": 
        return 'Etcheverry T.'
    if full_name == "Zhang Zhizhen":
        return 'Zhang Zh.'
    if full_name == "Burruchaga Roman Andres":
        return "Burruchaga R."
    if full_name == "Meligeni Rodrigues Alves Felipe":
        return "Meligeni Alves F."
    if full_name == "Damm Martin (2003)":
        return "Damm M."
    if full_name == "Mpetshi Perricard Giovanni":
        return "Mpetshi G."
    if full_name == "McDonald Mackenzie":
        return "Mcdonald M."
    if full_name == "Hong Seong Chan":
        return "Hong S."
    if full_name == "Gomez Federico Agustin":
        return "Gomez F."
    return None

In [30]:
import re
# Function to normalize and match names
def normalize_name(json_name, df_names):
    """
    Transforms a JSON name to match the format in the DataFrame.
    Returns the transformed name if matched, otherwise None.
    """
    manual_case = manual_name_normalization(json_name)
    if manual_case:
        return manual_case
    # Split the JSON name into parts
    json_parts = re.split(r'[ -]', json_name)  # Split by spaces or dots
    json_parts = [part for part in json_parts if part]     # Normalize against each DataFrame name
    for df_name in df_names:
        df_parts = re.split(r'[ .-]', df_name)  # Split by spaces or dots
        df_parts = [part for part in df_parts if part]         # Check if at least one part matches exactly
    
        exact_match = any(part in json_parts for part in df_parts)
        if not exact_match:
            continue
        # Check remaining parts as initials
        match = True
      
        for json_part in json_parts:
            if json_part not in df_parts:
                # Check if it's an initial of a JSON part
                #print(df_part, f'{json_parts[0][0]}.')
                if not any(df_part == f'{json_part[0]}' for df_part in df_parts):
                    match = False
                    break
        if match:
            return df_name  # Return the matched DataFrame name
    
    
    return json_name

# Extract JSON names
json_names = {d['Name'] for d in player_details}

# Normalize and find missing names
normalized_names = {
    normalize_name(json_name, df['winner_name'].unique()) for json_name in json_names
    if normalize_name(json_name, df['winner_name'].unique()) is not None
}
missing_names = df[~df['loser_name'].isin(normalized_names)]['loser_name'].unique()

# Output the missing names
print("Missing Names:")
print(missing_names)

Missing Names:
['Tu L.' 'Gojo B.' 'Sachko V.' 'Mochizuki S.' 'Pannu K.' 'Valkusz M.'
 'Vesely J.' 'Jasika O.' 'Polmans M.' 'Kopriva V.' 'Prizmic D.'
 'Sweeny D.' 'Svrcina D.' 'Escoffier A.' 'Schwartzman D.' 'Neff A.'
 'Johnson S.' 'Sandgren T.' 'Shelbayh A.' 'Pacheco Mendez R.'
 'Escobedo E.' 'Holt B.' 'Molcan A.' 'Blanch D.' 'Sousa J.' 'Faria J.'
 'Ritschard A.' 'Dougaz A.' 'Benchetrit E.' 'Vacherot V.' 'Topo M.'
 'Gakhov I.' 'Taberner C.' 'Rincon D.' 'Hassan B.' 'Fanselow S.'
 'Barranco Cosano J.' 'Sanchez Izquierdo N.' 'Martineau M.'
 'Van Rijthoven T.' 'Otte O.' 'Broom C.' 'Searle H.' 'Ymer E.' 'Lajal M.'
 'Fery A.' 'Janvier M.' 'Pellegrino A.' 'Borg L.' 'Droguet T.'
 'Schwaerzler J.' 'Collarini A.' 'Couacaud E.' 'Poljicak M.'
 'Andreozzi G.' 'Mikrut L.' 'Martin A.' 'Pospisil V.' 'Mejia N.'
 'Kumar O.' 'Kirchheimer S.' 'Forbes M.' 'Jacquet K.' 'Kasnikowski M.'
 'Dzumhur D.' 'Sun F.' 'Ramanathan R.' 'Gerasimov E.' 'Blockx A.'
 'Vatutin A.' 'Collignon R.' 'Kym J.' 'Guinard M.' 'Papam

In [31]:
for player in player_details:
    player["normalized_name"] = normalize_name(player["Name"], df['winner_name'].unique())
for player in player_details:
    player["normalized_name"] = normalize_name(player["Name"], df['loser_name'].unique())

In [32]:
json_normalized_names = {d['normalized_name'] for d in player_details}

missing_names = df[~df['winner_name'].isin(json_normalized_names)]['winner_name'].unique()
print(missing_names)
missing_names = df[~df['loser_name'].isin(json_normalized_names)]['loser_name'].unique()
print(missing_names)

[]
[]


In [33]:
# Convert player details into a lookup dictionary for easier access
details_lookup = {player['normalized_name']: player for player in player_details}

# Iterate over rows of the DataFrame
for index, row in df.iterrows():
    # Check for winner details
    winner_name = row['winner_name']
    if winner_name in details_lookup:
        winner_details = details_lookup[winner_name]
        for key, value in winner_details.items():
            if key != 'normalized_name':  # Skip normalized_name
                col_name = f"winner_{key}"
                if col_name not in df.columns:
                    df[col_name] = None  # Initialize column if it doesn't exist
                df.at[index, col_name] = value

    # Check for loser details
    loser_name = row['loser_name']
    if loser_name in details_lookup:
        loser_details = details_lookup[loser_name]
        for key, value in loser_details.items():
            if key != 'normalized_name':  # Skip normalized_name
                col_name = f"loser_{key}"
                if col_name not in df.columns:
                    df[col_name] = None  # Initialize column if it doesn't exist
                df.at[index, col_name] = value
df

Unnamed: 0,url,tournament_name,location,surface,round_name,date,time,comment,winner_name,winner_rank,...,winner_Name,winner_Country,winner_Height,winner_Date_of_birth,winner_Plays,loser_Name,loser_Country,loser_Height,loser_Date_of_birth,loser_Plays
0,https://www.livesport.com/en/match/hrB0HLYB/#/...,Brisbane,AUSTRALIA,HARD,FINAL,07.01.2024,07:55,FINISHED,Dimitrov G.,10,...,Dimitrov Grigor,Bulgaria,191 cm,1991-05-16,right,Rune Holger,Denmark,185 cm,2003-04-29,right
1,https://www.livesport.com/en/match/rulXp2l4/#/...,Brisbane,AUSTRALIA,HARD,SEMI-FINALS,06.01.2024,11:40,FINISHED,Dimitrov G.,10,...,Dimitrov Grigor,Bulgaria,191 cm,1991-05-16,right,Thompson Jordan,Australia,183 cm,1994-04-20,right
2,https://www.livesport.com/en/match/nBebMPXT/#/...,Brisbane,AUSTRALIA,HARD,SEMI-FINALS,06.01.2024,06:05,FINISHED,Rune H.,13,...,Rune Holger,Denmark,185 cm,2003-04-29,right,Safiullin Roman,Russia,185 cm,1997-08-07,right
3,https://www.livesport.com/en/match/8tKKk33i/#/...,Brisbane,AUSTRALIA,HARD,QUARTER-FINALS,05.01.2024,11:35,FINISHED,Thompson J.,26,...,Thompson Jordan,Australia,183 cm,1994-04-20,right,Nadal Rafael,Spain,185 cm,1986-06-03,left
4,https://www.livesport.com/en/match/vaNo5gB0/#/...,Brisbane,AUSTRALIA,HARD,QUARTER-FINALS,05.01.2024,08:10,FINISHED,Dimitrov G.,10,...,Dimitrov Grigor,Bulgaria,191 cm,1991-05-16,right,Hijikata Rinky,Australia,178 cm,2001-02-23,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,https://www.livesport.com/en/match/CYzZmNcR/#/...,Belgrade 2,SERBIA,HARD,1/16-FINALS,04.11.2024,12:45,FINISHED,Etcheverry T.,39,...,Etcheverry Tomas Martin,Argentina,196 cm,1999-07-18,right,Kotov Pavel,Russia,191 cm,1998-11-18,right
2684,https://www.livesport.com/en/match/I14clWiA/#/...,Belgrade 2,SERBIA,HARD,1/16-FINALS,04.11.2024,11:10,FINISHED,Cilic M.,181,...,Cilic Marin,Croatia,198 cm,1988-09-28,right,Muller Alexandre,France,183 cm,1997-02-01,right
2685,https://www.livesport.com/en/match/MNU3BtAU/#/...,Belgrade 2,SERBIA,HARD,1/16-FINALS,04.11.2024,11:05,FINISHED,Altmaier D.,88,...,Altmaier Daniel,Germany,191 cm,1998-09-12,right,Darderi Luciano,Italy,,2002-02-14,right
2686,https://www.livesport.com/en/match/84M4r1Ll/#/...,Belgrade 2,SERBIA,HARD,1/16-FINALS,03.11.2024,18:30,FINISHED,Medjedovic H.,113,...,Medjedovic Hamad,Serbia,,2003-07-18,right,Nakashima Bryce Nakashima,USA,,2004-02-20,right


In [34]:
df.columns

Index(['url', 'tournament_name', 'location', 'surface', 'round_name', 'date',
       'time', 'comment', 'winner_name', 'winner_rank', 'winner_seed',
       'loser_name', 'loser_rank', 'loser_seed', 'winner_sets', 'loser_sets',
       'duration', 'winner_Set 1', 'winner_Set 2', 'loser_Set 1',
       'loser_Set 2', 'winner_Aces', 'loser_Aces', 'winner_Double Faults',
       'loser_Double Faults', 'winner_1st Serve Percentage',
       'loser_1st Serve Percentage', 'winner_1st Serve Points Won',
       'loser_1st Serve Points Won', 'winner_2nd Serve Points Won',
       'loser_2nd Serve Points Won', 'winner_Break Points Saved',
       'loser_Break Points Saved', 'winner_1st Return Points Won',
       'loser_1st Return Points Won', 'winner_2nd Return Points Won',
       'loser_2nd Return Points Won', 'winner_Break Points Converted',
       'loser_Break Points Converted', 'winner_Winners', 'loser_Winners',
       'winner_Unforced Errors', 'loser_Unforced Errors',
       'winner_Net Points Won

In [36]:
df['loser_seed'].unique()

array(['1', '', '8', '7', '6', '5', '3', '4', '2', '9', '12', '19', '10',
       '15', '20', '27', '13', '11', '28', '21', '14', '29', '30', '16',
       '26', '23', '24', '32', '17', '22', '25', '31', '18', '33'],
      dtype=object)