In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
livesport = json.load(open('../../raw_data/livesport/match_details_2024.json'))
matches = pd.read_csv("../data/merged_matches_with_time.csv")
matches2023 = matches[matches["match_id"].str.split('_').str[1] == '2023']
bets2024 = pd.read_excel("../../raw_data/bets/2024.xlsx")
cleaned_matches = pd.read_csv("../data/matches.csv")

In [3]:
github_2018 = pd.read_csv("../../preprocessing/data/github_2018-2020.csv")
github_2021 = pd.read_csv("../../preprocessing/data/github_2021-2023.csv")
github_df = pd.concat([github_2018, github_2021], ignore_index=True)  

In [4]:
manual_mapping = {
    'Varillas J. P.': 'Varillas J.P.',
    'Tseng C. H.': 'Tseng C.H.',
}
bets2024['Winner'] = bets2024['Winner'].replace(manual_mapping)
bets2024['Loser'] = bets2024['Loser'].replace(manual_mapping)

In [5]:
livesport

{'https://www.livesport.com/en/match/hrB0HLYB/#/match-summary': {'match_summary': {'tournament_info': {'tournament_name': 'BRISBANE',
    'location': 'AUSTRALIA',
    'surface': 'HARD',
    'court': 'outdoor',
    'round_name': 'FINAL'},
   'time_and_date': {'date': '2024-01-07', 'time': '07:55'},
   'comment': 'FINISHED',
   'player1': {'name': 'Rune H.',
    'rank': '13',
    'is_winner': False,
    'seed': '1'},
   'player2': {'name': 'Dimitrov G.',
    'rank': '10',
    'is_winner': True,
    'seed': '2'},
   'sets_score': {'Wsets': 2, 'Lsets': 0},
   'scores_by_set': {'player1': {'Set 1': '6\n5', 'Set 2': '4'},
    'player2': {'Set 1': '7\n7', 'Set 2': '6'}},
   'duration': '2:19'},
  'match_statistics': {'Aces': ['9', '8'],
   'Double Faults': ['3', '2'],
   '1st Serve Percentage': ['61%', '70%'],
   '1st Serve Points Won': ['76% (44/58)', '77% (40/52)'],
   '2nd Serve Points Won': ['43% (16/37)', '59% (13/22)'],
   'Break Points Saved': ['89% (8/9)', '100% (3/3)'],
   '1st Retur

In [6]:
def transform_data_to_dataframe(data):
    rows = []
    for url, match_data in data.items():
        summary = match_data['match_summary']
        if summary["tournament_info"]["tournament_name"] != "OLYMPIC GAMES":
            stats = match_data.get('match_statistics', {})
            
            if summary['player1']['is_winner']:
                winner, loser = summary['player1'], summary['player2']
            else:
                winner, loser = summary['player2'], summary['player1']
            
            row = {
                "url": url,
                "tournament_location": summary["tournament_info"]["tournament_name"],
                "tournament_country": summary["tournament_info"]["location"],
                "surface": summary["tournament_info"]["surface"],
                "indoor_or_outdoor": summary["tournament_info"]["court"],
                "round_name": summary["tournament_info"]["round_name"],
                "date": summary["time_and_date"]["date"],
                "time": summary["time_and_date"]["time"],
                "comment": summary["comment"],
                "winner_name": winner["name"],
                "winner_rank": winner["rank"],
                "winner_seed": winner.get("seed", ""),
                "loser_name": loser["name"],
                "loser_rank": loser["rank"],
                "loser_seed": loser.get("seed", "")
            }
            
            if summary["comment"] == "WALKOVER" or summary["tournament_info"]["tournament_name"] == "OLYMPIC GAMES":
                row.update({
                    "winner_sets": "",
                    "loser_sets": "",
                    "duration": ""
                })
            else:
                row.update({
                    "winner_sets": summary["sets_score"]["Wsets"],
                    "loser_sets": summary["sets_score"]["Lsets"],
                    "duration": summary["duration"]
                })
                
                winner_scores = summary["scores_by_set"].get("player1" if summary['player1']['is_winner'] else "player2", {})
                loser_scores = summary["scores_by_set"].get("player2" if summary['player1']['is_winner'] else "player1", {})
                
                for set_num, score in winner_scores.items():
                    row[f"winner_{set_num}"] = score
                for set_num, score in loser_scores.items():
                    row[f"loser_{set_num}"] = score
                
                if summary['player1']['is_winner']:
                    for stat_name, values in stats.items():
                        row[f"winner_{stat_name}"] = values[0]
                        row[f"loser_{stat_name}"] = values[1]
                else:
                    for stat_name, values in stats.items():
                        row[f"winner_{stat_name}"] = values[1]
                        row[f"loser_{stat_name}"] = values[0]
            
            rows.append(row)
    
    return pd.DataFrame(rows)

df = transform_data_to_dataframe(livesport)
df

Unnamed: 0,url,tournament_location,tournament_country,surface,indoor_or_outdoor,round_name,date,time,comment,winner_name,...,winner_Set 4,winner_Set 5,loser_Set 4,loser_Set 5,winner_Distance Covered (metres),loser_Distance Covered (metres),winner_Average 1st Serve Speed,loser_Average 1st Serve Speed,winner_Average 2nd Serve Speed,loser_Average 2nd Serve Speed
0,https://www.livesport.com/en/match/hrB0HLYB/#/...,BRISBANE,AUSTRALIA,HARD,outdoor,FINAL,2024-01-07,07:55,FINISHED,Dimitrov G.,...,,,,,,,,,,
1,https://www.livesport.com/en/match/rulXp2l4/#/...,BRISBANE,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,11:40,FINISHED,Dimitrov G.,...,,,,,,,,,,
2,https://www.livesport.com/en/match/nBebMPXT/#/...,BRISBANE,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,06:05,FINISHED,Rune H.,...,,,,,,,,,,
3,https://www.livesport.com/en/match/8tKKk33i/#/...,BRISBANE,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,11:35,FINISHED,Thompson J.,...,,,,,,,,,,
4,https://www.livesport.com/en/match/vaNo5gB0/#/...,BRISBANE,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,08:10,FINISHED,Dimitrov G.,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,https://www.livesport.com/en/match/S2xWCS4T/#/...,HANGZHOU,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,06:40,FINISHED,Hijikata R.,...,,,,,,,,,,
2684,https://www.livesport.com/en/match/hpOm0kLN/#/...,HANGZHOU,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,05:40,FINISHED,Nishioka Y.,...,,,,,,,,,,
2685,https://www.livesport.com/en/match/WzqNElzH/#/...,HANGZHOU,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,13:40,FINISHED,Cilic M.,...,,,,,,,,,,
2686,https://www.livesport.com/en/match/vs6lV9Kc/#/...,HANGZHOU,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,10:40,FINISHED,Kukushkin M.,...,,,,,,,,,,


In [7]:
set1 = set(matches["tournament_location"].str.lower())
set2 = set(df["tournament_location"].str.lower())
print(set2)
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

{'monte carlo', 'mallorca', 'belgrade 2', 'metz', 'wimbledon', 'indian wells', 'marrakech', 'brisbane', 'halle', 'almaty', 'basel', 'newport', 'acapulco', 'paris', 'london', 'buenos aires', 'doha', 'shanghai', 'eastbourne', 'geneva', 'rome', 'cincinnati', 'antwerp', 'gstaad', 'montpellier', 'bucharest', 'us open', 'miami', 'atlanta', 'vienna', 'stuttgart', 'hamburg', 'washington', 'kitzbuhel', 'hangzhou', 'lyon', 'umag', 'marseille', 'santiago', 'montreal', 'los cabos', 'dallas', 'madrid', 'winston-salem', 'adelaide', 'bastad', 'barcelona', 'hong kong', 'houston', 'hertogenbosch', 'beijing', 'australian open', 'tokyo', 'french open', 'rio de janeiro', 'dubai', 'rotterdam', 'estoril', 'cordoba', 'stockholm', 'chengdu', 'delray beach', 'auckland', 'munich'}
Values in only in df1:
{'antalya', 'marbella', 'banja luka', 'toronto', 'cologne 2', 'cagliari', 'seoul', 'parma', 'sardinia', "'s-hertogenbosch", 'moscow', 'budapest', 'shenzhen', 'florence', 'san diego', 'paris 2', 'melbourne', 'ist

In [8]:
df['tournament_location'] = df['tournament_location'].str.lower()

mapping = {
    'hertogenbosch': "'s-hertogenbosch",
    'paris': 'paris 2',
    'london': 'queens club',
    'australian open': 'melbourne',
    'french open': 'paris',
    'wimbledon': 'london',
    'us open': 'new york',
}
df['tournament_location'] = df['tournament_location'].replace(mapping)

In [9]:
set1 = set(matches["tournament_location"].str.lower())
set2 = set(matches2023["tournament_location"].str.lower())
set3 = set(df["tournament_location"].str.lower())
print(set2)
unique_to_df1 = set2 - set3
unique_to_df2 = set3 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

{'monte carlo', 'mallorca', 'metz', 'banja luka', 'indian wells', 'marrakech', 'toronto', 'halle', 'basel', 'newport', 'acapulco', "'s-hertogenbosch", 'pune', 'paris', 'london', 'buenos aires', 'doha', 'shanghai', 'paris 2', 'melbourne', 'eastbourne', 'geneva', 'rome', 'cincinnati', 'antwerp', 'gstaad', 'montpellier', 'new york', 'miami', 'atlanta', 'vienna', 'stuttgart', 'hamburg', 'washington', 'adelaide 1', 'kitzbuhel', 'nur-sultan', 'lyon', 'umag', 'marseille', 'santiago', 'sofia', 'los cabos', 'dallas', 'madrid', 'queens club', 'winston-salem', 'bastad', 'barcelona', 'houston', 'beijing', 'tokyo', 'adelaide 2', 'zhuhai', 'rio de janeiro', 'dubai', 'rotterdam', 'estoril', 'cordoba', 'stockholm', 'chengdu', 'delray beach', 'auckland', 'munich'}
Values in only in df1:
{'banja luka', 'adelaide 1', 'adelaide 2', 'nur-sultan', 'toronto', 'zhuhai', 'sofia', 'pune'}

Values in only in df2:
{'hong kong', 'hangzhou', 'almaty', 'bucharest'}


In [10]:
df['winner_name'] = df['winner_name'].str.lower()
df['loser_name'] = df['loser_name'].str.lower()

set1 = set(bets2024["Winner"].str.lower())
set2 = set(df["winner_name"].str.lower())
set3 = set(bets2024["Loser"].str.lower())
set4 = set(df["loser_name"].str.lower())

set1.update(set3)
set2.update(set4)


unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
{'gomez f.', 'mpetshi g.', 'schwaerzler j.', 'carreno busta p.', 'struff j.l.', 'varillas j.p.', 'cerundolo j.m.', 'herbert p.h.', 'ramos-vinolas a.', 'burruchaga r.', 'galan d.e.', 'rehberg m.', 'bautista agut r.', 'tseng c.h.', 'barrios m.', 'o connell c.', 'blanch d.', 'zhang zh.', 'hong s.', 'kwon s.w.', 'etcheverry t.', 'bailly g.', 'huesler m.a.', 'tirante t.a.'}

Values in only in df2:
{'blanch dar.', 'tirante t. a.', 'zhang z.', 'huesler m.', 'mpetshi perricard g.', 'schwaerzler j. j.', 'herbert p.', 'rehberg m. h.', 'burruchaga r. a.', 'bailly g. a.', 'ramos a.', 'carreno-busta p.', 'gomez f. a.', 'tseng c. h.', 'etcheverry t. m.', 'barrios vera t.', 'hong s. c.', 'varillas j. p.', 'bautista-agut r.', 'galan d. e.', "o'connell c.", 'struff j-l.', 'kwon s.', 'cerundolo j. m.'}


In [11]:
from fuzzywuzzy import process
fixed_names = {}
for name in set2:
    closest_match = process.extractOne(name, set1)
    if closest_match[0] != name and closest_match[1] > 90:
        print(name, closest_match[0])
        fixed_names[name] = closest_match[0]
    else:
        fixed_names[name] = name



bautista-agut r. bautista agut r.
o'connell c. o connell c.
tirante t. a. tirante t.a.
herbert p. herbert p.h.
cerundolo j. m. cerundolo j.m.
galan d. e. galan d.e.
huesler m. huesler m.a.
carreno-busta p. carreno busta p.
burruchaga r. a. burruchaga r.
bailly g. a. bailly g.
gomez f. a. gomez f.
varillas j. p. varillas j.p.
struff j-l. struff j.l.
kwon s. kwon s.w.
zhang z. zhang zh.
schwaerzler j. j. schwaerzler j.
rehberg m. h. rehberg m.
tseng c. h. tseng c.h.
etcheverry t. m. etcheverry t.


In [12]:
unique_to_df1 = set1 - set(fixed_names.values())
unique_to_df2 = set(fixed_names.values()) - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print()
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
{'hong s.', 'mpetshi g.', 'ramos-vinolas a.', 'barrios m.', 'blanch d.'}

Values in only in df2:
{'blanch dar.', 'barrios vera t.', 'ramos a.', 'hong s. c.', 'mpetshi perricard g.'}


In [13]:
manual_mapping = {
    'hong s. c.': 'hong s.',
    'ramos a.': 'ramos-vinolas a.',
    'blanch dar.': 'blanch d.',
    'mpetshi perricard g.': 'mpetshi g.',
}
print(sorted(set(df['winner_name'].unique())))
df['winner_name'] = df['winner_name'].replace(fixed_names)
df['loser_name'] = df['loser_name'].replace(fixed_names)

df['winner_name'] = df['winner_name'].replace(manual_mapping)
df['loser_name'] = df['loser_name'].replace(manual_mapping)
manual_mapping = {
    'Barrios M.': 'Barrios Vera T.'
}
bets2024['Winner'] = bets2024['Winner'].replace(manual_mapping)
bets2024['Loser'] = bets2024['Loser'].replace(manual_mapping)


['ajdukovic d.', 'albot r.', 'alcaraz c.', 'altmaier d.', 'arnaldi m.', 'atmane t.', 'auger-aliassime f.', 'baez s.', 'bagnis f.', 'barrere g.', 'barrios vera t.', 'bautista-agut r.', 'bellucci m.', 'bergs z.', 'berrettini m.', 'blanchet u.', 'bolt a.', 'bonzi b.', 'borges n.', 'broady l.', 'brouwer g.', 'bu y.', 'bublik a.', 'burruchaga r. a.', 'cachin p.', 'carballes baena r.', 'carreno-busta p.', 'cazaux a.', 'cerundolo f.', 'cerundolo j. m.', 'choinski j.', 'cilic m.', 'cobolli f.', 'comesana f.', 'coria f.', 'coric b.', 'cressy m.', 'damm m.', 'daniel t.', 'darderi l.', 'davidovich fokina a.', 'de jong j.', 'de minaur a.', 'dellien h.', 'diallo g.', 'diaz acosta f.', 'dimitrov g.', 'djere l.', 'djokovic n.', 'dodig m.', 'draper j.', 'duckworth j.', 'engel j.', 'etcheverry t. m.', 'eubanks c.', 'evans d.', 'fearnley j.', 'fils a.', 'fognini f.', 'fonseca j.', 'fritz t.', 'fucsovics m.', 'galan d. e.', 'garin c.', 'gasquet r.', 'gaston h.', 'gigante m.', 'giron m.', 'goffin d.', 'go

In [14]:
set1 = set(bets2024["Winner"].str.lower())
set2 = set(df["winner_name"].str.lower())
set3 = set(bets2024["Loser"].str.lower())
set4 = set(df["loser_name"].str.lower())
set1.update(set3)
set2.update(set4)


unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
set()
Values in only in df2:
set()


In [15]:
set1 = set(bets2024["Location"].str.lower())
set2 = set(df["tournament_location"].str.lower())
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
{'belgrade', 'turin', 'dubai '}
Values in only in df2:
{'dubai', 'paris 2', 'belgrade 2'}


In [16]:
bets2024.loc[bets2024['Tournament']== 'BNP Paribas Masters', 'Location'] ='Paris 2'
bets2024.loc[bets2024['Tournament']== 'Belgrade Open', 'Location'] ='Belgrade 2'
bets2024.loc[bets2024['Location']== 'Dubai ', 'Location'] ='Dubai'

In [17]:
bets2024 = bets2024[~bets2024['Tournament'].isin(['Masters Cup'])]

In [18]:
set1 = set(bets2024["Location"].str.lower())
set2 = set(df["tournament_location"].str.lower())
unique_to_df1 = set1 - set2
unique_to_df2 = set2 - set1

print(f"Values in only in df1:")
print(unique_to_df1)
print(f"Values in only in df2:")
print(unique_to_df2)

Values in only in df1:
set()
Values in only in df2:
set()


In [19]:
import random

map_loser_name_id = github_df.groupby('shortened_loser_name', as_index=False)[['shortened_loser_name', 'loser_id']].first()
map_winner_name_id = github_df.groupby('shortened_winner_name', as_index=False)[['shortened_winner_name', 'winner_id']].first()

mapping = {**dict(zip(map_loser_name_id['shortened_loser_name'], map_loser_name_id['loser_id'])),
           **dict(zip(map_winner_name_id['shortened_winner_name'], map_winner_name_id['winner_id']))}

existing_ids = set(mapping.values())

def generate_or_get_id(player_name):
    if player_name in mapping:
        return mapping[player_name]
    new_id = str(random.randint(100000, 999999))
    while new_id in existing_ids:
        new_id = str(random.randint(100000, 999999))
    mapping[player_name] = new_id
    existing_ids.add(new_id)
    return new_id

bets2024['loser_id'] = bets2024['Loser'].apply(generate_or_get_id)
bets2024['winner_id'] = bets2024['Winner'].apply(generate_or_get_id)

In [20]:
print(bets2024.loc[bets2024['loser_id'].isna(), 'Loser'].unique())

[]


In [21]:
name_mapping = {name.lower(): name for name in bets2024['Loser']}
df['loser_name'] = df['loser_name'].str.lower().map(name_mapping).fillna(df['loser_name'])
name_mapping = {name.lower(): name for name in bets2024['Winner']}
df['winner_name'] = df['winner_name'].str.lower().map(name_mapping).fillna(df['winner_name'])
tournament_mapping = {tournament.lower(): tournament for tournament in bets2024['Location']}
df['tournament_location'] = df['tournament_location'].str.lower().map(tournament_mapping).fillna(df['tournament_location'])

In [22]:
map_player_name_player_id = bets2024.groupby('Loser', as_index=False)[['Loser', 'loser_id']].first()
mapping = dict(zip(map_player_name_player_id['Loser'], map_player_name_player_id['loser_id']))
df['loser_id'] = df['loser_name'].map(mapping)

In [23]:
print(df.loc[df['loser_id'].isna(), 'loser_name'].unique())

[]


In [24]:
map_player_name_player_id = bets2024.groupby('Winner', as_index=False)[['Winner', 'winner_id']].first()
mapping = dict(zip(map_player_name_player_id['Winner'], map_player_name_player_id['winner_id']))
df['winner_id'] = df['winner_name'].map(mapping)

In [25]:
print(df.loc[df['winner_id'].isna(), 'winner_name'].unique())

[]


In [26]:
bets2024['match_id'] = bets2024['Location'].astype(str) + '_' + '2024' + '_' +  bets2024['winner_id'].astype(str) + '_' + bets2024['loser_id'].astype(str)
df['match_id'] = df['tournament_location'].astype(str) + '_' + '2024' + '_' + df['winner_id'].astype(str) + '_' + df['loser_id'].astype(str)

In [27]:
print(bets2024['match_id'].nunique() == len(bets2024))
print(df['match_id'].nunique() == len(df))

True
True


In [28]:
print(bets2024[~bets2024['match_id'].isin(df['match_id'])]['match_id'].unique())
print(df[~df['match_id'].isin(bets2024['match_id'])]['match_id'].unique())

[]
[]


In [29]:
import json
with open(f"../../raw_data/tennis_explorer/player_details.json") as f:
        player_details = json.load(f)
print(player_details)

[{'Name': 'Hamza Reguig Samir', 'Country': 'Algeria', 'Date_of_birth': '2002-01-19', 'Plays': 'right'}, {'Name': 'Makhlouf Mohamed Nazim', 'Country': 'Algeria', 'Date_of_birth': '1996-09-12', 'Plays': 'right'}, {'Name': 'Sahtali Toufik', 'Country': 'Algeria', 'Date_of_birth': '1999-01-07', 'Plays': 'right'}, {'Name': 'Domingos Daniel', 'Country': 'Angola', 'Plays': 'right'}, {'Name': 'Maginley Herbert Jody', 'Country': 'Antigua and Barbuda', 'Height': '193 cm', 'Date_of_birth': '1995-06-07', 'Plays': 'right'}, {'Name': 'Baez Sebastian', 'Country': 'Argentina', 'Height': '170 cm', 'Date_of_birth': '2000-12-28', 'Plays': 'right'}, {'Name': 'Cerundolo Francisco', 'Country': 'Argentina', 'Height': '185 cm', 'Date_of_birth': '1998-08-13', 'Plays': 'right'}, {'Name': 'Etcheverry Tomas Martin', 'Country': 'Argentina', 'Height': '196 cm', 'Date_of_birth': '1999-07-18', 'Plays': 'right'}, {'Name': 'Navone Mariano', 'Country': 'Argentina', 'Date_of_birth': '2001-02-27', 'Plays': 'right'}, {'Name

In [30]:
def manual_name_normalization(full_name):
    if full_name == "McCabe James":
        return "Mccabe J."
    if full_name == "Rehberg Max Hans":
        return "Rehberg M."
    if full_name == "Bailly Gilles Arnaud":
        return "Bailly G."
    if full_name == "Barrios Vera Marcelo Tomas":
        return "Barrios Vera T."
    if full_name == "O'Connell Christopher":
        return "O Connell C."
    if full_name == "Etcheverry Tomas Martin": 
        return 'Etcheverry T.'
    if full_name == "Zhang Zhizhen":
        return 'Zhang Zh.'
    if full_name == "Burruchaga Roman Andres":
        return "Burruchaga R."
    if full_name == "Meligeni Rodrigues Alves Felipe":
        return "Meligeni Alves F."
    if full_name == "Damm Martin (2003)":
        return "Damm M."
    if full_name == "Mpetshi Perricard Giovanni":
        return "Mpetshi G."
    if full_name == "McDonald Mackenzie":
        return "Mcdonald M."
    if full_name == "Hong Seong Chan":
        return "Hong S."
    if full_name == "Gomez Federico Agustin":
        return "Gomez F."
    return None

In [31]:
import re
def normalize_name(json_name, df_names):
    manual_case = manual_name_normalization(json_name)
    if manual_case:
        return manual_case
    json_parts = re.split(r'[ -]', json_name)
    json_parts = [part for part in json_parts if part]
    for df_name in df_names:
        df_parts = re.split(r'[ .-]', df_name)
        df_parts = [part for part in df_parts if part]
    
        exact_match = any(part in json_parts for part in df_parts)
        if not exact_match:
            continue
        match = True
      
        for json_part in json_parts:
            if json_part not in df_parts:
                if not any(df_part == f'{json_part[0]}' for df_part in df_parts):
                    match = False
                    break
        if match:
            return df_name
    
    
    return json_name

json_names = {d['Name'] for d in player_details}

normalized_names = {
    normalize_name(json_name, df['winner_name'].unique()) for json_name in json_names
    if normalize_name(json_name, df['winner_name'].unique()) is not None
}
missing_names = df[~df['loser_name'].isin(normalized_names)]['loser_name'].unique()

print("Missing Names:")
print(missing_names)

Missing Names:
['Tu L.' 'Gojo B.' 'Sachko V.' 'Mochizuki S.' 'Pannu K.' 'Valkusz M.'
 'Vesely J.' 'Jasika O.' 'Polmans M.' 'Kopriva V.' 'Prizmic D.'
 'Sweeny D.' 'Svrcina D.' 'Escoffier A.' 'Schwartzman D.' 'Neff A.'
 'Johnson S.' 'Sandgren T.' 'Shelbayh A.' 'Pacheco Mendez R.'
 'Escobedo E.' 'Holt B.' 'Molcan A.' 'Blanch D.' 'Sousa J.' 'Faria J.'
 'Ritschard A.' 'Dougaz A.' 'Benchetrit E.' 'Vacherot V.' 'Topo M.'
 'Gakhov I.' 'Taberner C.' 'Rincon D.' 'Hassan B.' 'Fanselow S.'
 'Barranco Cosano J.' 'Sanchez Izquierdo N.' 'Martineau M.'
 'Van Rijthoven T.' 'Otte O.' 'Broom C.' 'Searle H.' 'Ymer E.' 'Lajal M.'
 'Fery A.' 'Janvier M.' 'Pellegrino A.' 'Borg L.' 'Droguet T.'
 'Schwaerzler J.' 'Collarini A.' 'Couacaud E.' 'Poljicak M.'
 'Andreozzi G.' 'Mikrut L.' 'Martin A.' 'Pospisil V.' 'Mejia N.'
 'Kumar O.' 'Kirchheimer S.' 'Forbes M.' 'Jacquet K.' 'Kasnikowski M.'
 'Sun F.' 'Ramanathan R.' 'Gerasimov E.' 'Dzumhur D.' 'Blockx A.'
 'Vatutin A.' 'Collignon R.' 'Kym J.' 'Guinard M.' 'Papam

In [32]:
for player in player_details:
    player["normalized_name"] = normalize_name(player["Name"], df['winner_name'].unique())
for player in player_details:
    player["normalized_name"] = normalize_name(player["Name"], df['loser_name'].unique())

In [36]:
json_normalized_names = {d['normalized_name'] for d in player_details}

missing_names = df[~df['winner_name'].isin(json_normalized_names)]['winner_name'].unique()
print(missing_names)
missing_names = df[~df['loser_name'].isin(json_normalized_names)]['loser_name'].unique()
print(missing_names)

[]
[]


In [37]:
details_lookup = {player['normalized_name']: player for player in player_details}

for index, row in df.iterrows():
    winner_name = row['winner_name']
    if winner_name in details_lookup:
        winner_details = details_lookup[winner_name]
        for key, value in winner_details.items():
            if key != 'normalized_name':
                col_name = f"winner_{key}"
                if col_name not in df.columns:
                    df[col_name] = None 
                df.at[index, col_name] = value

    loser_name = row['loser_name']
    if loser_name in details_lookup:
        loser_details = details_lookup[loser_name]
        for key, value in loser_details.items():
            if key != 'normalized_name':
                col_name = f"loser_{key}"
                if col_name not in df.columns:
                    df[col_name] = None
                df.at[index, col_name] = value
df

Unnamed: 0,url,tournament_location,tournament_country,surface,indoor_or_outdoor,round_name,date,time,comment,winner_name,...,winner_Name,winner_Country,winner_Height,winner_Date_of_birth,winner_Plays,loser_Name,loser_Country,loser_Height,loser_Date_of_birth,loser_Plays
0,https://www.livesport.com/en/match/hrB0HLYB/#/...,Brisbane,AUSTRALIA,HARD,outdoor,FINAL,2024-01-07,07:55,FINISHED,Dimitrov G.,...,Dimitrov Grigor,Bulgaria,191 cm,1991-05-16,right,Rune Holger,Denmark,185 cm,2003-04-29,right
1,https://www.livesport.com/en/match/rulXp2l4/#/...,Brisbane,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,11:40,FINISHED,Dimitrov G.,...,Dimitrov Grigor,Bulgaria,191 cm,1991-05-16,right,Thompson Jordan,Australia,183 cm,1994-04-20,right
2,https://www.livesport.com/en/match/nBebMPXT/#/...,Brisbane,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,06:05,FINISHED,Rune H.,...,Rune Holger,Denmark,185 cm,2003-04-29,right,Safiullin Roman,Russia,185 cm,1997-08-07,right
3,https://www.livesport.com/en/match/8tKKk33i/#/...,Brisbane,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,11:35,FINISHED,Thompson J.,...,Thompson Jordan,Australia,183 cm,1994-04-20,right,Nadal Rafael,Spain,185 cm,1986-06-03,left
4,https://www.livesport.com/en/match/vaNo5gB0/#/...,Brisbane,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,08:10,FINISHED,Dimitrov G.,...,Dimitrov Grigor,Bulgaria,191 cm,1991-05-16,right,Hijikata Rinky,Australia,178 cm,2001-02-23,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,https://www.livesport.com/en/match/S2xWCS4T/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,06:40,FINISHED,Hijikata R.,...,Hijikata Rinky,Australia,178 cm,2001-02-23,right,Dzumhur Damir,Bosnia and Herzeg.,175 cm,1992-05-20,right
2684,https://www.livesport.com/en/match/hpOm0kLN/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,05:40,FINISHED,Nishioka Y.,...,Nishioka Yoshihito,Japan,170 cm,1995-09-27,left,Marterer Maximilian,Germany,191 cm,1995-06-15,left
2685,https://www.livesport.com/en/match/WzqNElzH/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,13:40,FINISHED,Cilic M.,...,Cilic Marin,Croatia,198 cm,1988-09-28,right,Svajda Zachary,USA,175 cm,,right
2686,https://www.livesport.com/en/match/vs6lV9Kc/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,10:40,FINISHED,Kukushkin M.,...,Kukushkin Mikhail,Kazakhstan,183 cm,1987-12-26,right,Darderi Luciano,Italy,,2002-02-14,right


In [38]:
df.columns

Index(['url', 'tournament_location', 'tournament_country', 'surface',
       'indoor_or_outdoor', 'round_name', 'date', 'time', 'comment',
       'winner_name', 'winner_rank', 'winner_seed', 'loser_name', 'loser_rank',
       'loser_seed', 'winner_sets', 'loser_sets', 'duration', 'winner_Set 1',
       'winner_Set 2', 'loser_Set 1', 'loser_Set 2', 'winner_Aces',
       'loser_Aces', 'winner_Double Faults', 'loser_Double Faults',
       'winner_1st Serve Percentage', 'loser_1st Serve Percentage',
       'winner_1st Serve Points Won', 'loser_1st Serve Points Won',
       'winner_2nd Serve Points Won', 'loser_2nd Serve Points Won',
       'winner_Break Points Saved', 'loser_Break Points Saved',
       'winner_1st Return Points Won', 'loser_1st Return Points Won',
       'winner_2nd Return Points Won', 'loser_2nd Return Points Won',
       'winner_Break Points Converted', 'loser_Break Points Converted',
       'winner_Winners', 'loser_Winners', 'winner_Unforced Errors',
       'loser_Unfor

In [39]:
matches.columns

Index(['tournament_location', 'tournament_name', 'Date', 'tournament_level',
       'indoor_or_outdoor', 'Surface', 'Round', 'W1', 'L1', 'W2', 'L2', 'W3',
       'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment', 'AvgW',
       'AvgL', 'loser_id', 'winner_id', 'match_id', 'tournament_id',
       'draw_size', 'tournament_date', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_seed', 'loser_entry', 'loser_name', 'loser_hand', 'loser_ht',
       'loser_ioc', 'loser_age', 'best_of', 'minutes', 'w_ace', 'w_df',
       'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved',
       'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
       'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank',
       'winner_rank_points', 'loser_rank', 'loser_rank_points', 'time'],
      dtype='object')

In [40]:
df[['w_1stWon', 'w_1stIn']] = df['winner_1st Serve Points Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['winner_1st Serve Points Won'], inplace=True)

df[['l_1stWon', 'l_1stIn']] = df['loser_1st Serve Points Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['loser_1st Serve Points Won'], inplace=True)

df[['to_drop', 'w_SvGms']] = df['winner_Service Games Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['winner_Service Games Won'], inplace=True)

df[['to_drop', 'l_SvGms']] = df['loser_Service Games Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['loser_Service Games Won'], inplace=True)

df[['w_2ndWon', 'to_drop']] = df['winner_2nd Serve Points Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['winner_2nd Serve Points Won'], inplace=True)

df[['l_2ndWon', 'to_drop']] = df['loser_2nd Serve Points Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['loser_2nd Serve Points Won'], inplace=True)

df[['w_bpSaved', 'w_bpFaced']] = df['winner_Break Points Saved'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['winner_Break Points Saved'], inplace=True)

df[['l_bpSaved', 'l_bpFaced']] = df['loser_Break Points Saved'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['loser_Break Points Saved'], inplace=True)

df[['to_drop', 'w_svpt']] = df['winner_Service Points Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['winner_Service Points Won'], inplace=True)

df[['to_drop', 'l_svpt']] = df['loser_Service Points Won'].str.extract(r'\((\d+)/(\d+)\)')
df.drop(columns=['loser_Service Points Won'], inplace=True)

In [41]:
rename_mapping = {
    'location': 'tournament_location',
    'surface': 'Surface',
    'comment': 'Comment',
    'round_name': 'Round',
    'date': 'Date',
    'time': 'time',
    'winner_name': 'winner_name',
    'loser_name': 'loser_name',
    'winner_Set 1': 'W1',
    'loser_Set 1': 'L1',
    'winner_Set 2': 'W2',
    'loser_Set 2': 'L2',
    'winner_Set 3': 'W3',
    'loser_Set 3': 'L3',
    'winner_Set 4': 'W4',
    'loser_Set 4': 'L4',
    'winner_Set 5': 'W5',
    'loser_Set 5': 'L5',
    'winner_sets': 'Wsets',
    'loser_sets': 'Lsets',
    'duration': 'minutes',
    'winner_Aces': 'w_ace',
    'loser_Aces': 'l_ace',
    'winner_Double Faults': 'w_df',
    'loser_Double Faults': 'l_df',
    'winner_Break Points Saved': 'w_bpSaved',
    'loser_Break Points Saved': 'l_bpSaved',
    'winner_rank': 'winner_rank',
    'loser_rank': 'loser_rank',
    'winner_Country': 'winner_ioc',
    'loser_Country': 'loser_ioc',
    'winner_Plays': 'winner_hand',
    'loser_Plays': 'loser_hand'
}
columns_to_drop = [
    'url'
]

df.rename(columns=rename_mapping, inplace=True)

In [42]:
missing_columns = [col for col in matches if col not in df.columns]
if missing_columns:
    print(f"Missing columns in pd_df_1 that need to be added: {missing_columns}")

Missing columns in pd_df_1 that need to be added: ['tournament_name', 'tournament_level', 'AvgW', 'AvgL', 'tournament_id', 'draw_size', 'tournament_date', 'winner_entry', 'winner_ht', 'winner_age', 'loser_entry', 'loser_ht', 'loser_age', 'best_of', 'winner_rank_points', 'loser_rank_points']


In [43]:
print(df['loser_seed'].unique())
matches['loser_entry'].unique()

['1' '' 'Q/LL' '8' '7' '6' '5' '3' '4' '2' 'WC' 'Q' 'PR' 'LL' '9' '12'
 '19' '10' '15' '20' '27' '13' '11' '28' '21' '14' '29' '30' '16' '26'
 '23' '24' '32' '17' '22' '25' '31' '18' 'Alt' '33' 'SE']


array([nan, 'Q', 'WC', 'LL', 'PR', 'ALT', 'SE', 'Alt'], dtype=object)

In [44]:
def process_seed(value):
    if value.isdigit() or value == '':
        return value, ''
    elif value == 'Q/LL':
        return '', 'Q'
    else:
        return '', value

df['winner_seed'], df['winner_entry'] = zip(*df['winner_seed'].apply(process_seed))
df['loser_seed'], df['loser_entry'] = zip(*df['loser_seed'].apply(process_seed))


In [45]:
print(df['loser_seed'].unique())
print(matches['loser_seed'].unique())

['1' '' '8' '7' '6' '5' '3' '4' '2' '9' '12' '19' '10' '15' '20' '27' '13'
 '11' '28' '21' '14' '29' '30' '16' '26' '23' '24' '32' '17' '22' '25'
 '31' '18' '33']
[ 6. nan  5.  7.  8.  4.  1.  2.  3. 11. 16. 27. 18. 22. 20. 32. 31. 13.
  9. 23. 30. 15. 28. 26. 12. 21. 29. 10. 24. 17. 25. 14. 19. 33.]


In [46]:
print(df['loser_entry'].unique())
print(matches['loser_entry'].unique())

['' 'Q' 'WC' 'PR' 'LL' 'Alt' 'SE']
[nan 'Q' 'WC' 'LL' 'PR' 'ALT' 'SE' 'Alt']


In [47]:
from datetime import datetime
def calculate_age(dob):
    if pd.isna(dob):
        return None
    dob = datetime.strptime(dob, '%Y-%m-%d') 
    today = datetime.today()  
    return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))

df['winner_age'] = df['winner_Date_of_birth'].apply(calculate_age)
df['loser_age'] = df['loser_Date_of_birth'].apply(calculate_age)

In [48]:
bets2024.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL', 'loser_id', 'winner_id', 'match_id'],
      dtype='object')

In [49]:
df = pd.merge(
    df, 
    bets2024[['match_id', 'Series', 'AvgW', 'AvgL', 'Best of', 'Tournament']],
    on='match_id', 
    how='inner'
)

In [50]:
rename_mapping = {
    "Series": 'tournament_level',
    'Best of': 'best_of',
    'Tournament': 'tournament_name'
}
df.rename(columns=rename_mapping, inplace=True)

In [51]:
missing_columns = [col for col in matches if col not in df.columns]
if missing_columns:
    print(f"Missing columns in pd_df_1 that need to be added: {missing_columns}")

Missing columns in pd_df_1 that need to be added: ['tournament_id', 'draw_size', 'tournament_date', 'winner_ht', 'loser_ht', 'winner_rank_points', 'loser_rank_points']


In [52]:
df

Unnamed: 0,url,tournament_location,tournament_country,Surface,indoor_or_outdoor,Round,Date,time,Comment,winner_name,...,l_svpt,winner_entry,loser_entry,winner_age,loser_age,tournament_level,AvgW,AvgL,best_of,tournament_name
0,https://www.livesport.com/en/match/hrB0HLYB/#/...,Brisbane,AUSTRALIA,HARD,outdoor,FINAL,2024-01-07,07:55,FINISHED,Dimitrov G.,...,95,,,33.0,21.0,ATP250,1.93,1.86,3,Brisbane International
1,https://www.livesport.com/en/match/rulXp2l4/#/...,Brisbane,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,11:40,FINISHED,Dimitrov G.,...,62,,,33.0,30.0,ATP250,1.23,4.07,3,Brisbane International
2,https://www.livesport.com/en/match/nBebMPXT/#/...,Brisbane,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,06:05,FINISHED,Rune H.,...,66,,,21.0,27.0,ATP250,1.56,2.40,3,Brisbane International
3,https://www.livesport.com/en/match/8tKKk33i/#/...,Brisbane,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,11:35,FINISHED,Thompson J.,...,108,,,30.0,38.0,ATP250,5.54,1.14,3,Brisbane International
4,https://www.livesport.com/en/match/vaNo5gB0/#/...,Brisbane,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,08:10,FINISHED,Dimitrov G.,...,57,,,33.0,23.0,ATP250,1.13,5.88,3,Brisbane International
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,https://www.livesport.com/en/match/S2xWCS4T/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,06:40,FINISHED,Hijikata R.,...,99,,,23.0,32.0,ATP250,1.74,2.08,3,Hangzhou Open
2684,https://www.livesport.com/en/match/hpOm0kLN/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,05:40,FINISHED,Nishioka Y.,...,45,,,29.0,29.0,ATP250,1.37,3.05,3,Hangzhou Open
2685,https://www.livesport.com/en/match/WzqNElzH/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,13:40,FINISHED,Cilic M.,...,77,WC,,36.0,,ATP250,1.76,2.04,3,Hangzhou Open
2686,https://www.livesport.com/en/match/vs6lV9Kc/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,10:40,FINISHED,Kukushkin M.,...,56,,,37.0,22.0,ATP250,2.19,1.66,3,Hangzhou Open


In [53]:
df.drop(columns=['winner_Winners', 'loser_Winners', 'winner_Unforced Errors', 'loser_Unforced Errors', 'winner_Net Points Won', 'loser_Net Points Won', 'winner_Return Points Won',
                 'loser_Return Points Won', 'winner_Total Points Won', 'loser_Total Points Won', 'winner_Last 10 Balls', 'loser_Last 10 Balls', 'winner_Match Points Saved', 'loser_Match Points Saved', 'winner_Return Games Won', 'loser_Return Games Won', 'winner_Total Games Won', 'loser_Total Games Won', 'winner_1st Serve Percentage', 'loser_1st Serve Percentage', 'winner_1st Return Points Won', 'loser_1st Return Points Won', 'winner_2nd Return Points Won', 'loser_2nd Return Points Won', 'winner_Break Points Converted', 
                 'loser_Break Points Converted', 'winner_Distance Covered (metres)', 'loser_Distance Covered (metres)', 'winner_Average 1st Serve Speed', 'loser_Average 1st Serve Speed', 'winner_Average 2nd Serve Speed', 'loser_Average 2nd Serve Speed', 'to_drop', 'winner_Height', 'loser_Height'], inplace=True)

In [56]:
df.to_csv("../data/matches_2024_raw.csv", index=False)

In [54]:
missing_values = df.isnull().sum()

columns_with_nan = missing_values[missing_values > 0]

print("Columns with NaN values and their counts:")
columns_with_nan

Columns with NaN values and their counts:


W1                        29
W2                        65
L1                        29
L2                        65
w_ace                     19
l_ace                     19
w_df                      19
l_df                      19
W3                      1441
L3                      1441
W4                      2402
W5                      2569
L4                      2402
L5                      2569
winner_Date_of_birth       6
loser_Date_of_birth       14
w_1stWon                  19
w_1stIn                   19
l_1stWon                  19
l_1stIn                   19
w_SvGms                   19
l_SvGms                   19
w_2ndWon                  19
l_2ndWon                  19
w_bpSaved                 19
w_bpFaced                 19
l_bpSaved                 19
l_bpFaced                 19
w_svpt                    19
l_svpt                    19
winner_age                 6
loser_age                 14
AvgW                       5
AvgL                       5
dtype: int64

In [56]:
df['Comment'] = df['Comment'].apply(lambda x: 'RETIRED' if 'RETIRED' in x else x)
df['Comment'] = df['Comment'].str.lower()
df['Comment'].unique()

array(['finished', 'walkover', 'retired', 'awarded'], dtype=object)

In [57]:
columns_to_update = ['W1', 'W2', 'W3', 'W4', 'W5', 'L1', 'L2', 'L3', 'L4', 'L5','Wsets','Lsets', 'minutes']
df[columns_to_update] = df[columns_to_update].replace('', np.nan)
df.loc[df['Comment'].isin(['disqualified','retired', 'walkover', 'awarded']), columns_to_update]=df.loc[df['Comment'].isin(['disqualified','retired', 'walkover', 'awarded']), columns_to_update].fillna(0)

  df[columns_to_update] = df[columns_to_update].replace('', np.nan)
  df.loc[df['Comment'].isin(['disqualified','retired', 'walkover', 'awarded']), columns_to_update]=df.loc[df['Comment'].isin(['disqualified','retired', 'walkover', 'awarded']), columns_to_update].fillna(0)


In [58]:
values_to_encode = ['WC', 'Q', 'LL']

df['winner_entry'] = df['winner_entry'].where(df['winner_entry'].isin(values_to_encode))
df['loser_entry'] = df['loser_entry'].where(df['loser_entry'].isin(values_to_encode))

df = pd.get_dummies(df, columns=['winner_entry'], prefix='winner_entry', prefix_sep='_')
df = pd.get_dummies(df, columns=['loser_entry'], prefix='loser_entry', prefix_sep='_')

In [59]:
df['winner_seed'].unique()

array(['2', '1', '', '4', '8', '3', '6', '5', '7', '9', '12', '19', '10',
       '20', '15', '27', '13', '21', '28', '14', '11', '29', '30', '16',
       '26', '32', '24', '23', '25', '17', '22', '18', '31', '33'],
      dtype=object)

In [60]:
import numpy as np
df['winner_seed'] = df['winner_seed'].replace('', np.nan)
df['loser_seed'] = df['loser_seed'].replace('', np.nan)

df['winner_is_seeded'] = df['winner_seed'].notna().astype(int)
df['loser_is_seeded'] = df['loser_seed'].notna().astype(int)
df=df.drop(columns=['winner_seed', 'loser_seed'])

In [61]:
average_minutes = df[df['Comment'] == 'completed'].groupby('best_of')['minutes'].mean().round()
df.loc[(df['Comment'] == 'completed') & (df['minutes'].isnull()), 'minutes'] = df['best_of'].map(average_minutes)

In [62]:
df['winner_rank'] = df['winner_rank'].replace('', np.nan)
df['loser_rank'] = df['loser_rank'].replace('', np.nan)

df.loc[df['winner_rank'].isnull(), 'winner_rank'] = 2000
df.loc[df['loser_rank'].isnull(), 'loser_rank'] = 2000

In [63]:
missing_values = df.isnull().sum()

columns_with_nan = missing_values[missing_values > 0]

print("Columns with NaN values and their counts:")
columns_with_nan

Columns with NaN values and their counts:


w_ace                     19
l_ace                     19
w_df                      19
l_df                      19
W3                      1353
L3                      1353
W4                      2307
W5                      2473
L4                      2307
L5                      2473
winner_Date_of_birth       6
loser_Date_of_birth       14
w_1stWon                  19
w_1stIn                   19
l_1stWon                  19
l_1stIn                   19
w_SvGms                   19
l_SvGms                   19
w_2ndWon                  19
l_2ndWon                  19
w_bpSaved                 19
w_bpFaced                 19
l_bpSaved                 19
l_bpFaced                 19
w_svpt                    19
l_svpt                    19
winner_age                 6
loser_age                 14
AvgW                       5
AvgL                       5
dtype: int64

In [64]:
df[df[('minutes')].isna()]

Unnamed: 0,url,tournament_location,tournament_country,Surface,indoor_or_outdoor,Round,Date,time,Comment,winner_name,...,best_of,tournament_name,winner_entry_LL,winner_entry_Q,winner_entry_WC,loser_entry_LL,loser_entry_Q,loser_entry_WC,winner_is_seeded,loser_is_seeded


In [65]:
missing_values = cleaned_matches.isnull().sum()

columns_with_nan = missing_values[missing_values > 0]

print("Columns with NaN values and their counts:")
columns_with_nan

Columns with NaN values and their counts:


W3            6832
L3            6832
W4           12171
L4           12171
W5           13119
L5           13119
AvgW            12
AvgL            12
winner_ht       37
loser_ht        99
w_ace          100
w_df           100
w_svpt         100
w_1stIn        100
w_1stWon       100
w_2ndWon       100
w_SvGms         99
w_bpSaved      100
w_bpFaced      100
l_ace          100
l_df           100
l_svpt         100
l_1stIn        100
l_1stWon       100
l_2ndWon       100
l_SvGms         99
l_bpSaved      100
l_bpFaced      100
dtype: int64

In [66]:
df['tournament_id'] = df['tournament_location'].astype(str) + '_2024'

In [67]:
missing_columns = [col for col in cleaned_matches if col not in df.columns]
if missing_columns:
    print(f"Missing columns in pd_df_1 that need to be added: {missing_columns}")

Missing columns in pd_df_1 that need to be added: ['draw_size', 'tournament_date', 'winner_ht', 'loser_ht', 'winner_rank_points', 'loser_rank_points']


In [68]:
missing_columns = [col for col in df if col not in cleaned_matches.columns]
if missing_columns:
    print(f"Missing columns in pd_df_2 that need to be added: {missing_columns}")

Missing columns in pd_df_2 that need to be added: ['url', 'tournament_country', 'winner_Name', 'winner_Date_of_birth', 'loser_Name', 'loser_Date_of_birth']


In [69]:
df.columns

Index(['url', 'tournament_location', 'tournament_country', 'Surface',
       'indoor_or_outdoor', 'Round', 'Date', 'time', 'Comment', 'winner_name',
       'winner_rank', 'loser_name', 'loser_rank', 'Wsets', 'Lsets', 'minutes',
       'W1', 'W2', 'L1', 'L2', 'w_ace', 'l_ace', 'w_df', 'l_df', 'W3', 'L3',
       'W4', 'W5', 'L4', 'L5', 'loser_id', 'winner_id', 'match_id',
       'winner_Name', 'winner_ioc', 'winner_Date_of_birth', 'winner_hand',
       'loser_Name', 'loser_ioc', 'loser_Date_of_birth', 'loser_hand',
       'w_1stWon', 'w_1stIn', 'l_1stWon', 'l_1stIn', 'w_SvGms', 'l_SvGms',
       'w_2ndWon', 'l_2ndWon', 'w_bpSaved', 'w_bpFaced', 'l_bpSaved',
       'l_bpFaced', 'w_svpt', 'l_svpt', 'winner_age', 'loser_age',
       'tournament_level', 'AvgW', 'AvgL', 'best_of', 'tournament_name',
       'winner_entry_LL', 'winner_entry_Q', 'winner_entry_WC',
       'loser_entry_LL', 'loser_entry_Q', 'loser_entry_WC', 'winner_is_seeded',
       'loser_is_seeded', 'tournament_id'],
      dty

In [70]:
columns_to_modify = ['W1', 'W2', 'W3', 'W4', 'W5', 'L1', 'L2', 'L3', 'L4', 'L5'] 

df[columns_to_modify] = df[columns_to_modify].map(lambda x: str(x).strip()[0] if pd.notna(x) and str(x).strip() else x)

In [71]:
df['minutes'] = df['minutes'].apply(
    lambda x: x if x == 0 else int(x.split(':')[0]) * 60 + int(x.split(':')[1])
)

In [72]:
df

Unnamed: 0,url,tournament_location,tournament_country,Surface,indoor_or_outdoor,Round,Date,time,Comment,winner_name,...,tournament_name,winner_entry_LL,winner_entry_Q,winner_entry_WC,loser_entry_LL,loser_entry_Q,loser_entry_WC,winner_is_seeded,loser_is_seeded,tournament_id
0,https://www.livesport.com/en/match/hrB0HLYB/#/...,Brisbane,AUSTRALIA,HARD,outdoor,FINAL,2024-01-07,07:55,finished,Dimitrov G.,...,Brisbane International,False,False,False,False,False,False,1,1,Brisbane_2024
1,https://www.livesport.com/en/match/rulXp2l4/#/...,Brisbane,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,11:40,finished,Dimitrov G.,...,Brisbane International,False,False,False,False,False,False,1,0,Brisbane_2024
2,https://www.livesport.com/en/match/nBebMPXT/#/...,Brisbane,AUSTRALIA,HARD,outdoor,SEMI-FINALS,2024-01-06,06:05,finished,Rune H.,...,Brisbane International,False,False,False,False,False,False,1,0,Brisbane_2024
3,https://www.livesport.com/en/match/8tKKk33i/#/...,Brisbane,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,11:35,finished,Thompson J.,...,Brisbane International,False,False,False,False,False,False,0,0,Brisbane_2024
4,https://www.livesport.com/en/match/vaNo5gB0/#/...,Brisbane,AUSTRALIA,HARD,outdoor,QUARTER-FINALS,2024-01-05,08:10,finished,Dimitrov G.,...,Brisbane International,False,False,False,False,False,False,1,0,Brisbane_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,https://www.livesport.com/en/match/S2xWCS4T/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,06:40,finished,Hijikata R.,...,Hangzhou Open,False,False,False,False,False,False,0,0,Hangzhou_2024
2684,https://www.livesport.com/en/match/hpOm0kLN/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-19,05:40,finished,Nishioka Y.,...,Hangzhou Open,False,False,False,False,False,False,1,0,Hangzhou_2024
2685,https://www.livesport.com/en/match/WzqNElzH/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,13:40,finished,Cilic M.,...,Hangzhou Open,False,False,True,False,False,False,0,0,Hangzhou_2024
2686,https://www.livesport.com/en/match/vs6lV9Kc/#/...,Hangzhou,CHINA,HARD,outdoor,1/16-FINALS,2024-09-18,10:40,finished,Kukushkin M.,...,Hangzhou Open,False,False,False,False,False,False,0,1,Hangzhou_2024


In [73]:
pd.set_option('display.max_rows', None)
print(df.dtypes)

url                      object
tournament_location      object
tournament_country       object
Surface                  object
indoor_or_outdoor        object
Round                    object
Date                     object
time                     object
Comment                  object
winner_name              object
winner_rank              object
loser_name               object
loser_rank               object
Wsets                   float64
Lsets                   float64
minutes                   int64
W1                       object
W2                       object
L1                       object
L2                       object
w_ace                    object
l_ace                    object
w_df                     object
l_df                     object
W3                       object
L3                       object
W4                       object
W5                       object
L4                       object
L5                       object
loser_id                 object
winner_i

In [74]:
pd.set_option('display.max_rows', None)
print(cleaned_matches.dtypes)

tournament_location     object
tournament_name         object
Date                    object
indoor_or_outdoor       object
Surface                 object
Round                   object
W1                     float64
L1                     float64
W2                     float64
L2                     float64
W3                     float64
L3                     float64
W4                     float64
L4                     float64
W5                     float64
L5                     float64
Wsets                  float64
Lsets                  float64
Comment                 object
AvgW                   float64
AvgL                   float64
loser_id                 int64
winner_id                int64
match_id                object
tournament_id           object
draw_size                int64
tournament_level        object
tournament_date         object
winner_name             object
winner_hand             object
winner_ht              float64
winner_ioc              object
winner_a

In [75]:
for column in df.columns:
    if column in cleaned_matches.columns:
        target_dtype = cleaned_matches[column].dtype
        try:
            df[column] = df[column].astype(target_dtype)
        except ValueError as e:
            print(f"Could not convert column {column} to type {target_dtype}: {e}")

In [77]:
df['Date'] = pd.to_datetime(df['Date'])

In [78]:
round_mapping = {
    '1/64-FINALS': '1st Round' , 
    '1/32-FINALS': '2nd Round',
    '1/16-FINALS': '3rd Round',
    '1/8-FINALS': '4th Round', 
    'QUARTER-FINALS': 'Quarterfinals', 
    'SEMI-FINALS': 'Semifinals',
    'FINAL': 'The Final'
}

In [79]:
df['Round'] = df['Round'].replace(round_mapping)

In [80]:
df = df.sort_values(by='Date', ascending=True)

In [81]:
df

Unnamed: 0,url,tournament_location,tournament_country,Surface,indoor_or_outdoor,Round,Date,time,Comment,winner_name,...,tournament_name,winner_entry_LL,winner_entry_Q,winner_entry_WC,loser_entry_LL,loser_entry_Q,loser_entry_WC,winner_is_seeded,loser_is_seeded,tournament_id
30,https://www.livesport.com/en/match/rwFlQphr/#/...,Brisbane,AUSTRALIA,HARD,outdoor,3rd Round,2023-12-31,06:00,finished,Popyrin A.,...,Brisbane International,False,False,False,False,False,False,0,0,Brisbane_2024
29,https://www.livesport.com/en/match/GjDdOOMf/#/...,Brisbane,AUSTRALIA,HARD,outdoor,3rd Round,2023-12-31,06:40,finished,Shevchenko A.,...,Brisbane International,False,False,False,False,False,False,0,0,Brisbane_2024
27,https://www.livesport.com/en/match/CI65M2i7/#/...,Brisbane,AUSTRALIA,HARD,outdoor,3rd Round,2024-01-01,07:10,finished,Rune H.,...,Brisbane International,False,False,False,False,False,False,1,0,Brisbane_2024
56,https://www.livesport.com/en/match/Y37NMQ3t/#/...,Hong Kong,HONG KONG,HARD,outdoor,3rd Round,2024-01-01,09:35,finished,Kotov P.,...,Hong Kong Tennis Open,False,False,False,False,False,False,0,0,Hong Kong_2024
57,https://www.livesport.com/en/match/tUAVK4Yh/#/...,Hong Kong,HONG KONG,HARD,outdoor,3rd Round,2024-01-01,07:10,finished,Shang J.,...,Hong Kong Tennis Open,False,False,True,False,False,False,0,1,Hong Kong_2024
28,https://www.livesport.com/en/match/IFceqQbK/#/...,Brisbane,AUSTRALIA,HARD,outdoor,3rd Round,2024-01-01,04:20,finished,Safiullin R.,...,Brisbane International,False,False,False,False,False,False,0,1,Brisbane_2024
54,https://www.livesport.com/en/match/zT3vJr35/#/...,Hong Kong,HONG KONG,HARD,outdoor,3rd Round,2024-01-01,13:15,finished,Van De Zandschulp B.,...,Hong Kong Tennis Open,False,False,False,False,False,True,0,0,Hong Kong_2024
26,https://www.livesport.com/en/match/vPbip6qE/#/...,Brisbane,AUSTRALIA,HARD,outdoor,3rd Round,2024-01-01,09:50,finished,Dimitrov G.,...,Brisbane International,False,False,False,False,False,False,1,0,Brisbane_2024
55,https://www.livesport.com/en/match/Oh0zKOma/#/...,Hong Kong,HONG KONG,HARD,outdoor,3rd Round,2024-01-01,10:20,finished,Ruusuvuori E.,...,Hong Kong Tennis Open,False,False,False,False,False,False,0,0,Hong Kong_2024
52,https://www.livesport.com/en/match/Cjl0PTN8/#/...,Hong Kong,HONG KONG,HARD,outdoor,3rd Round,2024-01-02,07:05,finished,Kecmanovic M.,...,Hong Kong Tennis Open,False,False,False,False,True,False,0,0,Hong Kong_2024


In [82]:
missing_values = cleaned_matches.isnull().sum()

columns_with_nan = missing_values[missing_values > 0]

print("Columns with NaN values and their counts:")
columns_with_nan

Columns with NaN values and their counts:


W3            6832
L3            6832
W4           12171
L4           12171
W5           13119
L5           13119
AvgW            12
AvgL            12
winner_ht       37
loser_ht        99
w_ace          100
w_df           100
w_svpt         100
w_1stIn        100
w_1stWon       100
w_2ndWon       100
w_SvGms         99
w_bpSaved      100
w_bpFaced      100
l_ace          100
l_df           100
l_svpt         100
l_1stIn        100
l_1stWon       100
l_2ndWon       100
l_SvGms         99
l_bpSaved      100
l_bpFaced      100
dtype: int64

In [83]:
df.to_csv("../data/matches_2024.csv", index=False)