In [1]:
import json
import pandas as pd

In [2]:
# Load the merged matches table; a later cell adds a `time` column to this frame.
matches = pd.read_csv(filepath_or_buffer="../data/merged_matches.csv")

In [3]:
# Combine the per-season scrape files (2018-2023) into a single dict.
# NOTE: dict.update is shallow, so a top-level key that occurs in more than
# one file keeps only the value from the last file read.
scraped_time_json = {}
for year in range(2018, 2024):
    # Decode explicitly as UTF-8. Without `encoding`, open() falls back to the
    # platform locale, which corrupts non-ASCII tournament names on machines
    # whose locale is not UTF-8 (the mojibake spelling of "Kitzbühel" handled
    # further down is consistent with exactly that).
    with open(
        f"../../raw_data/tennis_explorer/scraped_match_time_{year}.json",
        encoding="utf-8",
    ) as f:
        data = json.load(f)
    scraped_time_json.update(data)

### Tworzymy player_dict (nazwa zawodnika → player_id), żeby na podstawie nazw zawodników dopisać player_id z matches df do scraped_time_json

In [4]:
# Name -> id lookups taken from both sides of every match row.
# zip pairs the two columns row by row; a name that occurs on several rows
# keeps the id from its last row.
loser_dict = {
    name: player_id
    for name, player_id in zip(matches['loser_name'], matches['loser_id'])
}
winner_dict = {
    name: player_id
    for name, player_id in zip(matches['winner_name'], matches['winner_id'])
}

# Start from the losers and overlay the winners, so the winner-side id is the
# one kept for names that appear in both lookups.
player_dict = dict(loser_dict)
player_dict.update(winner_dict)
player_dict

{'Diego Schwartzman': 106043,
 'Steve Johnson': 105449,
 'Leonardo Mayer': 104919,
 'Frances Tiafoe': 126207,
 'John Patrick Smith': 105441,
 'Ernesto Escobedo': 124014,
 'Jordan Thompson': 111442,
 'Federico Delbonis': 105643,
 'Gilles Muller': 104180,
 'Denis Shapovalov': 133430,
 'Damir Dzumhur': 106000,
 'Peter Polansky': 105166,
 'Horacio Zeballos': 104547,
 'Matthew Ebden': 105051,
 'Mischa Zverev': 104999,
 'Milos Raonic': 105683,
 'Jared Donaldson': 111577,
 'Yannick Hanfmann': 105870,
 'Hyeon Chung': 111202,
 'John Millman': 105357,
 'Michael Mmoh': 111581,
 'Alexandr Dolgopolov': 105238,
 'Denis Istomin': 104797,
 'Kyle Edmund': 106378,
 'Alex De Minaur': 200282,
 'Grigor Dimitrov': 105777,
 'Ryan Harrison': 105992,
 'Cedrik Marcel Stebe': 105649,
 'Dudi Sela': 104534,
 'Evgeny Donskoy': 105539,
 'Thomas Fabbiano': 105341,
 'Florian Mayer': 104252,
 'Pablo Carreno Busta': 105807,
 'Andreas Haider Maurer': 104890,
 'Guillermo Garcia Lopez': 104198,
 'Filip Krajinovic': 105936,

### Zauważmy, że w matches df jedynie Jo-Wilfried Tsonga ma myślnik w nazwie, natomiast w scraped_time_json myślnik ma wielu zawodników; wykorzystamy to

In [5]:
# Print every name in player_dict that contains a hyphen.
for hyphenated_name in (name for name in player_dict if "-" in name):
    print(hyphenated_name)

Jo-Wilfried Tsonga


### Nazwy tych samych zawodników w obu zbiorach często różnią się kolejnością członów lub wielkością liter, dlatego uwzględniamy to przy dopasowywaniu nazw

In [6]:
def normalize_string(s):
    """Return a key for ``s`` that ignores letter case and word order.

    The string is lower-cased, split on whitespace, and its words are sorted
    and re-joined with single spaces, so "Schwartzman Diego" and
    "diego schwartzman" produce the same key.
    """
    words = s.lower().split()
    words.sort()
    return " ".join(words)

# Re-key player_dict by normalized name so lookups ignore case and word order.
# Names that normalize to the same key collapse into one entry (the later
# item of player_dict wins).
normalized_player_dict = dict(
    (normalize_string(name), player_id) for name, player_id in player_dict.items()
)

def get_player_id(player_name):
    """Look up the id stored for ``player_name`` in ``normalized_player_dict``.

    The name is passed through ``normalize_string`` before the lookup.
    Returns ``None`` when the normalized name has no entry.
    """
    return normalized_player_dict.get(normalize_string(player_name))

### Niektóre nazwy trzeba poprawić ręcznie, aby się zgadzały

In [7]:
# Scraped spellings that are replaced by a hand-written spelling.
_PLAYER_NAME_OVERRIDES = {
    "Ramos-Vinolas Albert": "Ramos Albert",
    "O'Connell Christopher": "Oconnell Christopher",
    "Barrios Vera Marcelo Tomas": "Barrios Vera Tomas",
    "Wolf Jeffrey John": "J J Wolf",
    "Meligeni Rodrigues Alves Felipe": "Meligeni Alves Felipe",
    "Rodriguez Taverna Santiago": "Rodriguez Taverna Santiago Fa",
    "Granollers-Pujol Marcel": "Granollers Marcel",
    "Statham Rubin": "Statham Jose Rubin",
    "Skugor Franco": "Skugor Franko",
    "Gromley Colton": "Gromley Cole",
    "Ali Mutawa Jabor Mohammed": "Jabor Al Mutawa",
    "Dolgopolov Aleksandr": "Dolgopolov Alexandr",
    "Hernandez-Fernandez Jose": "Hernandez Jose",
}


def fix_player_names_json(name: str):
    """Return the spelling of a scraped player name used for the id lookup.

    Rules, in order:
    1. A name listed in ``_PLAYER_NAME_OVERRIDES`` is replaced by its override.
    2. "Tsonga Jo-Wilfried" and names without a hyphen are returned unchanged.
    3. Any other name has every "-" replaced by a space.
    """
    override = _PLAYER_NAME_OVERRIDES.get(name)
    if override is not None:
        return override
    if "-" not in name or name == "Tsonga Jo-Wilfried":
        return name
    return name.replace("-", " ")

### Sprawdzamy, czy dla każdej nazwy zawodnika w scraped_time_json możemy znaleźć player_id z matches df

In [8]:
# Rewrite both player names of every scraped match in place, then print each
# name that still has no player id (no output means every name resolved).
for tournaments in scraped_time_json.values():
    for rounds in tournaments.values():
        for round_matches in rounds.values():
            for match in round_matches:
                for side in ('Player1', 'Player2'):
                    match[side] = fix_player_names_json(match[side])
                for side in ('Player1', 'Player2'):
                    if get_player_id(match[side]) is None:
                        print(match[side])

### Dopisujemy zatem player_id

In [9]:
# Store the looked-up id of each player next to the name on every scraped match.
for tournaments in scraped_time_json.values():
    for rounds in tournaments.values():
        for round_matches in rounds.values():
            for match in round_matches:
                match.update({
                    'Player1Id': get_player_id(match['Player1']),
                    'Player2Id': get_player_id(match['Player2']),
                })

### Chcemy teraz, żeby lokalizacje turniejów w scraped_time_json były takie same jak tournament_location w matches df

In [10]:
# For each top-level key of the scraped data, select the rows of `matches`
# whose match_id has that key as its second "_"-separated field, and report
# the locations that appear on only one side.
for year in scraped_time_json:
    in_season = matches['match_id'].str.split("_").str[1] == year
    matches_df = matches[in_season]
    matches_df_locations = set(matches_df['tournament_location'].unique())
    json_locations = list(scraped_time_json[year])

    only_in_one_df = matches_df_locations.symmetric_difference(json_locations)
    if not only_in_one_df:
        print("Both dfs  have the same tournament locations.")
    else:
        print(f"Locations only in one df: {only_in_one_df}")

Locations only in one df: {'Winston-Salem', 'London', 'Winston Salem', 'Queens Club', 'Paris 2', 'US Open', "'s-Hertogenbosch", 'Hertogenbosch', 'Tokyo', 'French Open', 'St. Petersburg', 'KitzbĂĽhel', "Queen's Club", 'Australian Open', 'Wimbledon', 'Saint Petersburg', 'Melbourne', 'Tokyo (Japan Open)', 'Kitzbuhel'}
Locations only in one df: {'Winston-Salem', 'London', 'Winston Salem', 'Queens Club', 'Paris 2', 'US Open', "'s-Hertogenbosch", 'Hertogenbosch', 'Tokyo', 'French Open', 'St. Petersburg', 'KitzbĂĽhel', "Queen's Club", 'Australian Open', 'Wimbledon', 'Saint Petersburg', 'Melbourne', 'Tokyo (Japan Open)', 'Kitzbuhel'}
Locations only in one df: {'St. Petersburg', 'KitzbĂĽhel', 'Australian Open', 'Nur-Sultan', 'Kitzbuhel', 'Astana', 'Cincinnati Masters (New York)', 'Saint Petersburg', 'Cincinnati', 'Melbourne', 'French Open', 'Paris 2', 'US Open'}
Locations only in one df: {'Winston-Salem', 'Winston Salem', 'Astana', 'Queens Club', 'French Open', 'Paris 2', 'St. Petersburg', 'Kit

### Musimy ręcznie poprawić nazwy turniejów

In [11]:
# Scraped tournament key -> location spelling it is replaced with.
_TOURNAMENT_NAME_OVERRIDES = {
    "Queen's Club": "Queens Club",
    "Kitzbühel": "Kitzbuhel",
    "KitzbĂĽhel": "Kitzbuhel",  # mis-decoded spelling of the entry above
    "Winston Salem": "Winston-Salem",
    "Astana": "Nur-Sultan",
    "Saint Petersburg": "St. Petersburg",
    "US Open": "New York",
    "Australian Open": "Melbourne",
    "Wimbledon": "London",
    "French Open": "Paris",
    "Paris": "Paris 2",
    "Tokyo (Japan Open)": "Tokyo",
    "Napoli": "Naples",
    "Hertogenbosch": "'s-Hertogenbosch",
    "Cincinnati Masters (New York)": "Cincinnati",
}


def fix_tournament_name(name):
    """Return the replacement for a scraped tournament name.

    Names listed in ``_TOURNAMENT_NAME_OVERRIDES`` are replaced by their
    mapped value; any other name is returned unchanged.

    The mapping is applied exactly once and is NOT idempotent:
    "French Open" maps to "Paris" while "Paris" maps to "Paris 2", so calling
    this function on its own output can change the result again.
    """
    return _TOURNAMENT_NAME_OVERRIDES.get(name, name)

In [12]:
# Repeat the per-key location comparison, this time passing every scraped
# tournament name through fix_tournament_name first.
for year in scraped_time_json:
    in_season = matches['match_id'].str.split("_").str[1] == year
    matches_df = matches[in_season]
    matches_df_locations = set(matches_df['tournament_location'].unique())
    json_locations = [
        fix_tournament_name(tournament) for tournament in scraped_time_json[year]
    ]

    only_in_one_df = matches_df_locations.symmetric_difference(json_locations)
    if not only_in_one_df:
        print("Both dfs have the same tournament locations.")
    else:
        print(f"Locations only in one df: {only_in_one_df}")

Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.


### Teraz już jest ok, zatem możemy w scraped_time_json stworzyć match_id

In [13]:
# Give every scraped match a match_id of the form
# "<fixed tournament name>_<year key>_<Player1Id>_<Player2Id>".
for year, tournaments in scraped_time_json.items():
    for tournament, rounds in tournaments.items():
        # The fixed name is the same for every match of the tournament.
        location = fix_tournament_name(tournament)
        for round_matches in rounds.values():
            for match in round_matches:
                match['match_id'] = (
                    f"{location}_{year}_{match['Player1Id']}_{match['Player2Id']}"
                )

### Sprawdźmy, czy utworzone match_id jest unikatowe

In [14]:
# Collect the match_id of every scraped match, in traversal order, and print
# True when no id occurs twice.
match_ids = [
    match["match_id"]
    for tournaments in scraped_time_json.values()
    for rounds in tournaments.values()
    for round_matches in rounds.values()
    for match in round_matches
]
print(len(set(match_ids)) == len(match_ids))

True


### Sprawdźmy teraz, czy mamy relację 1 do 1 między match_id z obu zbiorów

In [15]:
# Print every scraped match_id that has no counterpart in `matches`
# (no output means none are missing).
# The lookup set is built once: testing membership against the `.values`
# array would rescan the whole column for every scraped id.
known_match_ids = set(matches['match_id'])
for match_id in match_ids:
    if match_id not in known_match_ids:
        print(match_id)

In [16]:
# Print every match_id of `matches` that has no counterpart in the scraped
# data (no output means none are missing).
# `match_ids` is a list, so each `in` test against it is a linear scan; a set
# built once makes the whole check linear instead of quadratic.
scraped_match_ids = set(match_ids)
for match_id in matches['match_id'].values:
    if match_id not in scraped_match_ids:
        print(match_id)

## Super, możemy zatem mergeować

In [17]:

# match_id -> {'Time': <scraped time>} for every scraped match, flattened
# across all top-level keys, tournaments and rounds.
time_mapping = {
    scraped_match['match_id']: {'Time': scraped_match['Time']}
    for tournaments in scraped_time_json.values()
    for rounds in tournaments.values()
    for round_matches in rounds.values()
    for scraped_match in round_matches
}

# Attach the scraped time to each row by match_id; ids without an entry get None.
matches['time'] = matches['match_id'].map(
    lambda mid: time_mapping[mid]['Time'] if mid in time_mapping else None
)

matches

Unnamed: 0,tournament_location,tournament_name,Date,tournament_level,indoor_or_outdoor,Surface,Round,W1,L1,W2,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,time
0,Brisbane,Brisbane International,2017-12-31,ATP250,Outdoor,Hard,1st Round,6.0,4.0,7.0,...,22.0,20.0,11.0,1.0,3.0,38.0,1231.0,26.0,1675.0,07:25
1,Brisbane,Brisbane International,2017-12-31,ATP250,Outdoor,Hard,1st Round,7.0,6.0,6.0,...,33.0,17.0,11.0,4.0,6.0,208.0,245.0,44.0,1055.0,09:20
2,Brisbane,Brisbane International,2018-01-01,ATP250,Outdoor,Hard,1st Round,6.0,4.0,3.0,...,33.0,19.0,14.0,1.0,4.0,47.0,1010.0,52.0,909.0,02:10
3,Brisbane,Brisbane International,2018-01-01,ATP250,Outdoor,Hard,1st Round,6.0,3.0,6.0,...,18.0,6.0,8.0,1.0,5.0,76.0,670.0,79.0,662.0,04:00
4,Brisbane,Brisbane International,2018-01-01,ATP250,Outdoor,Hard,1st Round,6.0,4.0,7.0,...,27.0,8.0,11.0,1.0,6.0,33.0,1302.0,218.0,235.0,07:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,Sofia,Sofia Open,2023-11-09,ATP250,Indoor,Hard,Quarterfinals,6.0,7.0,6.0,...,47.0,14.0,15.0,11.0,16.0,25.0,1525.0,44.0,1019.0,17:00
14107,Sofia,Sofia Open,2023-11-09,ATP250,Indoor,Hard,Quarterfinals,6.0,3.0,7.0,...,22.0,19.0,10.0,0.0,2.0,80.0,721.0,47.0,957.0,20:05
14108,Sofia,Sofia Open,2023-11-10,ATP250,Indoor,Hard,Semifinals,6.0,3.0,6.0,...,22.0,10.0,9.0,1.0,4.0,82.0,706.0,28.0,1442.0,15:10
14109,Sofia,Sofia Open,2023-11-10,ATP250,Indoor,Hard,Semifinals,6.0,2.0,7.0,...,27.0,8.0,10.0,1.0,3.0,25.0,1525.0,80.0,721.0,17:00


### Sprawdźmy jeszcze, czy w nowej kolumnie nie ma brakujących wartości

In [18]:
# Count the rows whose `time` is missing, i.e. whose match_id found no scraped time.
missing_time = matches['time'].isnull()
nan_count = missing_time.sum()

print(f"Number of NaN values in column 'time': {nan_count}")

Number of NaN values in column 'time': 0


In [19]:
# Write the table, now including the `time` column, without the row index.
matches.to_csv(path_or_buf="../data/merged_matches_with_time.csv", index=False)