In [1]:
import json
import pandas as pd

In [2]:
matches = pd.read_csv("../data/all_years_with_new_columns/matches.csv")

In [3]:
scraped_time_json = {}
for year in range(2018, 2024):
    with open(f"../data/tennis_explorer_scraped/scraped_match_time_{year}.json") as f:
        data = json.load(f)
        scraped_time_json.update(data)

### Zrobimy player_dict żeby można było dopisać player_id z matches df do scraped_time_json na podstawie nazw zawodników

In [4]:
winner_dict = dict(zip(matches['winner_name'], matches['winner_id']))
loser_dict = dict(zip(matches['loser_name'], matches['loser_id']))

# Combine the two dictionaries, with winner_dict taking priority
player_dict = {**loser_dict, **winner_dict}
player_dict

{'Richard Gasquet': 104755,
 'Daniel Elahi Galan': 123755,
 'Christopher Oconnell': 106331,
 'Holger Rune': 208029,
 'Felix Auger Aliassime': 200000,
 'Jordan Thompson': 111442,
 'Mikael Ymer': 144707,
 'Soon Woo Kwon': 126952,
 'Maxime Cressy': 202195,
 'Chun Hsin Tseng': 202358,
 'Bernabe Zapata Miralles': 126523,
 'Manas Dhamne': 211768,
 'Jaume Munar': 144719,
 'Alex Molcan': 144684,
 'Nikoloz Basilashvili': 105932,
 'Sumit Nagal': 111576,
 'Lorenzo Sonego': 132283,
 'Pedro Cachin': 106398,
 'Constant Lestienne': 106005,
 'Kyle Edmund': 106378,
 'Rinky Hijikata': 208014,
 'Andrey Rublev': 126094,
 'Andy Murray': 104918,
 'Radu Albot': 105430,
 'Elias Ymer': 111200,
 'Ramkumar Ramanathan': 106368,
 'Pablo Andujar': 104665,
 'Sasi Kumar Mukund': 124154,
 'Jack Draper': 207733,
 'Mackenzie Mcdonald': 111456,
 'Miomir Kecmanovic': 200175,
 'Marcos Giron': 106218,
 'Laslo Djere': 111513,
 'Tim Van Rijthoven': 126646,
 'Sebastian Baez': 202104,
 'Marco Cecchinato': 106065,
 'Flavio Cobol

### Zauważmy że jedynie Jo-Wilfried Tsonga ma myślnik w nazwie, a za to wiele zawodników w scraped_time_json ma myślinik, wykorzystamy to

In [5]:
for player in player_dict:
    if "-" in player:
        print(player)

Jo-Wilfried Tsonga


### Często nazwy tych samych zawodników między dwoma zbiorami różnią się kolejnością członów lub duża/wielka litera, dlatego uwzględniamy to przy matchowaniu nazw

In [6]:
def normalize_string(s):
    return " ".join(sorted(s.lower().split()))

# Normalize the dictionary keys
normalized_player_dict = {normalize_string(key): value for key, value in player_dict.items()}

def get_player_id(player_name):
    normalized_player_name = normalize_string(player_name)
    return normalized_player_dict.get(normalized_player_name)

### Niektóre nazwy trzeba ręcznie poprawić aby się zgadzały

In [7]:
def fix_player_names_json(name: str):
    if name == "Ramos-Vinolas Albert":
        return "Ramos Albert"
    if name == "O'Connell Christopher":
        return "Oconnell Christopher"
    if name == "Barrios Vera Marcelo Tomas":
        return "Barrios Vera Tomas"
    if name == "Wolf Jeffrey John":
        return "J J Wolf"
    if name == "Meligeni Rodrigues Alves Felipe":
        return "Meligeni Alves Felipe"
    if name == "Rodriguez Taverna Santiago":
        return "Rodriguez Taverna Santiago Fa"
    if name == "Granollers-Pujol Marcel":
        return "Granollers Marcel"
    if name == "Statham Rubin":
        return "Statham Jose Rubin"
    if name == "Skugor Franco":
        return "Skugor Franko"
    if name == "Gromley Colton":
        return "Gromley Cole"
    if name == "Ali Mutawa Jabor Mohammed":
        return "Jabor Al Mutawa"
    if name == "Dolgopolov Aleksandr":
        return "Dolgopolov Alexandr"
    if name == "Hernandez-Fernandez Jose":
        return "Hernandez Jose"
    if name != "Tsonga Jo-Wilfried" and "-" in name:
        return name.replace("-", " ")
    return name

### Sprawdzamy czy dla każdej nazwy zawodnika w scraped_time_json możemy wyciągnąć player_id z matches df

In [8]:
for year in scraped_time_json:
    for tournament in scraped_time_json[year]:
        for round in scraped_time_json[year][tournament]:
            for match in scraped_time_json[year][tournament][round]:
                match['Player1'] = fix_player_names_json(match['Player1'])
                match['Player2'] = fix_player_names_json(match['Player2'])
                if get_player_id(match['Player1']) is None:
                    print(match['Player1'])
                if get_player_id(match['Player2']) is None:
                    print(match['Player2'])

### Dopisujemy zatem player_id

In [9]:
for year in scraped_time_json: 
    for tournament in scraped_time_json[year]:
        for round in scraped_time_json[year][tournament]:
            for match in scraped_time_json[year][tournament][round]:
                match['Player1Id'] = get_player_id(match['Player1'])
                match['Player2Id'] = get_player_id(match['Player2'])

### Chcemy teraz osiągnąć taką samą lokacje turnieju w scraped_time_json co tourney_location w matches df

In [10]:
for year in scraped_time_json:
    matches_df = matches[matches['match_id'].str.split("_").str[1] == year]
    matches_df_locations = set(matches_df['tournament_location'].unique())
    json_locations = []
    for tournament in scraped_time_json[year]:
        json_locations.append(tournament)
        
    only_in_one_df = matches_df_locations ^ set(json_locations)
    if only_in_one_df:
        print(f"Locations only in one df: {only_in_one_df}")
    else:
        print("Both dfs  have the same tournament locations.")

Locations only in one df: {'Melbourne', 'Australian Open', 'Tokyo (Japan Open)', "'s-Hertogenbosch", "Queen's Club", 'Kitzbuhel', 'Queens Club', 'Tokyo', 'French Open', 'Winston-Salem', 'Hertogenbosch', 'London', 'US Open', 'St. Petersburg', 'KitzbĂĽhel', 'Wimbledon', 'Paris 2', 'Winston Salem', 'Saint Petersburg'}
Locations only in one df: {'Melbourne', 'Australian Open', 'Tokyo (Japan Open)', "'s-Hertogenbosch", "Queen's Club", 'Kitzbuhel', 'Queens Club', 'Tokyo', 'French Open', 'Winston-Salem', 'Hertogenbosch', 'London', 'US Open', 'St. Petersburg', 'KitzbĂĽhel', 'Wimbledon', 'Paris 2', 'Winston Salem', 'Saint Petersburg'}
Locations only in one df: {'St. Petersburg', 'Astana', 'KitzbĂĽhel', 'French Open', 'Cincinnati', 'Nur-Sultan', 'Paris 2', 'Cincinnati Masters (New York)', 'Melbourne', 'Australian Open', 'Saint Petersburg', 'Kitzbuhel', 'US Open'}
Locations only in one df: {'French Open', 'Winston-Salem', 'St. Petersburg', 'Astana', 'KitzbĂĽhel', 'Nur-Sultan', 'Paris 2', "Queen's

### Musimy ręczenie poprawić

In [11]:
def fix_tournament_name(name):
    if name == "Queen's Club":
        return "Queens Club"
    if name == "Kitzbühel":
        return "Kitzbuhel"
    if name == "KitzbĂĽhel": # Z jakiegoś powodu ü może się w to zmienić
        return "Kitzbuhel"
    if name == "Winston Salem":
        return "Winston-Salem"
    if name == "Astana":
        return "Nur-Sultan"
    if name == "Saint Petersburg":
        return "St. Petersburg"
    if name == "US Open":
        return "New York"
    if name == "Australian Open":
        return "Melbourne"
    if name == "Wimbledon":
        return "London"
    if name == "French Open":
        return "Paris"
    if name == "Paris":
        return "Paris 2"
    if name == "Tokyo (Japan Open)":
        return "Tokyo"
    if name == "Napoli":
        return "Naples"
    if name == "Hertogenbosch":
        return "'s-Hertogenbosch"
    if name == "Cincinnati Masters (New York)":
        return "Cincinnati"
    
    return name

In [12]:
for year in scraped_time_json:
    matches_df = matches[matches['match_id'].str.split("_").str[1] == year]
    matches_df_locations = set(matches_df['tournament_location'].unique())
    json_locations = []
    for tournament in scraped_time_json[year]:
        tournament = fix_tournament_name(tournament)
        json_locations.append(tournament)
        
    only_in_one_df = matches_df_locations ^ set(json_locations)
    if only_in_one_df:
        print(f"Locations only in one df: {only_in_one_df}")
    else:
        print("Both dfs have the same tournament locations.")

Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.
Both dfs have the same tournament locations.


### Teraz już jest ok, zatem możemy w scraped_time_json stworzyć match_id

In [13]:
for year in scraped_time_json:
    for tournament in scraped_time_json[year]:
        for round in scraped_time_json[year][tournament]:
            for match in scraped_time_json[year][tournament][round]:
                match['match_id'] = fix_tournament_name(tournament) + '_' + year + '_' + str(match['Player1Id']) + '_' + str(match['Player2Id'])

### Sprawdźmy czy stworzone match_id jest unikatowe

In [14]:
match_ids = []
for year in scraped_time_json:
    for tournament in scraped_time_json[year]:
        for round in scraped_time_json[year][tournament]:
            for match in scraped_time_json[year][tournament][round]:
                match_ids.append(match["match_id"])
print(len(match_ids) == len(set(match_ids)))

True


### Sprawdźmy teraz czy mamy 1 do 1 relacje między match_id z dwóch zbiorów

In [15]:
for match_id in match_ids:
    if match_id not in matches['match_id'].values:
        print(match_id)

In [16]:
for match_id in matches['match_id'].values:
    if match_id not in match_ids:
        print(match_id)

## Super, możemy zatem mergeować

In [17]:
# Flatten the scraped_time_json dictionary into a mapping of match_id to Time
time_mapping = {
    match['match_id']: {'Time': match['Time']}
    for year in scraped_time_json.values()
    for tournament in year.values()
    for round_info in tournament.values()
    for match in round_info
}

# Map Time to the matches df
matches['time'] = matches['match_id'].map(lambda match_id: time_mapping.get(match_id, {}).get('Time'))

matches

Unnamed: 0,tournament_location,tournament_name,Date,indoor_or_outdoor,Surface,Round,W1,L1,W2,L2,...,loser_bp_saved_pct,winner_total_1st_serve_in_pct,winner_total_1st_serve_win_pct,winner_total_2nd_serve_in_pct,winner_total_2nd_serve_win_pct,loser_total_1st_serve_in_pct,loser_total_1st_serve_win_pct,loser_total_2nd_serve_in_pct,loser_total_2nd_serve_win_pct,time
0,Adelaide 1,Adelaide International 1,2023-01-01,Outdoor,Hard,1st Round,7.0,6.0,6.0,7.0,...,0.833333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,06:25
1,Adelaide 1,Adelaide International 1,2023-01-01,Outdoor,Hard,1st Round,6.0,3.0,,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,09:40
2,Adelaide 1,Adelaide International 1,2023-01-02,Outdoor,Hard,1st Round,6.0,4.0,6.0,4.0,...,0.400000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,01:40
3,Adelaide 1,Adelaide International 1,2023-01-02,Outdoor,Hard,1st Round,2.0,6.0,6.0,4.0,...,0.636364,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,05:15
4,Adelaide 1,Adelaide International 1,2023-01-02,Outdoor,Hard,1st Round,6.0,4.0,7.0,6.0,...,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,06:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,Paris 2,BNP Paribas Masters,2018-11-02,Indoor,Hard,Quarterfinals,4.0,6.0,6.0,2.0,...,0.692308,0.588235,0.733333,0.928571,0.589744,0.611111,0.818182,0.938776,0.521739,19:45
14107,Paris 2,BNP Paribas Masters,2018-11-02,Indoor,Hard,Quarterfinals,6.0,4.0,6.0,4.0,...,0.333333,0.500000,0.814815,0.925926,0.640000,0.586207,0.808824,0.916667,0.750000,22:15
14108,Paris 2,BNP Paribas Masters,2018-11-03,Indoor,Hard,Semifinals,6.0,4.0,6.0,1.0,...,0.375000,0.625571,0.854015,0.939024,0.662338,0.549180,0.753731,0.881818,0.608247,14:15
14109,Paris 2,BNP Paribas Masters,2018-11-03,Indoor,Hard,Semifinals,7.0,6.0,5.0,7.0,...,1.000000,0.609626,0.710526,0.890411,0.646154,0.509091,0.892857,0.925926,0.640000,16:45


### Sprawdźmy jeszcze czy nie ma braków wartości w nowej kolumnie

In [18]:
nan_count = matches['time'].isna().sum()

print(f"Number of NaN values in column 'time': {nan_count}")

Number of NaN values in column 'time': 0


In [19]:
matches.to_csv("../data/all_years_with_new_columns_and_time/matches.csv", index=False)