In [1]:
import pandas as pd

In [2]:
# Parameter guard: if `years` already exists in the namespace (e.g. supplied
# by a parameterised run) nothing below is touched.
try:
    years   # noqa
except NameError:
    # Manual execution: no parameters were provided, so fall back to defaults.
    years = list(range(2021, 2024))
    input_directory, output_directory = "../data/raw_data/", "../data/Processed_data/"
    print("Parameters not provided, using default values:")
    print(f"input_directory: {input_directory}, years: {years}")

Parameters not provided, using default values:
input_directory: ../data/raw_data/, years: [2021, 2022, 2023]


## Wczytanie danych bets

In [3]:
# Load one Excel workbook per configured year; frames are keyed by the year as a string.
# (The annotation uses `dict[str, ...]`; the former `dict[str: ...]` was a slice, not a key/value type.)
bets: dict[str, pd.DataFrame] = {}
for year in years:
    file_name = f"{input_directory}{year}.xlsx"   # noqa
    bets[str(year)] = pd.read_excel(file_name)
    print(f"Loaded bets_{year} from {file_name}")

Loaded bets_2021 from ../data/raw_data/2021.xlsx
Loaded bets_2022 from ../data/raw_data/2022.xlsx
Loaded bets_2023 from ../data/raw_data/2023.xlsx


In [4]:
# Normalise the bets frames year by year. The steps run in the original order because
# later steps depend on earlier ones (e.g. locations are fixed before Turin is dropped).
# Every boolean filter is followed by .copy() so the subsequent column assignments
# operate on an independent frame and do not trigger SettingWithCopyWarning.
for year in bets:
    frame = bets[year]

    # The two Adelaide events get distinct location labels.
    frame.loc[frame["Tournament"] == "Adelaide International 1", "Location"] = "Adelaide 1"
    frame.loc[frame["Tournament"] == "Adelaide International 2", "Location"] = "Adelaide 2"

    # Strip trailing spaces / unify spelling of locations.
    frame["Location"] = frame["Location"].replace({'Dubai ': 'Dubai', 'Belgrade ': 'Belgrade', 'Napoli':'Naples'})

    # Drop every match located in Turin.
    frame = frame[~frame['Location'].isin(['Turin'])].copy()

    # Unify the spelling of two player names on both sides of the match.
    player_name_fixes = {"Varillas J. P.": "Varillas J.P.", "Tseng C. H.": "Tseng C.H."}
    frame['Loser'] = frame['Loser'].replace(player_name_fixes)
    frame['Winner'] = frame['Winner'].replace(player_name_fixes)

    # Drop the tournaments that are excluded from the analysis.
    excluded_tournaments = ['United Cup', 'Tour Finals', 'NextGen Finals','Tokyo Olympics','Atp Cup', 'Laver Cup', 'Melbourne Summer Set', 'Great Ocean Road Open', 'Murray River Open']
    frame = frame[~frame['Tournament'].isin(excluded_tournaments)].copy()

    # Second events held in the same city get a "<city> 2" location label.
    frame.loc[frame['Tournament']== 'BNP Paribas Masters', 'Location']='Paris 2'
    frame.loc[frame['Tournament']== 'Belgrade Open', 'Location']='Belgrade 2'

    bets[year] = frame

In [5]:
# Align the location column name with the github frames and renumber rows from 0
# (rows were dropped by the filters above).
for year in bets:
    bets[year] = (
        bets[year]
        .rename(columns={'Location': 'tourney_location'})
        .reset_index(drop=True)
    )

In [6]:
# Show the column index of each yearly bets frame.
for year, frame in bets.items():
    print(f"Year {year}: {frame.columns}")

Year 2021: Index(['ATP', 'tourney_location', 'Tournament', 'Date', 'Series', 'Court',
       'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank',
       'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5',
       'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL',
       'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')
Year 2022: Index(['ATP', 'tourney_location', 'Tournament', 'Date', 'Series', 'Court',
       'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank',
       'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5',
       'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL',
       'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')
Year 2023: Index(['ATP', 'tourney_location', 'Tournament', 'Date', 'Series', 'Court',
       'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank',
       'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5',
       'L5', '

## Sprawdzenie czy na pewno te same kolumny

In [7]:
from test_helpers.preprocessing import check_columns_same
# Project helper; per the recorded output it reports, for every later year,
# whether its columns match those of the first year.
check_columns_same(bets)

✅ Year 2022: Columns match with the first year (2021).
✅ Year 2023: Columns match with the first year (2021).


## Wczytanie danych github

In [8]:
# Load one `atp_matches_<year>.csv` file per configured year; frames are keyed by the year as a string.
# (The annotation uses `dict[str, ...]`; the former `dict[str: ...]` was a slice, not a key/value type.)
github: dict[str, pd.DataFrame] = {}
for year in years:
    file_name = f"{input_directory}atp_matches_{year}.csv"
    github[str(year)] = pd.read_csv(file_name)
    print(f"Loaded github_{year} from {file_name}")

Loaded github_2021 from ../data/raw_data/atp_matches_2021.csv
Loaded github_2022 from ../data/raw_data/atp_matches_2022.csv
Loaded github_2023 from ../data/raw_data/atp_matches_2023.csv


## Sprawdzenie czy zbiory mają takie same kolumny

In [9]:
# Column-consistency check for the yearly github frames (helper imported in an earlier cell).
check_columns_same(github)

✅ Year 2022: Columns match with the first year (2021).
✅ Year 2023: Columns match with the first year (2021).


### Zgodnie z założeniami usuwamy Finals, NextGen Finals, United Cup oraz mecze Davis Cup

In [10]:
# Remove excluded events from the github frames and normalise one tournament label.
# The filtered frame is materialised with .copy() so that the column assignment below
# (and those in later cells) act on an independent frame instead of a slice, which
# would otherwise raise SettingWithCopyWarning.
for year in github:
    frame = github[year]

    # Any tournament whose name contains "Davis Cup"; missing names are kept (na=False).
    is_davis_cup = frame['tourney_name'].str.contains('Davis Cup', na=False)
    # Events excluded by exact name.
    is_excluded = frame['tourney_name'].isin(['Laver Cup', 'United Cup', 'Tour Finals', 'NextGen Finals','Tokyo Olympics','Atp Cup','Melbourne Summer Set', 'Melbourne', 'Great Ocean Road Open', 'Murray River Open'])

    frame = frame[~is_davis_cup & ~is_excluded].copy()
    # Strip the trailing space from the Belgrade label.
    frame["tourney_name"] = frame["tourney_name"].replace({'Belgrade ': 'Belgrade'})
    github[year] = frame

### W tej ramce kolumna 'tourney_name' to dla większości turniejów lokalizacja turnieju a nie jego nazwa, a potrzebujemy żeby nazwa turnieju była taka jak w ramce bets. Dla turniejów, dla których 'tourney_name' to nie lokalizacja, ręcznie wpisujemy lokalizację

In [11]:
# Tournament names that are not already a location, mapped to the location label used in bets.
location_by_tourney_name = {
    "Australian Open": "Melbourne",
    "Indian Wells Masters": "Indian Wells",
    "Miami Masters": "Miami",
    "Monte Carlo Masters": "Monte Carlo",
    "Madrid Masters": "Madrid",
    "Rome Masters": "Rome",
    "Roland Garros": "Paris",
    "s Hertogenbosch": "'s-Hertogenbosch",
    "Queen's Club": "Queens Club",
    "Wimbledon": "London",
    "Cincinnati Masters": "Cincinnati",
    "Us Open": "New York",
    "Astana": "Nur-Sultan",
    "Shanghai Masters": "Shanghai",
    "Paris Masters": "Paris 2",
    "Rio De Janeiro": "Rio de Janeiro",
}
for year in github:
    # Rename the column first, then translate the labels that are not locations.
    renamed = github[year].rename(columns={'tourney_name': 'tourney_location'})
    renamed['tourney_location'] = renamed['tourney_location'].replace(location_by_tourney_name)
    github[year] = renamed

### Turniej Canada Masters jest w parzystych latach w Montrealu i w nieparzystych w Toronto

In [12]:
# "Canada Masters" is mapped to Montreal in even years and to Toronto in odd years.
for year in github:
    host_city = "Montreal" if int(year) % 2 == 0 else "Toronto"
    github[year]['tourney_location'] = github[year]['tourney_location'].replace({"Canada Masters": host_city})

## Po sprawdzeniu danych historycznych wykryliśmy błędy

In [13]:
# Manual corrections to the 2023 github frame (loser side only).
matches_2023 = github['2023']

# Rename one loser and move the rows from id 124013 to id 207182.
matches_2023['loser_name'] = matches_2023['loser_name'].replace({'Eduardo Nava': 'Emilio Nava'})
matches_2023['loser_id'] = matches_2023['loser_id'].replace({124013: 207182})

# Rows recorded under id 212021: set hand and height, then re-point them to id 211776.
# The mask is computed once, before the id itself is overwritten.
rows_with_id_212021 = matches_2023['loser_id'] == 212021
matches_2023.loc[rows_with_id_212021, 'loser_hand'] = 'R'
matches_2023.loc[rows_with_id_212021, 'loser_ht'] = 191
matches_2023.loc[rows_with_id_212021, 'loser_id'] = 211776

### Sprawdzamy, że utworzona przez nas kolumna 'tourney_location' odpowiada kolumnie 'tourney_location' w ramce bets

In [14]:
from test_helpers.preprocessing import compare_tourney_locations

# Project helper; per the recorded output it reports per year whether the
# 'tourney_location' column matches between the github and bets frames.
compare_tourney_locations(github, bets, "github", "bets")

✅ 'tourney_location' column matches for year 2021.
✅ 'tourney_location' column matches for year 2022.
✅ 'tourney_location' column matches for year 2023.


### Potrzebujemy mieć takie same nazwy zawodników w obu ramkach, aby dodać do ramki bets id zawodnika z ramki github. Aktualnie w ramce bets nazwa zawodnika jest w formacie typu Djokovic N., a w ramce github w formacie typu Novak Djokovic, dlatego musimy stworzyć nazwy zawodników w skróconym formacie w ramce github.

In [15]:
def transform_name(name: str) -> str:
    """Convert a ``"First Last"`` player name into the ``"Last F."`` short form.

    The first whitespace-separated token is treated as the first name and all
    remaining tokens as the surname, e.g. ``"Novak Djokovic"`` becomes
    ``"Djokovic N."`` and ``"Juan Martin del Potro"`` becomes
    ``"Martin del Potro J."``.

    Args:
        name: Full player name. Non-string values (such as the NaN that
            ``Series.apply`` passes for missing names) are tolerated.

    Returns:
        The shortened name. A non-string input is returned unchanged; an empty
        or single-token name cannot be reordered and is returned stripped of
        surrounding whitespace.
    """
    if not isinstance(name, str):
        # Keep missing values missing instead of failing on `.split()`.
        return name

    name_parts = name.split()
    if len(name_parts) < 2:
        # Nothing to reorder; also avoids indexing into an empty token list.
        return name.strip()

    first_name, *last_name_parts = name_parts
    return f"{' '.join(last_name_parts)} {first_name[0]}."

In [16]:
# Derive the short-form name columns for both sides of every match.
for matches in github.values():
    for role in ('winner', 'loser'):
        matches[f'shortened_{role}_name'] = matches[f'{role}_name'].apply(transform_name)

### Sprawdzamy czy skrócona nazwa zawodnika jednoznacznie określa zawodnika

In [17]:
from test_helpers.preprocessing import check_one_to_one_mapping

# Project helper; per the recorded output it compares, per year, the number of unique
# values in each (name, id) column pair. raise_error=False is passed here and the
# recorded violations are only printed -- presumably that flag makes them non-fatal.
check_one_to_one_mapping(github, column_pairs = [('shortened_winner_name', 'winner_id'), ('shortened_loser_name', 'loser_id')], raise_error=False)

✅ Year 2021: One-to-one mapping between 'shortened_winner_name' and 'winner_id' is valid.

            ❌ One-to-one mapping violation between 'shortened_loser_name' and 'loser_id' in year 2021:
            - Unique 'shortened_loser_name': 300
            - Unique 'loser_id': 301
            This suggests that some 'shortened_loser_name' values map to multiple 'loser_id' values or vice versa.
            
✅ Year 2022: One-to-one mapping between 'shortened_winner_name' and 'winner_id' is valid.

            ❌ One-to-one mapping violation between 'shortened_loser_name' and 'loser_id' in year 2022:
            - Unique 'shortened_loser_name': 304
            - Unique 'loser_id': 305
            This suggests that some 'shortened_loser_name' values map to multiple 'loser_id' values or vice versa.
            
✅ Year 2023: One-to-one mapping between 'shortened_winner_name' and 'winner_id' is valid.
✅ Year 2023: One-to-one mapping between 'shortened_loser_name' and 'loser_id' is valid.


## Zobaczmy, którzy zawodnicy nie są jednoznacznie określeni przez skróconą nazwę

In [18]:
# For every year, collect the (short name, id) pairs of losers whose short name
# maps to more than one player id.
non_unique_losers = {}
for year, matches in github.items():
    ids_per_name = matches.groupby('shortened_loser_name')['loser_id'].nunique()
    ambiguous_names = ids_per_name[ids_per_name > 1].index.tolist()
    is_ambiguous = matches['shortened_loser_name'].isin(ambiguous_names)
    non_unique_losers[year] = matches.loc[is_ambiguous, ['shortened_loser_name', 'loser_id']].drop_duplicates()

# Report only the years that actually contain ambiguous names. Iterating over the
# collected results (instead of a hard-coded 2021-2023 range) keeps this working
# for any configured set of years.
for year, ambiguous in non_unique_losers.items():
    if not ambiguous.empty:
        print(f"Lata {year} - Przegrani z niejednoznacznymi skróconymi nazwami:")
        print(ambiguous)

Lata 2021 - Przegrani z niejednoznacznymi skróconymi nazwami:
     shortened_loser_name  loser_id
448               Nava E.    207182
1976              Nava E.    124013
Lata 2022 - Przegrani z niejednoznacznymi skróconymi nazwami:
     shortened_loser_name  loser_id
310             Martin A.    105413
1793            Martin A.    211346


105413 - Martin Andrej -> Martin A.
211346 - Martin Andres -> Martin An.

## W celu rozróżnienia zawodników do skrótu imienia jednego z nich dopisujemy drugą literkę imienia

In [19]:
# Each entry: (id column, player id, short-name column, disambiguated short name).
short_name_overrides = [
    ('loser_id', 124013, 'shortened_loser_name', 'Nava Ed.'),
    ('loser_id', 211346, 'shortened_loser_name', 'Martin An.'),
    ('winner_id', 211346, 'shortened_winner_name', 'Martin An.'),
]
for matches in github.values():
    for id_col, player_id, name_col, short_name in short_name_overrides:
        matches.loc[matches[id_col] == player_id, name_col] = short_name

## Od razu musimy też wykonać tę samą zmianę w ramce bets

In [20]:
# Mirror the github disambiguation in bets: 'Martin A.' at the Atlanta Open
# (2023 and 2022, either side of the match) becomes 'Martin An.'.
for year in ("2023", "2022"):
    at_atlanta = bets[year]['Tournament'] == 'Atlanta Open'
    for role in ('Winner', 'Loser'):
        bets[year].loc[at_atlanta & (bets[year][role] == 'Martin A.'), role] = 'Martin An.'

# 'Nava E.' as loser in Winston-Salem 2021 becomes 'Nava Ed.'.
nava_rows = (bets["2021"]['tourney_location'] == 'Winston-Salem') & (bets["2021"]['Loser'] == 'Nava E.')
bets["2021"].loc[nava_rows, 'Loser'] = 'Nava Ed.'

In [21]:
# Re-run the name/id mapping check after the manual disambiguation, this time with
# raise_error=True -- presumably any remaining violation now raises; confirm in the helper.
check_one_to_one_mapping(github, column_pairs = [('shortened_winner_name', 'winner_id'), ('shortened_loser_name', 'loser_id')], raise_error=True)

✅ Year 2021: One-to-one mapping between 'shortened_winner_name' and 'winner_id' is valid.
✅ Year 2021: One-to-one mapping between 'shortened_loser_name' and 'loser_id' is valid.
✅ Year 2022: One-to-one mapping between 'shortened_winner_name' and 'winner_id' is valid.
✅ Year 2022: One-to-one mapping between 'shortened_loser_name' and 'loser_id' is valid.
✅ Year 2023: One-to-one mapping between 'shortened_winner_name' and 'winner_id' is valid.
✅ Year 2023: One-to-one mapping between 'shortened_loser_name' and 'loser_id' is valid.


### Niektóre imiona i nazwiska ze względu na swoją unikalność lub brak konsekwencji w zapisie musimy zmodyfikować ręcznie

In [22]:
# Manual short-name corrections (generated short name -> spelling used in bets).
# The mapping is declared once instead of being pasted per year; no value is also a
# key, so the order of entries does not affect the result of Series.replace.
SHORT_NAME_FIXES = {
    "Meligeni Alves F.": "Meligeni Rodrigues F",
    "Arnaud Bailly G.": "Bailly G.",
    "Sung Nam J.": "Nam J.S.",
    "Chan Hong S.": "Hong S.",
    "Fa Rodriguez Taverna S.": "Rodriguez Taverna S.",
    "Pucinelli De Almeida M.": "Pucinelli de Almeida M.",
    "Alejandro Hernandez Serrano J.": "Hernandez A.",
    "Marcel Stebe C.": "Stebe C.M.",
    "Martin del Potro J.": "Del Potro J.M.",
    "Marco Moroni G.": "Moroni G.M.",
    "Tsonga J.": "Tsonga J.W.",
    "Ignacio Londero J.": "Londero J.I.",
    "Pablo Ficovich J.": "Ficovich J.P.",
    "C.H. Tseng": "Tseng C.H.",
    "Oconnell C.": "O Connell C.",
    "Elahi Galan D.": "Galan D.E.",
    "Auger Aliassime F.": "Auger-Aliassime F.",
    "Woo Kwon S.": "Kwon S.W.",
    "Barrios Vera T.": "Barrios M.",
    "Yunchaokete B.": "Bu Y.",
    "Manuel Cerundolo J.": "Cerundolo J.M.",
    "Martin Etcheverry T.": "Etcheverry T.",
    "Hugues Herbert P.": "Herbert P.H.",
    "Hsiou Hsu Y.": "Hsu Y.",
    "Andrea Huesler M.": "Huesler M.A.",
    "Kuznetsov A.": "Kuznetsov An.",
    "Son Kwiatkowski T.": "Kwiatkowski T.S.",
    "Li Z.": "Li Zh.",
    "Hsin Tseng C.": "Tseng C.H.",
    "Kumar Mukund S.": "Mukund S.",
    "Ramos A.": "Ramos-Vinolas A.",
    "J Wolf J.": "Wolf J.J.",
    "Zhang Z.": "Zhang Zh.",
    "Pablo Varillas J.": "Varillas J.P.",
    "Lennard Struff J.": "Struff J.L.",
    "Lin Wu T.": "Wu T.L.",
    "Hans Rehberg M.": "Rehberg M.",
    "Mpetshi Perricard G.": "Mpetshi G.",
    "Agustin Tirante T.": "Tirante T.A.",
    "Alberto Olivieri G.": "Olivieri G.",
    "Nicolae Madaras D.": "Madaras D.",
    "Cong Mo Y.": "Mo Y.",
}
# Corrections that are applied to a single year only.
SHORT_NAME_FIXES_BY_YEAR = {
    "2021": {
        "Patrick Smith J.": "Smith J.P.",
        "Shannan Zayid M.": "Zayid M.",
        "Hsun Lu Y.": "Lu Y.",
        "Aragone J.": "Aragone J.C.",
    },
}

# Apply to the loser side of every loaded year (no hard-coded year keys).
for year, matches in github.items():
    fixes = {**SHORT_NAME_FIXES, **SHORT_NAME_FIXES_BY_YEAR.get(year, {})}
    matches['shortened_loser_name'] = matches['shortened_loser_name'].replace(fixes)

In [23]:
# Manual short-name corrections (generated short name -> spelling used in bets).
# The mapping is declared once instead of being pasted per year; no value is also a
# key, so the order of entries does not affect the result of Series.replace.
# The former "Varillas J.P." -> "Varillas J.P." entry replaced a value with itself
# and is omitted.
SHORT_NAME_FIXES = {
    "Meligeni Alves F.": "Meligeni Rodrigues F",
    "Arnaud Bailly G.": "Bailly G.",
    "Sung Nam J.": "Nam J.S.",
    "Chan Hong S.": "Hong S.",
    "Fa Rodriguez Taverna S.": "Rodriguez Taverna S.",
    "Pucinelli De Almeida M.": "Pucinelli de Almeida M.",
    "Alejandro Hernandez Serrano J.": "Hernandez A.",
    "Marcel Stebe C.": "Stebe C.M.",
    "Martin del Potro J.": "Del Potro J.M.",
    "Marco Moroni G.": "Moroni G.M.",
    "Tsonga J.": "Tsonga J.W.",
    "Ignacio Londero J.": "Londero J.I.",
    "Pablo Ficovich J.": "Ficovich J.P.",
    "C.H. Tseng": "Tseng C.H.",
    "Oconnell C.": "O Connell C.",
    "Elahi Galan D.": "Galan D.E.",
    "Auger Aliassime F.": "Auger-Aliassime F.",
    "Woo Kwon S.": "Kwon S.W.",
    "Barrios Vera T.": "Barrios M.",
    "Yunchaokete B.": "Bu Y.",
    "Manuel Cerundolo J.": "Cerundolo J.M.",
    "Martin Etcheverry T.": "Etcheverry T.",
    "Hugues Herbert P.": "Herbert P.H.",
    "Hsiou Hsu Y.": "Hsu Y.",
    "Andrea Huesler M.": "Huesler M.A.",
    "Kuznetsov A.": "Kuznetsov An.",
    "Son Kwiatkowski T.": "Kwiatkowski T.S.",
    "Li Z.": "Li Zh.",
    "Hsin Tseng C.": "Tseng C.H.",
    "Kumar Mukund S.": "Mukund S.",
    "Ramos A.": "Ramos-Vinolas A.",
    "J Wolf J.": "Wolf J.J.",
    "Zhang Z.": "Zhang Zh.",
    "Pablo Varillas J.": "Varillas J.P.",
    "Lennard Struff J.": "Struff J.L.",
    "Lin Wu T.": "Wu T.L.",
    "Hans Rehberg M.": "Rehberg M.",
    "Mpetshi Perricard G.": "Mpetshi G.",
    "Agustin Tirante T.": "Tirante T.A.",
    "Alberto Olivieri G.": "Olivieri G.",
    "Nicolae Madaras D.": "Madaras D.",
    "Cong Mo Y.": "Mo Y.",
}
# Corrections that are applied to a single year only.
SHORT_NAME_FIXES_BY_YEAR = {
    "2021": {
        "Patrick Smith J.": "Smith J.P.",
        "Shannan Zayid M.": "Zayid M.",
        "Hsun Lu Y.": "Lu Y.",
        "Aragone J.": "Aragone J.C.",
    },
}

# Apply to the winner side of every loaded year (no hard-coded year keys).
for year, matches in github.items():
    fixes = {**SHORT_NAME_FIXES, **SHORT_NAME_FIXES_BY_YEAR.get(year, {})}
    matches['shortened_winner_name'] = matches['shortened_winner_name'].replace(fixes)

In [24]:
# Use the same spelling for this player as the github short names (loser side, 2023).
bets['2023']['Loser'] = bets['2023']['Loser'].replace("Meligeni Alves F.", "Meligeni Rodrigues F")

In [25]:
# Use the same spelling for this player as the github short names (winner side, 2023).
bets['2023']['Winner'] = bets['2023']['Winner'].replace("Meligeni Alves F.", "Meligeni Rodrigues F")

## Po poprawkach możemy każdemu zawodnikowi w ramce bets przypisać id

In [26]:
for year in github:
    map_player_name_player_id = github[year].groupby('shortened_loser_name', as_index=False)[['shortened_loser_name', 'loser_id']].first()
    mapping = dict(zip(map_player_name_player_id['shortened_loser_name'], map_player_name_player_id['loser_id']))
    bets[year]['loser_id'] = bets[year]['Loser'].map(mapping)
    

In [27]:
for year in github:
    map_player_name_player_id = github[year].groupby('shortened_winner_name', as_index=False)[['shortened_winner_name', 'winner_id']].first()
    mapping = dict(zip(map_player_name_player_id['shortened_winner_name'], map_player_name_player_id['winner_id']))
    bets[year]['winner_id'] = bets[year]['Winner'].map(mapping)

In [28]:
from test_helpers.preprocessing import check_missing_player_ids

# Project helper; per the recorded output it reports per year whether any
# 'loser_id' or 'winner_id' value is missing in bets after the name -> id mapping.
check_missing_player_ids(bets)

✅ Year 2021: No missing 'loser_id' or 'winner_id' values found.
✅ Year 2022: No missing 'loser_id' or 'winner_id' values found.
✅ Year 2023: No missing 'loser_id' or 'winner_id' values found.


### Skoro mamy już wspólne id zawodnika, oraz id turnieju to możemy w obu ramkach zdefiniować wspólne id meczu. Sprawdzamy czy id meczu jest unikalne dla każdego wiersza w ramkach

In [29]:
# match_id = "<tourney_location>_<year>_<winner_id>_<loser_id>", built the same way in both sources.
for year in github:
    for frame in (github[year], bets[year]):
        frame['match_id'] = (
            frame['tourney_location'].astype(str)
            + f"_{year}_"
            + frame['winner_id'].astype(str)
            + '_'
            + frame['loser_id'].astype(str)
        )

In [30]:
from test_helpers.preprocessing import check_match_id_uniqueness

# Project helper; per the recorded output it reports per year whether all
# 'match_id' values are unique in the named dataset.
check_match_id_uniqueness(github, "github")
check_match_id_uniqueness( bets, "bets")

✅ Year 2021: All 'match_id' values are unique in github dataset.
✅ Year 2022: All 'match_id' values are unique in github dataset.
✅ Year 2023: All 'match_id' values are unique in github dataset.
✅ Year 2021: All 'match_id' values are unique in bets dataset.
✅ Year 2022: All 'match_id' values are unique in bets dataset.
✅ Year 2023: All 'match_id' values are unique in bets dataset.


### W bets 2023 jest źle wpisany wynik meczu Bu Y. vs Kecmanovic M. w pierwszej rundzie turnieju w Szanghaju: wpisane jest zwycięstwo Kecmanovica, a przegrał on ten mecz. Taka sama sytuacja wystąpiła w meczu 1/8 finału pomiędzy Kuzmanov D. a Carballes Baena R. Musimy zamienić ze sobą wszystkie kolumny zawierające wartości dedykowane zwycięzcy/przegranemu

In [31]:
def _loser_column_for(winner_col):
    """Return the loser-side column name mirroring ``winner_col``, or None if it is not a winner column."""
    if winner_col == 'Winner':
        return 'Loser'
    if winner_col == 'winner_id':
        return 'loser_id'
    if winner_col.startswith('W'):
        return 'L' + winner_col[1:]
    if winner_col.endswith('W'):
        return winner_col[:-1] + 'L'
    return None


def swap_winner_loser_bets(match_ids: list[str], df: pd.DataFrame) -> pd.DataFrame:
    """Swap every winner/loser column pair for the given matches.

    Winner columns are ``Winner``, ``winner_id`` and any column that starts or
    ends with ``W``; each is paired with its loser counterpart by name
    (``WRank``/``LRank``, ``B365W``/``B365L``, ...). Pairing by name, rather
    than by position in two filtered lists, means a column without a
    counterpart (for example one that merely starts with ``L``) can neither be
    swapped nor shift the other pairs.

    Args:
        match_ids: ``match_id`` values of the rows whose result is reversed.
        df: Bets frame containing a ``match_id`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the selected rows swapped. The
        ``match_id`` column itself is not touched.
    """
    condition = df['match_id'].isin(match_ids)

    # Report every requested id that is absent, not only the "nothing matched" case.
    found_ids = set(df.loc[condition, 'match_id'])
    missing_ids = [match_id for match_id in match_ids if match_id not in found_ids]
    if missing_ids:
        print(f"No match found with match_id {missing_ids}")

    if not condition.any():
        return df

    column_names = set(df.columns)
    column_pairs = [
        (w_col, _loser_column_for(w_col))
        for w_col in df.columns
        if _loser_column_for(w_col) in column_names
    ]
    for w_col, l_col in column_pairs:
        # Snapshot the winner values before they are overwritten.
        winner_values = df.loc[condition, w_col].to_numpy(copy=True)
        df.loc[condition, w_col] = df.loc[condition, l_col].to_numpy()
        df.loc[condition, l_col] = winner_values

    return df

In [32]:
# Matches stored in bets 2023 with winner and loser reversed: current match_id -> corrected match_id.
reversed_match_ids = {
    "Marrakech_2023_106220_106148": "Marrakech_2023_106148_106220",
    "Shanghai_2023_200175_207352": "Shanghai_2023_207352_200175",
}
# Swap the winner/loser columns first, then bring match_id in line with the swapped player ids.
bets['2023'] = swap_winner_loser_bets(list(reversed_match_ids), bets['2023'])
bets['2023']['match_id'] = bets['2023']['match_id'].replace(reversed_match_ids)

### Sprawdźmy jeszcze, czy są jakieś mecze, które są tylko w jednej ramce.

In [31]:
from test_helpers.preprocessing import check_match_id_consistency

# Project helper; per the recorded output it compares the `match_id` values of both
# sources year by year and raises ValueError listing the ids found on one side only.
# NOTE(review): the recorded 2023 failure lists exactly the two ids rewritten by the
# swap cell above, and this cell's execution count (31) precedes that cell's (32) --
# the output looks stale; re-run the notebook top to bottom to confirm.
check_match_id_consistency(bets, github, 'bets', 'github')

✅ Year 2021: `match_id` values are consistent between bets and github.
✅ Year 2022: `match_id` values are consistent between bets and github.


ValueError: 
            ❌ Year 2023: Mismatch in `match_id` values between bets and github.
            - `match_id` values in bets but not in github: ['Marrakech_2023_106220_106148', 'Shanghai_2023_200175_207352']
            - `match_id` values in github but not in bets: ['Marrakech_2023_106148_106220', 'Shanghai_2023_207352_200175']
            

In [34]:
# Write both processed frames of every configured year as CSV files without the index column.
for year in years:
    key = str(year)
    bets[key].to_csv(f"{output_directory}bets{year}.csv", index=False)   # noqa
    github[key].to_csv(f"{output_directory}github{year}.csv", index=False)