In [1]:
import pandas as pd

## Wczytanie danych bets

In [2]:
# Load one betting-odds workbook per season into a dict keyed by the year as a string.
# The annotation uses a comma: `dict[str: pd.DataFrame]` would be a slice, not a key/value pair.
bets: dict[str, pd.DataFrame] = {}
for year in range(2021, 2024):
    file_name = f"../data/raw_data/{year}.xlsx"
    bets[str(year)] = pd.read_excel(file_name)
    print(f"Loaded bets_{year} from {file_name}")

Loaded bets_2021 from ../data/raw_data/2021.xlsx
Loaded bets_2022 from ../data/raw_data/2022.xlsx
Loaded bets_2023 from ../data/raw_data/2023.xlsx


In [3]:
# Harmonise tournament locations and player names in every yearly bets frame and
# drop the tournaments that are excluded from the analysis.
location_aliases = {'Dubai ': 'Dubai', 'Belgrade ': 'Belgrade', 'Napoli': 'Naples', 'Montreal': 'Toronto'}
player_name_aliases = {"Varillas J. P.": "Varillas J.P.", "Tseng C. H.": "Tseng C.H."}
excluded_tournaments = ['United Cup', 'Tour Finals', 'NextGen Finals', 'Tokyo Olympics', 'Atp Cup', 'Laver Cup', 'Melbourne Summer Set', 'Great Ocean Road Open', 'Murray River Open']

for year in bets:
    frame = bets[year]

    # The two Adelaide events share one city, so they get distinct location labels.
    frame.loc[frame["Tournament"] == "Adelaide International 1", "Location"] = "Adelaide 1"
    frame.loc[frame["Tournament"] == "Adelaide International 2", "Location"] = "Adelaide 2"
    frame["Location"] = frame["Location"].replace(location_aliases)

    # Take an explicit copy after each filter: the assignments that follow must write
    # to an independent frame, not to a slice of the previous one
    # (this is what triggers SettingWithCopyWarning otherwise).
    frame = frame[~frame['Location'].isin(['Turin'])].copy()
    frame['Loser'] = frame['Loser'].replace(player_name_aliases)
    frame['Winner'] = frame['Winner'].replace(player_name_aliases)

    frame = frame[~frame['Tournament'].isin(excluded_tournaments)].copy()

    # Second events held in a city that already appears under another tournament.
    frame.loc[frame['Tournament'] == 'BNP Paribas Masters', 'Location'] = 'Paris 2'
    frame.loc[frame['Tournament'] == 'Belgrade Open', 'Location'] = 'Belgrade 2'

    bets[year] = frame

In [4]:
# Rename the location column to the name shared with the GitHub frames and
# renumber the rows after the filtering done earlier.
for bets_frame in bets.values():
    bets_frame.rename(columns={'Location': 'tourney_location'}, inplace=True)
    bets_frame.reset_index(drop=True, inplace=True)

In [5]:
# Show the column index of every yearly bets frame.
for season, bets_frame in bets.items():
    print(f"Year {season}: {bets_frame.columns}")

Year 2021: Index(['ATP', 'tourney_location', 'Tournament', 'Date', 'Series', 'Court',
       'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank',
       'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5',
       'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL',
       'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')
Year 2022: Index(['ATP', 'tourney_location', 'Tournament', 'Date', 'Series', 'Court',
       'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank',
       'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5',
       'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL',
       'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')
Year 2023: Index(['ATP', 'tourney_location', 'Tournament', 'Date', 'Series', 'Court',
       'Surface', 'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank',
       'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5',
       'L5', '

## Sprawdzenie czy na pewno te same kolumny

In [6]:
# Compare the column sets of the three bets frames (column order is ignored).
cols_2021, cols_2022, cols_2023 = (set(bets[season].columns) for season in ("2021", "2022", "2023"))
columns_match = cols_2021 == cols_2022 == cols_2023
print("All DataFrames have the same columns." if columns_match else "DataFrames have different columns.")

All DataFrames have the same columns.


## Wczytanie danych github

In [7]:
# Load one ATP match CSV per season into a dict keyed by the year as a string.
# The annotation uses a comma: `dict[str: pd.DataFrame]` would be a slice, not a key/value pair.
github: dict[str, pd.DataFrame] = {}
for year in range(2021, 2024):
    file_name = f"../data/raw_data/atp_matches_{year}.csv"
    github[str(year)] = pd.read_csv(file_name)
    print(f"Loaded github_{year} from {file_name}")

Loaded github_2021 from ../data/raw_data/atp_matches_2021.csv
Loaded github_2022 from ../data/raw_data/atp_matches_2022.csv
Loaded github_2023 from ../data/raw_data/atp_matches_2023.csv


## Sprawdzenie czy zbiory mają takie same kolumny

In [8]:
# Compare the column sets of the three GitHub frames (column order is ignored).
cols_2021, cols_2022, cols_2023 = (set(github[season].columns) for season in ("2021", "2022", "2023"))
columns_match = cols_2021 == cols_2022 == cols_2023
print("All DataFrames have the same columns." if columns_match else "DataFrames have different columns.")

All DataFrames have the same columns.


### Zgodnie z założeniami usuwamy Finals, NextGen Finals, United Cup oraz mecze Davis Cup

In [9]:
# Drop Davis Cup ties and the other excluded events from every GitHub frame,
# then strip the trailing space from the 'Belgrade ' tournament name.
excluded_github_tournaments = ['Laver Cup', 'United Cup', 'Tour Finals', 'NextGen Finals', 'Tokyo Olympics', 'Atp Cup', 'Melbourne Summer Set', 'Melbourne', 'Great Ocean Road Open', 'Murray River Open']

for year in github:
    tourney_name = github[year]['tourney_name']
    is_davis_cup = tourney_name.str.contains('Davis Cup', na=False)
    is_excluded = tourney_name.isin(excluded_github_tournaments)

    # Explicit copy: the column assignment below must write to an independent frame,
    # not to a filtered slice of the loaded data (avoids SettingWithCopyWarning).
    kept = github[year][~is_davis_cup & ~is_excluded].copy()
    kept["tourney_name"] = kept["tourney_name"].replace({'Belgrade ': 'Belgrade'})
    github[year] = kept

### W tej ramce kolumna 'tourney_name' to dla większości turniejów lokalizacja turnieju a nie jego nazwa, a potrzebujemy żeby nazwa turnieju była taka jak w ramce bets. Dla turniejów, dla których 'tourney_name' to nie lokalizacja, ręcznie wpisujemy lokalizację

In [10]:
# 'tourney_name' holds the host location for most events; rename it accordingly and
# hand-map the remaining event names to the location labels used in the bets frames.
location_by_tourney_name = {
    "Australian Open": "Melbourne",
    "Indian Wells Masters": "Indian Wells",
    "Miami Masters": "Miami",
    "Monte Carlo Masters": "Monte Carlo",
    "Madrid Masters": "Madrid",
    "Rome Masters": "Rome",
    "Roland Garros": "Paris",
    "s Hertogenbosch": "'s-Hertogenbosch",
    "Queen's Club": "Queens Club",
    "Wimbledon": "London",
    "Canada Masters": "Toronto",
    "Cincinnati Masters": "Cincinnati",
    "Us Open": "New York",
    "Astana": "Nur-Sultan",
    "Shanghai Masters": "Shanghai",
    "Paris Masters": "Paris 2",
    "Rio De Janeiro": "Rio de Janeiro",
}
for github_frame in github.values():
    github_frame.rename(columns={'tourney_name': 'tourney_location'}, inplace=True)
    github_frame['tourney_location'] = github_frame['tourney_location'].replace(location_by_tourney_name)

## Po sprawdzeniu danych historycznych wykryliśmy błąd

In [11]:
# Correct a mis-attributed player in the 2023 data: the loser recorded as
# 'Eduardo Nava' (id 124013) is rewritten to 'Emilio Nava' (id 207182).
matches_2023 = github['2023']
matches_2023['loser_name'] = matches_2023['loser_name'].replace({'Eduardo Nava': 'Emilio Nava'})
matches_2023['loser_id'] = matches_2023['loser_id'].replace({124013: 207182})

### Sprawdzamy, że utworzona przez nas kolumna 'tourney_location' odpowiada kolumnie 'tourney_location' w ramce bets

In [12]:
# For each season print the locations present in one frame but missing from the other
# (GitHub-only first, then bets-only); empty arrays mean the labels agree.
for year in github:
    github_locations = github[year]['tourney_location']
    bets_locations = bets[year]['tourney_location']
    print(github_locations[~github_locations.isin(bets_locations)].unique())
    print(bets_locations[~bets_locations.isin(github_locations)].unique())

[]
[]
[]
[]
[]
[]


### Potrzebujemy mieć takie same nazwy zawodników w obu ramkach, aby dodać do ramki bets id zawodnika z ramki github. Aktualnie w ramce bets nazwa zawodnika jest w formacie typu Djokovic N., a w ramce github w formacie typu Novak Djokovic, dlatego musimy stworzyć nazwy zawodników w skróconym formacie w ramce github.

In [13]:
def transform_name(name: str) -> str:
    """Convert a 'First [Middle ...] Last' name into the 'Rest F.' short form.

    The first whitespace-separated token is treated as the first name and reduced
    to its initial; every remaining token is kept, in order, as the leading part,
    e.g. ``"Novak Djokovic" -> "Djokovic N."``.

    Args:
        name: Full player name; surrounding and repeated whitespace is ignored.

    Returns:
        The shortened name. An empty or whitespace-only input yields ``""`` and a
        single-token name is returned unchanged, because no initial can be split off
        (previously these cases raised ``IndexError`` or produced ``" X."`` with a
        leading space).
    """
    name_parts = name.split()
    if not name_parts:
        return ""

    first_name, *remaining_parts = name_parts
    if not remaining_parts:
        # Only one token: there is nothing to abbreviate against.
        return first_name

    last_name = ' '.join(remaining_parts)
    return f"{last_name} {first_name[0]}."

In [14]:
# Add shortened-name columns for both the winner and the loser of every match.
for github_frame in github.values():
    for role in ('winner', 'loser'):
        github_frame[f'shortened_{role}_name'] = github_frame[f'{role}_name'].apply(transform_name)

### Sprawdzamy czy skrócona nazwa zawodnika jednoznacznie określa zawodnika

In [15]:
# Per season, compare the number of distinct shortened names with the number of
# distinct player ids (winners first, then losers).
for github_frame in github.values():
    for role in ('winner', 'loser'):
        distinct_names = github_frame[f'shortened_{role}_name'].nunique()
        distinct_ids = github_frame[f'{role}_id'].nunique()
        print(distinct_names == distinct_ids)

True
False
True
False
True
True


## Zobaczmy, którzy zawodnicy nie są jednoznacznie określeni przez skróconą nazwę

In [16]:
# Collect, per season, the shortened loser names that map to more than one loser id.
non_unique_losers = {}
for year, github_frame in github.items():
    ids_per_name = github_frame.groupby('shortened_loser_name')['loser_id'].nunique()
    ambiguous_names = ids_per_name.index[ids_per_name > 1].tolist()
    is_ambiguous = github_frame['shortened_loser_name'].isin(ambiguous_names)
    non_unique_losers[year] = github_frame.loc[is_ambiguous, ['shortened_loser_name', 'loser_id']].drop_duplicates()

# Report only the seasons that actually contain ambiguous names.
for i in range(2021, 2024):
    ambiguous_rows = non_unique_losers[str(i)]
    if ambiguous_rows.empty:
        continue
    print(f"Lata {i} - Przegrani z niejednoznacznymi skróconymi nazwami:")
    print(ambiguous_rows)

Lata 2021 - Przegrani z niejednoznacznymi skróconymi nazwami:
     shortened_loser_name  loser_id
448               Nava E.    207182
1976              Nava E.    124013
Lata 2022 - Przegrani z niejednoznacznymi skróconymi nazwami:
     shortened_loser_name  loser_id
310             Martin A.    105413
1793            Martin A.    211346


105413 - Martin Andrej -> Martin A.
211346 - Martin Andres -> Martin An.

## W celu rozróżnienia zawodników do skrótu imienia jednego z nich dopisujemy drugą literę imienia

In [17]:
# Give the players whose shortened names collide a two-letter first-name abbreviation.
# Both ids are handled in the winner and the loser column so that the shortened name
# stays consistent regardless of the match outcome (id 124013 was previously renamed
# only when it appeared as the loser).
disambiguated_short_names = {124013: 'Nava Ed.', 211346: 'Martin An.'}
for year in github:
    for player_id, short_name in disambiguated_short_names.items():
        github[year].loc[github[year]['loser_id'] == player_id, 'shortened_loser_name'] = short_name
        github[year].loc[github[year]['winner_id'] == player_id, 'shortened_winner_name'] = short_name

## Od razu musimy też wykonać tę samą zmianę w ramce bets

In [18]:
# Mirror the disambiguated names in the bets frames: 'Martin A.' at the Atlanta Open
# (2023 and 2022) becomes 'Martin An.', in both the Winner and the Loser column.
for season in ("2023", "2022"):
    season_bets = bets[season]
    at_atlanta = season_bets['Tournament'] == 'Atlanta Open'
    for role in ('Winner', 'Loser'):
        season_bets.loc[at_atlanta & (season_bets[role] == 'Martin A.'), role] = 'Martin An.'

# The 2021 Winston-Salem loser 'Nava E.' becomes 'Nava Ed.'.
bets_2021 = bets["2021"]
nava_lost_in_winston_salem = (bets_2021['tourney_location'] == 'Winston-Salem') & (bets_2021['Loser'] == 'Nava E.')
bets_2021.loc[nava_lost_in_winston_salem, 'Loser'] = 'Nava Ed.'

### Niektóre imiona i nazwiska ze względu na swoją unikalność lub brak konsekwencji w zapisie musimy zmodyfikować ręcznie

In [19]:
# Manual corrections that bring the automatically shortened GitHub loser names in line
# with the spelling used in the bets frames. The corrections shared by every season are
# declared once instead of being repeated in a separate literal per year.
common_name_fixes = {
    "Meligeni Alves F.": "Meligeni Rodrigues F",
    "Arnaud Bailly G.": "Bailly G.",
    "Sung Nam J.": "Nam J.S.",
    "Chan Hong S.": "Hong S.",
    "Fa Rodriguez Taverna S.": "Rodriguez Taverna S.",
    "Pucinelli De Almeida M.": "Pucinelli de Almeida M.",
    "Alejandro Hernandez Serrano J.": "Hernandez A.",
    "Marcel Stebe C.": "Stebe C.M.",
    "Martin del Potro J.": "Del Potro J.M.",
    "Marco Moroni G.": "Moroni G.M.",
    "Tsonga J.": "Tsonga J.W.",
    "Ignacio Londero J.": "Londero J.I.",
    "Pablo Ficovich J.": "Ficovich J.P.",
    "C.H. Tseng": "Tseng C. H.",
    "Oconnell C.": "O Connell C.",
    "Elahi Galan D.": "Galan D.E.",
    "Auger Aliassime F.": "Auger-Aliassime F.",
    "Woo Kwon S.": "Kwon S.W.",
    "Barrios Vera T.": "Barrios M.",
    "Yunchaokete B.": "Bu Y.",
    "Manuel Cerundolo J.": "Cerundolo J.M.",
    "Martin Etcheverry T.": "Etcheverry T.",
    "Hugues Herbert P.": "Herbert P.H.",
    "Hsiou Hsu Y.": "Hsu Y.",
    "Andrea Huesler M.": "Huesler M.A.",
    "Kuznetsov A.": "Kuznetsov An.",
    "Son Kwiatkowski T.": "Kwiatkowski T.S.",
    "Li Z.": "Li Zh.",
    "Hsin Tseng C.": "Tseng C. H.",
    "Kumar Mukund S.": "Mukund S.",
    "Ramos A.": "Ramos-Vinolas A.",
    "J Wolf J.": "Wolf J.J.",
    "Zhang Z.": "Zhang Zh.",
    "Pablo Varillas J.": "Varillas J. P.",
    "Lennard Struff J.": "Struff J.L.",
    "Lin Wu T.": "Wu T.L.",
    "Hans Rehberg M.": "Rehberg M.",
    "Mpetshi Perricard G.": "Mpetshi G.",
    "Agustin Tirante T.": "Tirante T.A.",
    "Alberto Olivieri G.": "Olivieri G.",
    "Nicolae Madaras D.": "Madaras D.",
    "Cong Mo Y.": "Mo Y.",
}
# Corrections that are applied to the 2021 season only.
name_fixes_2021_only = {
    "Patrick Smith J.": "Smith J.P.",
    "Shannan Zayid M.": "Zayid M.",
    "Hsun Lu Y.": "Lu Y.",
    "Aragone J.": "Aragone J.C.",
}

for year in github:
    loser_fixes = {**name_fixes_2021_only, **common_name_fixes} if year == '2021' else common_name_fixes
    github[year]['shortened_loser_name'] = github[year]['shortened_loser_name'].replace(loser_fixes)

In [20]:
# Manual corrections that bring the automatically shortened GitHub winner names in line
# with the spelling used in the bets frames. The corrections shared by every season are
# declared once instead of being repeated in a separate literal per year.
common_name_fixes = {
    "Meligeni Alves F.": "Meligeni Rodrigues F",
    "Arnaud Bailly G.": "Bailly G.",
    "Sung Nam J.": "Nam J.S.",
    "Chan Hong S.": "Hong S.",
    "Fa Rodriguez Taverna S.": "Rodriguez Taverna S.",
    "Pucinelli De Almeida M.": "Pucinelli de Almeida M.",
    "Alejandro Hernandez Serrano J.": "Hernandez A.",
    "Marcel Stebe C.": "Stebe C.M.",
    "Martin del Potro J.": "Del Potro J.M.",
    "Marco Moroni G.": "Moroni G.M.",
    "Tsonga J.": "Tsonga J.W.",
    "Ignacio Londero J.": "Londero J.I.",
    "Pablo Ficovich J.": "Ficovich J.P.",
    "C.H. Tseng": "Tseng C. H.",
    "Oconnell C.": "O Connell C.",
    "Elahi Galan D.": "Galan D.E.",
    "Auger Aliassime F.": "Auger-Aliassime F.",
    "Woo Kwon S.": "Kwon S.W.",
    "Barrios Vera T.": "Barrios M.",
    "Yunchaokete B.": "Bu Y.",
    "Manuel Cerundolo J.": "Cerundolo J.M.",
    "Martin Etcheverry T.": "Etcheverry T.",
    "Hugues Herbert P.": "Herbert P.H.",
    "Hsiou Hsu Y.": "Hsu Y.",
    "Andrea Huesler M.": "Huesler M.A.",
    "Kuznetsov A.": "Kuznetsov An.",
    "Son Kwiatkowski T.": "Kwiatkowski T.S.",
    "Li Z.": "Li Zh.",
    "Hsin Tseng C.": "Tseng C. H.",
    "Kumar Mukund S.": "Mukund S.",
    "Ramos A.": "Ramos-Vinolas A.",
    "J Wolf J.": "Wolf J.J.",
    "Zhang Z.": "Zhang Zh.",
    "Pablo Varillas J.": "Varillas J. P.",
    "Lennard Struff J.": "Struff J.L.",
    "Lin Wu T.": "Wu T.L.",
    "Hans Rehberg M.": "Rehberg M.",
    "Mpetshi Perricard G.": "Mpetshi G.",
    "Agustin Tirante T.": "Tirante T.A.",
    "Alberto Olivieri G.": "Olivieri G.",
    "Nicolae Madaras D.": "Madaras D.",
    "Cong Mo Y.": "Mo Y.",
}
# Correction that is applied to the winner column of every season.
winner_only_fixes = {"Varillas J.P.": "Varillas J. P."}
# Corrections that are applied to the 2021 season only.
name_fixes_2021_only = {
    "Patrick Smith J.": "Smith J.P.",
    "Shannan Zayid M.": "Zayid M.",
    "Hsun Lu Y.": "Lu Y.",
    "Aragone J.": "Aragone J.C.",
}

for year in github:
    winner_fixes = {**winner_only_fixes, **common_name_fixes}
    if year == '2021':
        winner_fixes.update(name_fixes_2021_only)
    github[year]['shortened_winner_name'] = github[year]['shortened_winner_name'].replace(winner_fixes)

In [21]:
# Per-season spelling corrections for the Loser column of the bets frames.
bets_loser_fixes = {
    '2022': {"Tseng C.H.": "Tseng C. H.", "Varillas J.P.": "Varillas J. P."},
    '2023': {"Tseng C.H.": "Tseng C. H.", "Varillas J.P.": "Varillas J. P.", "Meligeni Alves F.": "Meligeni Rodrigues F"},
    '2021': {"Varillas J.P.": "Varillas J. P."},
}
for season, fixes in bets_loser_fixes.items():
    bets[season]['Loser'] = bets[season]['Loser'].replace(fixes)

In [22]:
# Per-season spelling corrections for the Winner column of the bets frames.
bets_winner_fixes = {
    '2022': {"Tseng C.H.": "Tseng C. H.", "Varillas J.P.": "Varillas J. P."},
    '2023': {"Tseng C.H.": "Tseng C. H.", "Varillas J.P.": "Varillas J. P.", "Meligeni Alves F.": "Meligeni Rodrigues F"},
    '2021': {"Varillas J.P.": "Varillas J. P."},
}
for season, fixes in bets_winner_fixes.items():
    bets[season]['Winner'] = bets[season]['Winner'].replace(fixes)

## Po poprawkach każdy zawodnik w ramce bets ma już id

In [23]:
# Attach the GitHub loser id to every bets row by looking the loser up by shortened name.
for year in github:
    # First loser id recorded for each shortened loser name in this season.
    first_id_by_name = github[year].groupby('shortened_loser_name')['loser_id'].first()
    name_to_id = dict(zip(first_id_by_name.index, first_id_by_name))
    bets[year]['loser_id'] = bets[year]['Loser'].map(name_to_id)
    

In [24]:
# List the losers that did not receive an id; empty arrays mean every loser was matched.
for bets_frame in bets.values():
    without_id = bets_frame['loser_id'].isna()
    print(bets_frame.loc[without_id, 'Loser'].unique())

[]
[]
[]


In [25]:
# Attach the GitHub winner id to every bets row by looking the winner up by shortened name.
for year in github:
    # First winner id recorded for each shortened winner name in this season.
    first_id_by_name = github[year].groupby('shortened_winner_name')['winner_id'].first()
    name_to_id = dict(zip(first_id_by_name.index, first_id_by_name))
    bets[year]['winner_id'] = bets[year]['Winner'].map(name_to_id)

In [26]:
# List the winners that did not receive an id; empty arrays mean every winner was matched.
for bets_frame in bets.values():
    without_id = bets_frame['winner_id'].isna()
    print(bets_frame.loc[without_id, 'Winner'].unique())

[]
[]
[]


### Skoro mamy już wspólne id zawodnika, oraz id turnieju to możemy w obu ramkach zdefiniować wspólne id meczu. Sprawdzamy czy id meczu jest unikalne dla każdego wiersza w ramkach

In [27]:
# Build the same match key in both sources: <location>_<year>_<winner_id>_<loser_id>.
for year in github:
    for frame in (github[year], bets[year]):
        location_part = frame['tourney_location'].astype(str)
        winner_part = frame['winner_id'].astype(str)
        loser_part = frame['loser_id'].astype(str)
        frame['match_id'] = location_part + '_' + year + '_' + winner_part + '_' + loser_part

In [28]:
# Per season, check that the number of distinct match ids equals the number of rows
# (GitHub frame first, then bets frame).
for year in github:
    for frame in (github[year], bets[year]):
        print(frame['match_id'].nunique() == len(frame))


True
True
True
True
True
True


### W bets 2023 jest źle wpisany wynik meczu Bu Y. vs Kecmanovic M. w pierwszej rundzie turnieju w Szanghaju: wpisane jest zwycięstwo Kecmanovica, a przegrał on ten mecz. Taka sama sytuacja wystąpiła w meczu 1/8 finału pomiędzy Kuzmanov D. a Carballes Baena R. Musimy zamienić ze sobą wszystkie kolumny zawierające wartości dedykowane zwycięzcy/przegranemu.

In [29]:
def swap_winner_loser_bets(match_ids: list[str], df: pd.DataFrame) -> pd.DataFrame:
    """Swap every winner/loser column pair for the rows with the given match ids.

    A column is treated as a winner column when it is ``'Winner'`` or ``'winner_id'``,
    or when its name starts or ends with ``'W'``. Its loser counterpart is found by
    name (``'Loser'``, ``'loser_id'``, or the same name with the leading/trailing
    ``'W'`` replaced by ``'L'``). Pairs are matched by name rather than by position,
    so a winner column without a loser counterpart is skipped instead of being
    swapped with an unrelated column.

    The id columns are swapped together with the names; otherwise the corrected rows
    would keep the ids of the wrong players. The ``match_id`` column itself is left
    untouched.

    Args:
        match_ids: Values of ``df['match_id']`` identifying the rows to correct.
        df: Frame to correct; it is modified in place.

    Returns:
        The same ``df`` object. If none of ``match_ids`` is present, a message is
        printed and ``df`` is returned unchanged.
    """
    condition = df['match_id'].isin(match_ids)
    if not condition.any():
        print(f"No match found with match_id {match_ids}")
        return df

    existing_columns = set(df.columns)
    column_pairs = []
    for col in df.columns:
        # The explicit names are checked first: 'Winner' also starts with 'W',
        # but its counterpart is 'Loser', not 'Linner'.
        if col == 'Winner':
            counterpart = 'Loser'
        elif col == 'winner_id':
            counterpart = 'loser_id'
        elif col.startswith('W'):
            counterpart = 'L' + col[1:]
        elif col.endswith('W'):
            counterpart = col[:-1] + 'L'
        else:
            continue
        if counterpart in existing_columns:
            column_pairs.append((col, counterpart))

    for w_col, l_col in column_pairs:
        # Copy both sides before writing so the second assignment does not read
        # values already overwritten by the first one.
        winner_values = df.loc[condition, w_col].to_numpy(copy=True)
        loser_values = df.loc[condition, l_col].to_numpy(copy=True)
        df.loc[condition, w_col] = loser_values
        df.loc[condition, l_col] = winner_values

    return df

In [30]:
# Matches recorded with winner and loser the wrong way round in bets 2023:
# current match_id -> match_id with the two player ids in the corrected order.
corrected_match_ids = {
    "Marrakech_2023_106220_106148": "Marrakech_2023_106148_106220",
    "Shanghai_2023_200175_207352": "Shanghai_2023_207352_200175",
}
bets['2023'] = swap_winner_loser_bets(list(corrected_match_ids), bets['2023'])
bets['2023']['match_id'] = bets['2023']['match_id'].replace(corrected_match_ids)

### Sprawdźmy jeszcze, czy są jakieś mecze, które są tylko w jednej ramce.

In [31]:
# For each season print the match ids present in one source but missing from the other
# (bets-only first, then GitHub-only); empty arrays mean both sources cover the same matches.
for year in github:
    bets_match_ids = bets[year]['match_id']
    github_match_ids = github[year]['match_id']
    print(bets_match_ids[~bets_match_ids.isin(github_match_ids)].unique())
    print(github_match_ids[~github_match_ids.isin(bets_match_ids)].unique())

[]
[]
[]
[]
[]
[]


In [32]:
# Write every processed frame to <folder_path><source><year>.csv without the row index.
folder_path = '../data/Processed_data/'
for source_name, frames_by_year in (('bets', bets), ('github', github)):
    for season in ('2023', '2022', '2021'):
        frames_by_year[season].to_csv(folder_path + f'{source_name}{season}.csv', index=False)