In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the merged match table; the path is relative to the notebook's working directory.
matches_csv_path = '../data/merged_matches_with_time.csv'
matches = pd.read_csv(matches_csv_path)

### Let's see which columns contain null values and how many

In [19]:
# Count the missing entries in every column, then keep only the columns
# that have at least one missing entry.
missing_values = matches.isna().sum()
columns_with_nan = missing_values.loc[missing_values.gt(0)]

print("Columns with NaN values and their counts:", columns_with_nan, sep="\n")

Columns with NaN values and their counts:
W1                       97
L1                       95
W2                      188
L2                      188
W3                     7192
L3                     7192
W4                    12590
L4                    12590
W5                    13550
L5                    13550
Wsets                    95
Lsets                    97
AvgW                     12
AvgL                     12
winner_seed            7909
winner_entry          11716
winner_ht                37
loser_seed            10435
loser_entry           10510
loser_ht                 99
loser_age                 1
minutes                 189
w_ace                   100
w_df                    100
w_svpt                  100
w_1stIn                 100
w_1stWon                100
w_2ndWon                100
w_SvGms                  99
w_bpSaved               100
w_bpFaced               100
l_ace                   100
l_df                    100
l_svpt                  100
l_1stI

### Null values in Wsets and Lsets come from matches that were not Completed (e.g. Walkover or Retired)

In [20]:
# Sanity check: count rows marked 'Completed' that still have no Wsets value.
completed_mask = matches['Comment'].eq('Completed')
null_count = matches.loc[completed_mask, 'Wsets'].isna().sum()

print(f"Number of null values in 'WSets' where 'Comment' equals 'Completed': {null_count}")

Number of null values in 'WSets' where 'Comment' equals 'Completed': 0


In [21]:
# Names of players that appear at least once with no recorded height,
# collected separately from the winner side and from the loser side.
winner_ht_missing = matches['winner_ht'].isna()
loser_ht_missing = matches['loser_ht'].isna()
missing_winner_ht = matches.loc[winner_ht_missing, 'winner_name'].unique()
missing_loser_ht = matches.loc[loser_ht_missing, 'loser_name'].unique()

# Merge both sides into one array and drop the duplicates.
unique_players_missing_ht = pd.unique(np.concatenate([missing_winner_ht, missing_loser_ht]))

print("bez wzrostu:", unique_players_missing_ht, sep="\n")


bez wzrostu:
['Alen Avidzba' 'Flavio Cobolli' 'Gijs Brouwer' 'Manuel Guinard'
 'Camilo Ugo Carabelli' 'Ryan Peniston' 'Alastair Gray'
 'Alexander Ritschard' 'Ben Shelton' 'Andres Martin' 'Filip Misolic'
 'Franco Agamenone' 'Rinky Hijikata' 'Juan Alejandro Hernandez Serrano'
 'Brandon Holt' 'Aleksandar Kovacevic' 'Rio Noguchi' 'Francesco Passaro'
 'Nicolas Alvarez Varona' 'Jabor Al Mutawa' 'Jordi Samper Montana'
 'Takanyi Garanganga' 'Mubarak Shannan Zayid' 'Jack Mingjie Lin'
 'Andrea Basso' 'Cole Gromley' 'Alibek Kachmazov' 'Brian Shi'
 'Shintaro Mochizuki' 'Eduardo Nava' 'Evgenii Tiurnev'
 'Caleb Chakravarthi' 'Juncheng Shang' 'Max Hans Rehberg'
 'Carlos Gimeno Valero' 'Santiago Fa Rodriguez Taverna' 'Jesper De Jong'
 'Henri Squire' 'William Blumberg' 'Hernan Casanova'
 'Alexander Shevchenko' 'Ivan Gakhov' 'Mili Poljicak' 'Dino Prizmic'
 'Rodrigo Pacheco Mendez' 'Nick Chappell' 'Gonzalo Villanueva'
 'Alexis Galarneau' 'Learner Tien' 'Evan Furness' 'Facundo Mena'
 'Seong Chan Hong' 'Yu

### Może w innych wierszach ci zawodnicy mają podany wzrost

In [22]:
# For every player that has a missing height somewhere, look for a height
# recorded in any other row. Winner-side rows are checked first; loser-side
# rows are only consulted when the winner side yields nothing.
player_heights = {}
height_sources = (('winner_name', 'winner_ht'), ('loser_name', 'loser_ht'))

for player in unique_players_missing_ht:
    for name_col, height_col in height_sources:
        known_heights = matches.loc[matches[name_col] == player, height_col].dropna()
        if len(known_heights) > 0:
            # The first non-null height found is taken as the player's height.
            player_heights[player] = known_heights.iloc[0]
            break


print("Zawodnicy, ktorzy maja podany wzrost w df:")
for name, known_height in player_heights.items():
    print(f"{name}: {known_height}")

Zawodnicy, ktorzy maja podany wzrost w df:
Flavio Cobolli: 183.0
Gijs Brouwer: 191.0
Camilo Ugo Carabelli: 185.0
Ryan Peniston: 180.0
Alexander Ritschard: 193.0
Ben Shelton: 193.0
Andres Martin: 183.0
Filip Misolic: 180.0
Rinky Hijikata: 178.0
Brandon Holt: 185.0
Aleksandar Kovacevic: 183.0
Francesco Passaro: 180.0
Alibek Kachmazov: 185.0
Shintaro Mochizuki: 175.0
Juncheng Shang: 180.0
Max Hans Rehberg: 183.0
Jesper De Jong: 180.0
Alexander Shevchenko: 185.0
Ivan Gakhov: 191.0
Dino Prizmic: 188.0
Rodrigo Pacheco Mendez: 188.0
Nick Chappell: 178.0
Alexis Galarneau: 180.0
Learner Tien: 180.0
Yunseong Chung: 178.0
Beibit Zhukayev: 198.0
Sho Shimabukuro: 180.0
Luca Van Assche: 178.0
Emilio Nava: 183.0


spore rozczarowanie...

### Zamiana brakujących wartości na 0 w kolumnach setowych (oraz Wsets, Lsets i minutes) dla meczów zakończonych przez: Walkover, Retired, Awarded, Disqualified

In [23]:
# Normalise the misspelled 'Rrtired' label in the Comment column to 'Retired'.
matches['Comment'] = matches['Comment'].replace({'Rrtired': 'Retired'})

In [24]:

# For rows whose Comment is Disqualified / Retired / Walkover / Awarded,
# replace missing values in the listed columns with 0. Other rows are untouched.
columns_to_update = [
    'W1', 'W2', 'W3', 'W4', 'W5',
    'L1', 'L2', 'L3', 'L4', 'L5',
    'Wsets', 'Lsets', 'minutes',
]
unfinished_mask = matches['Comment'].isin(['Disqualified', 'Retired', 'Walkover', 'Awarded'])
matches.loc[unfinished_mask, columns_to_update] = (
    matches.loc[unfinished_mask, columns_to_update].fillna(0)
)

In [25]:
# Manual corrections for two specific matches, selected by match_id.
estoril_match = matches['match_id'] == 'Estoril_2022_202104_126207'
matches.loc[estoril_match, 'Wsets'] = 2

metz_match = matches['match_id'] == 'Metz_2019_104542_105379'
metz_fix = {'W2': 7, 'L2': 6, 'W3': 6, 'L3': 3, 'Wsets': 2}
matches.loc[metz_match, list(metz_fix)] = list(metz_fix.values())

### Entry

In [26]:
# Only these entry codes are kept; the remaining values are either unclear
# or not needed, so they are blanked out before encoding.
values_to_encode = ['WC', 'Q', 'LL']

for side in ('winner', 'loser'):
    entry_col = f'{side}_entry'
    # Anything outside the allowed codes becomes NaN.
    matches[entry_col] = matches[entry_col].where(matches[entry_col].isin(values_to_encode))
    # One-hot encode the filtered column; the source column is replaced by
    # its '<side>_entry_<code>' indicator columns.
    matches = pd.get_dummies(matches, columns=[entry_col], prefix=entry_col, prefix_sep='_')

### Seed (rozstawienie): zamieniamy kolumnę, która określała numer rozstawienia zawodnika (NaN, gdy zawodnik był nierozstawiony), na binarną kolumnę określającą, czy zawodnik był rozstawiony

In [27]:
# Collapse each seed column into a 0/1 flag (1 when a seed value is present),
# then discard the original seed columns.
for side in ('winner', 'loser'):
    matches[f'{side}_is_seeded'] = matches[f'{side}_seed'].notna().astype(int)
matches = matches.drop(columns=['winner_seed', 'loser_seed'])

### Minutes: dla meczów, które zostały ukończone, braki danych zastępujemy średnim czasem meczu (liczonym osobno dla każdej wartości best_of)

In [28]:
# Mean duration of completed matches, rounded, computed per best_of value.
is_completed = matches['Comment'] == 'Completed'
average_minutes = matches.loc[is_completed].groupby('best_of')['minutes'].mean().round()

# Completed matches with no recorded duration receive the mean for their best_of value.
needs_minutes = is_completed & matches['minutes'].isna()
matches.loc[needs_minutes, 'minutes'] = matches.loc[needs_minutes, 'best_of'].map(average_minutes)

### Rank i Rank points: NaN w kolumnach Rank oraz Rank points oznacza zawodnika bez rankingu

In [29]:
# Missing ranking data is replaced with fixed defaults:
# rank 2000 and 0 ranking points, for both winner and loser.
rank_defaults = {
    'winner_rank': 2000,
    'loser_rank': 2000,
    'winner_rank_points': 0,
    'loser_rank_points': 0,
}
for column, default in rank_defaults.items():
    matches.loc[matches[column].isna(), column] = default

### AGE, tylko 2 braki danych, ręcznie poprawiamy

In [30]:
# Manually fill the missing loser_age values for two specific players.
# Only rows where loser_age is actually missing are written, so an age that is
# already recorded for these players in other matches is not overwritten.
# NOTE(review): both players are assigned 21 — confirm against their birth dates.
manual_loser_ages = {'Liam Krall': 21, 'Manas Dhamne': 21}
for player_name, age in manual_loser_ages.items():
    needs_age = (matches['loser_name'] == player_name) & matches['loser_age'].isna()
    matches.loc[needs_age, 'loser_age'] = age

In [31]:
# Re-run the missing-value report on the frame as it stands at this point.
missing_values = matches.isna().sum()
columns_with_nan = missing_values.loc[missing_values.gt(0)]

print("Columns with NaN values and their counts:", columns_with_nan, sep="\n")

Columns with NaN values and their counts:
W3            6832
L3            6832
W4           12171
L4           12171
W5           13119
L5           13119
AvgW            12
AvgL            12
winner_ht       37
loser_ht        99
w_ace          100
w_df           100
w_svpt         100
w_1stIn        100
w_1stWon       100
w_2ndWon       100
w_SvGms         99
w_bpSaved      100
w_bpFaced      100
l_ace          100
l_df           100
l_svpt         100
l_1stIn        100
l_1stWon       100
l_2ndWon       100
l_SvGms         99
l_bpSaved      100
l_bpFaced      100
dtype: int64


In [32]:
# Persist the processed table as CSV; the row index is not written.
output_csv_path = '../data/matches.csv'
matches.to_csv(output_csv_path, index=False)