In [4]:
import pandas as pd
import numpy as np

In [40]:
# Load the merged per-year match data that the rest of the notebook cleans.
MATCHES_CSV = '../data/merged_year_data/matches.csv'
matches = pd.read_csv(MATCHES_CSV)

In [49]:
# Count the missing entries in every column of the matches table.
missing_values = matches.isna().sum()

# Keep only the columns that contain at least one NaN.
columns_with_nan = missing_values.loc[missing_values.gt(0)]

# Report the affected columns together with their NaN counts.
print("Columns with NaN values and their counts:", columns_with_nan, sep="\n")

Columns with NaN values and their counts:
W3            6832
L3            6832
W4           12171
L4           12171
W5           13119
L5           13119
winner_ht      175
loser_ht       327
w_ace          100
w_df           100
w_svpt         100
w_1stIn        100
w_1stWon       100
w_2ndWon       100
w_SvGms         99
w_bpSaved      100
w_bpFaced      100
l_ace          100
l_df           100
l_svpt         100
l_1stIn        100
l_1stWon       100
l_2ndWon       100
l_SvGms         99
l_bpSaved      100
l_bpFaced      100
dtype: int64


### Null values in `Wsets` and `Lsets` occur only for matches that were not completed (walkover or retired)

In [4]:
# Count missing `Wsets` values among the matches whose `Comment` is 'Completed'.
completed_wsets = matches.loc[matches['Comment'].eq('Completed'), 'Wsets']
null_count = completed_wsets.isna().sum()

print(f"Number of null values in 'WSets' where 'Comment' equals 'Completed': {null_count}")

Number of null values in 'WSets' where 'Comment' equals 'Completed': 0


In [None]:
# Row masks for matches where a player's height is not recorded.
winner_ht_missing = matches['winner_ht'].isna()
loser_ht_missing = matches['loser_ht'].isna()

# Distinct names on each side of the match that lack a height.
missing_winner_ht = matches.loc[winner_ht_missing, 'winner_name'].unique()
missing_loser_ht = matches.loc[loser_ht_missing, 'loser_name'].unique()

# Merge both sides into one de-duplicated array (winner-side names come first).
unique_players_missing_ht = pd.unique(
    np.concatenate([missing_winner_ht, missing_loser_ht])
)

print("bez wzrostu:")
print(unique_players_missing_ht)


bez wzrostu:
['Flavio Cobolli' 'Ben Shelton' 'Juncheng Shang' 'Dalibor Svrcina'
 'Rinky Hijikata' 'Brandon Holt' 'Luciano Darderi' 'Camilo Ugo Carabelli'
 'Matija Pecotic' 'Gijs Brouwer' 'Alexander Ritschard' 'Riccardo Bonadio'
 'Jacopo Berrettini' 'Aleksandar Kovacevic' 'Francesco Passaro'
 'Luca Van Assche' 'Ivan Gakhov' 'Alexander Shevchenko'
 'Abedallah Shelbayh' 'Pablo Llamas Ruiz' 'Genaro Alberto Olivieri'
 'Ryan Peniston' 'Jan Choinski' 'Filip Misolic' 'Alex Michelsen'
 'Ethan Quinn' 'Dino Prizmic' 'Sho Shimabukuro' 'Gabriel Diallo'
 'Omni Kumar' 'Jakub Mensik' 'Titouan Droguet' 'Yu Hsiou Hsu'
 'Philip Sekulic' 'Alibek Kachmazov' 'Bu Yunchaokete' 'Beibit Zhukayev'
 'Terence Atmane' 'Mark Lajal' 'Giovanni Mpetshi Perricard'
 'Shintaro Mochizuki' 'Billy Harris' 'Manuel Guinard' 'Alastair Gray'
 'Franco Agamenone' 'Andres Martin' 'Juan Alejandro Hernandez Serrano'
 'Rio Noguchi' 'Nicolas Alvarez Varona' 'Alen Avidzba' 'Manas Dhamne'
 'Kiranpal Pannu' 'Mattia Bellucci' 'Oleksii Krut

### Może w innych wierszach ci zawodnicy mają podany wzrost

In [7]:
# For every player that has at least one row with a missing height, look for a
# height recorded in any other row of `matches`.
#
# Instead of filtering the whole frame (up to twice) for each player, build one
# long-format (name, height) table and take the first known height per name.
# Winner-side rows are stacked before loser-side rows, so a height recorded on
# the winner side takes precedence, exactly as in a winner-then-loser lookup.
known_heights = pd.concat(
    [
        matches[['winner_name', 'winner_ht']].rename(
            columns={'winner_name': 'name', 'winner_ht': 'ht'}
        ),
        matches[['loser_name', 'loser_ht']].rename(
            columns={'loser_name': 'name', 'loser_ht': 'ht'}
        ),
    ],
    ignore_index=True,
).dropna()  # drop rows without a height (or without a name)

# One entry per player: the first non-null height in the stacked order.
first_known_height = (
    known_heights.drop_duplicates(subset='name', keep='first')
    .set_index('name')['ht']
)

# Keep only the players of interest, preserving their original order.
player_heights = {
    player: first_known_height.loc[player]
    for player in unique_players_missing_ht
    if player in first_known_height.index
}

print("Zawodnicy, ktorzy maja podany wzrost w df:")
for player, height in player_heights.items():
    print(f"{player}: {height}")

Zawodnicy, ktorzy maja podany wzrost w df:
Emilio Nava: 183.0


Spore rozczarowanie: tylko jeden zawodnik (Emilio Nava) ma wzrost podany w innym wierszu.

### Uzupełnienie brakujących wartości w kolumnach setowych dla meczów zakończonych przez: walkover, retired, awarded, disqualified

In [41]:
# Normalise a misspelled label in the `Comment` column.
comment_typos = {'Rrtired': 'Retired'}
matches['Comment'] = matches['Comment'].replace(comment_typos)

In [42]:

# Per-set game counts, set totals and duration: the columns that get zero-filled.
columns_to_update = ['W1', 'W2', 'W3', 'W4', 'W5', 'L1', 'L2', 'L3', 'L4', 'L5','Wsets','Lsets', 'minutes']

# Rows whose `Comment` marks the match as not played to a regular finish.
unfinished = matches['Comment'].isin(['Disqualified','Retired', 'Walkover', 'Awarded'])

# Replace the missing values in those rows with 0; other rows stay untouched.
matches.loc[unfinished, columns_to_update] = (
    matches.loc[unfinished, columns_to_update].fillna(0)
)

In [43]:
# Hand-entered corrections for two individual matches, selected by `match_id`.
estoril_match = matches['match_id'].eq('Estoril_2022_202104_126207')
matches.loc[estoril_match, 'Wsets'] = 2

metz_match = matches['match_id'].eq('Metz_2019_104542_105379')
metz_columns = ['W2','L2', 'W3','L3', 'Wsets']
matches.loc[metz_match, metz_columns] = [7, 6, 6, 3, 2]

### Entry

In [44]:
# Only these entry codes are kept; the remaining values are either of unclear
# meaning or not needed.
values_to_encode = ['WC', 'Q', 'LL']

for side in ('winner', 'loser'):
    col = f'{side}_entry'
    # Blank out (NaN) every entry code that is not in the list above ...
    matches[col] = matches[col].where(matches[col].isin(values_to_encode))
    # ... then one-hot encode the column; NaN rows get no indicator set.
    matches = pd.get_dummies(matches, columns=[col], prefix=col, prefix_sep='_')

### Seed

In [45]:
# Replace the raw seed columns with 0/1 flags: 1 when a seed value is present.
seed_flags = {
    'W_IsSeeded': matches['winner_seed'].notna().astype(int),
    'L_IsSeeded': matches['loser_seed'].notna().astype(int),
}
matches = matches.assign(**seed_flags).drop(columns=['winner_seed', 'loser_seed'])

### Minutes

In [46]:
# Mean duration (rounded) of completed matches, separately per `best_of` format.
completed = matches['Comment'] == 'Completed'
average_minutes = matches[completed].groupby('best_of')['minutes'].mean().round()

# Completed matches without a recorded duration get the average for their format.
needs_minutes = completed & matches['minutes'].isna()
format_average = matches['best_of'].map(average_minutes)
matches.loc[needs_minutes, 'minutes'] = format_average

### Rank i Rank points

In [47]:
# Placeholder values for missing rankings: rank 1000 and 0 ranking points.
rank_defaults = {
    'winner_rank': 1000,
    'loser_rank': 1000,
    'winner_rank_points': 0,
    'loser_rank_points': 0,
}
for column, default in rank_defaults.items():
    matches[column] = matches[column].fillna(default)

### Age

In [48]:
# Hand-entered ages for players whose `loser_age` is missing.
# NOTE(review): both players get the same age of 21, which looks copy-pasted,
# and the value is applied regardless of the match date -- confirm both ages
# against the players' dates of birth.
manual_loser_ages = {
    'Liam Krall': 21,
    'Manas Dhamne': 21,
}

for player_name, age in manual_loser_ages.items():
    # Fill only the rows where the age is actually missing, so that an age
    # already present in another row of the same player is never overwritten.
    needs_age = (matches['loser_name'] == player_name) & matches['loser_age'].isna()
    matches.loc[needs_age, 'loser_age'] = age

In [50]:
# Persist the cleaned table next to the raw merged file, without the index column.
CLEANED_MATCHES_CSV = '../data/merged_year_data/matches_cleaned.csv'
matches.to_csv(CLEANED_MATCHES_CSV, index=False)