In [25]:
import pandas as pd
import os

In [26]:
# Define the directory containing your CSV files
directory = "../../feature_creation/data/created_features_separate"

# Initialize an empty DataFrame to merge all features
merged_new_features = None

# Loop through all CSV files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and filename != "features_merged.csv" and filename != "aggregate_player_match_stats_diff.csv":
        file_path = os.path.join(directory, filename)
        print(f"Processing {filename}...")  
        
        # Read the current CSV file
        df = pd.read_csv(file_path)
        
        # Merge the current DataFrame with the merged_df on 'match_id'
        if merged_new_features is None:
            merged_new_features = df  # First file initializes the DataFrame
        else:
            merged_new_features = pd.merge(merged_new_features, df, on="match_id", how="outer")

print("All features merged")

Processing aggregate_player_match_stats.csv...
Processing elo.csv...
Processing fatigue.csv...
Processing h2h.csv...
Processing home_advantage.csv...
Processing injury.csv...
Processing last_10_win_record.csv...
Processing player_current_tournament_record.csv...
Processing tournament_win_loss_history.csv...
Processing weather.csv...
All features merged


In [27]:
preprocessed_matches = pd.read_csv("../../preprocessing/data/matches.csv")

In [28]:
merged_new_features.to_csv("../../feature_creation/data/features_merged.csv", index=False)

In [29]:
matches = pd.merge(preprocessed_matches, merged_new_features, on="match_id", how="outer")
matches = matches.sort_values(by=['Date']).reset_index(drop=True)

In [30]:
matches

Unnamed: 0,tournament_location,tournament_name,Date,tournament_level,indoor_or_outdoor,Surface,Round,W1,L1,W2,...,Loser_Set_Diff_Tournament,Loser_Game_Diff_Tournament,winner_total_wins_tournament_history,winner_total_losses_tournament_history,loser_total_wins_tournament_history,loser_total_losses_tournament_history,temperature_2m,relative_humidity_2m,windspeed_10m,apparent_temperature
0,Brisbane,Brisbane International,2017-12-31,ATP250,Outdoor,Hard,1st Round,7.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,27.0,77,9.6,30.9
1,Brisbane,Brisbane International,2017-12-31,ATP250,Outdoor,Hard,1st Round,6.0,4.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,28.9,68,15.2,31.8
2,Brisbane,Brisbane International,2018-01-01,ATP250,Outdoor,Hard,1st Round,6.0,3.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,19.7,52,8.8,18.3
3,Pune,Tata Open,2018-01-01,ATP250,Outdoor,Hard,1st Round,6.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,24.2,83,6.9,27.6
4,Pune,Tata Open,2018-01-01,ATP250,Outdoor,Hard,1st Round,6.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,16.2,56,10.9,13.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,Sofia,Sofia Open,2023-11-10,ATP250,Indoor,Hard,Semifinals,6.0,2.0,7.0,...,3.0,8.0,5.0,4.0,3.0,0.0,6.6,88,13.7,3.3
14107,Metz,Open de Moselle,2023-11-10,ATP250,Indoor,Hard,Semifinals,6.0,0.0,6.0,...,5.0,11.0,3.0,4.0,3.0,0.0,16.1,49,4.6,14.3
14108,Metz,Open de Moselle,2023-11-10,ATP250,Indoor,Hard,Semifinals,6.0,4.0,6.0,...,5.0,10.0,3.0,0.0,4.0,2.0,16.1,49,4.6,14.3
14109,Metz,Open de Moselle,2023-11-11,ATP250,Indoor,Hard,The Final,6.0,3.0,6.0,...,7.0,14.0,4.0,4.0,4.0,0.0,13.0,57,4.5,11.0


In [31]:
old_matches = pd.read_csv("../../old_data/everything_before_models/matches.csv")

In [32]:
import pandas as pd

def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    # Check if columns are the same
    if not df1.columns.equals(df2.columns):
        print("The DataFrames have different columns.")
        columns_only_in_df1 = set(df1.columns) - set(df2.columns)
        columns_only_in_df2 = set(df2.columns) - set(df1.columns)
        
        if columns_only_in_df1:
            print(f"Columns only in the first DataFrame: {columns_only_in_df1}")
        if columns_only_in_df2:
            print(f"Columns only in the second DataFrame: {columns_only_in_df2}")
        
        return False

    # Check if shapes are the same
    if df1.shape != df2.shape:
        print("The DataFrames have different shapes.")
        print(f"Shape of first DataFrame: {df1.shape}")
        print(f"Shape of second DataFrame: {df2.shape}")
        return False

    # Check if all values are the same
    if not df1.equals(df2):
        print("The DataFrames have different values.")
        mismatched_rows = (df1 != df2).any(axis=1)
        print(f"Number of mismatched rows: {mismatched_rows.sum()}")
        print("Example of mismatched rows from each DataFrame:")
        print("First DataFrame:")
        print(df1[mismatched_rows].head())
        print("Second DataFrame:")
        print(df2[mismatched_rows].head())
        return False

    # If all checks pass
    print("The DataFrames are identical.")
    return True

# Example usage
are_equal = compare_dataframes(old_matches, matches)
print(f"Are the DataFrames the same? {are_equal}")


The DataFrames have different columns.
Columns only in the first DataFrame: {'winner_service_games_won_pct', 'Loser_Total_aces', 'Loser_Total_minutes', 'Winner_Total_double_errors', 'Winner_Total_points_won_after_1st_serve', 'longitude', 'w_2ndIn', 'winner_1st_serve_return_win_pct', 'Winner_Total_2nd_serves', 'Winner_Total_points_won_after_2nd_serve', 'loser_total_1st_serve_win_pct', 'winner_CO_df_avg', 'loser_CO_ace_avg', 'loser_CO_serve_games_win_pct_avg', 'loser_bp_won_pct', 'loser_1st_serve_return_win_pct', 'Winner_Total_aces', 'loser_2nd_serve_win_pct', 'loser_CO_df_avg', 'loser_service_games_won_pct', 'winner_tournament_losses_before', 'winner_return_games_win_pct', 'l_2ndIn', 'loser_2nd_serve_return_win_pct', 'winner_2nd_serve_return_win_pct', 'winner_1st_serve_in_pct', 'loser_2nd_serve_in_pct', 'h2h_p2_wins', 'Loser_Total_serves', 'winner_CO_ace_avg', 'loser_bp_saved_pct', 'loser_df_avg', 'winner_total_1st_serve_in_pct', 'loser_1st_serve_win_pct', 'winner_2nd_serve_win_pct', 'l

In [33]:
list(matches.columns)

['tournament_location',
 'tournament_name',
 'Date',
 'tournament_level',
 'indoor_or_outdoor',
 'Surface',
 'Round',
 'W1',
 'L1',
 'W2',
 'L2',
 'W3',
 'L3',
 'W4',
 'L4',
 'W5',
 'L5',
 'Wsets',
 'Lsets',
 'Comment',
 'AvgW',
 'AvgL',
 'loser_id',
 'winner_id',
 'match_id',
 'tournament_id',
 'draw_size',
 'tournament_date',
 'winner_name',
 'winner_hand',
 'winner_ht',
 'winner_ioc',
 'winner_age',
 'loser_name',
 'loser_hand',
 'loser_ht',
 'loser_ioc',
 'loser_age',
 'best_of',
 'minutes',
 'w_ace',
 'w_df',
 'w_svpt',
 'w_1stIn',
 'w_1stWon',
 'w_2ndWon',
 'w_SvGms',
 'w_bpSaved',
 'w_bpFaced',
 'l_ace',
 'l_df',
 'l_svpt',
 'l_1stIn',
 'l_1stWon',
 'l_2ndWon',
 'l_SvGms',
 'l_bpSaved',
 'l_bpFaced',
 'winner_rank',
 'winner_rank_points',
 'loser_rank',
 'loser_rank_points',
 'time',
 'winner_entry_LL',
 'winner_entry_Q',
 'winner_entry_WC',
 'loser_entry_LL',
 'loser_entry_Q',
 'loser_entry_WC',
 'winner_is_seeded',
 'loser_is_seeded',
 'w_ace_avg',
 'l_ace_avg',
 'w_CO_ace_avg

### Dropimy kolumny, które nie będą używane do predykcji

In [34]:
matches = matches.drop(columns=["W1", "L1", "W2", "L2", "W3", "L3", "W4", "L4", "W5", "L5", "Wsets", "Lsets", "tournament_location", "tournament_name", "loser_id", "winner_id", "tournament_id", "draw_size", "tournament_date", "winner_name", "winner_ioc", "winner_ht", "loser_name", "loser_ht", "loser_ioc", "winner_rank_points", "loser_rank_points", "tournament_country", "Round", "time", 
'minutes',
'w_ace',
 'w_df',
 'w_svpt',
 'w_1stIn',
 'w_1stWon',
 'w_2ndWon',
 'w_SvGms',
 'w_bpSaved',
 'w_bpFaced',
'l_ace',
 'l_df',
 'l_svpt',
 'l_1stIn',
 'l_1stWon',
 'l_2ndWon',
 'l_SvGms',
 'l_bpSaved',
 'l_bpFaced',
 'Comment'  # Do jeszcze do pokmininenia
                                ])

### Sprawdźmy czy nie mamy braków danych

In [35]:
# Check for missing values in the DataFrame
missing_values = matches.isnull().sum()

# Filter columns with missing values
columns_with_nan = missing_values[missing_values > 0]

# Display the columns and their corresponding NaN counts
print("Columns with NaN values and their counts:")
print(columns_with_nan)

Columns with NaN values and their counts:
AvgW    12
AvgL    12
dtype: int64


### Chcemy, żeby wszystkie kolumny były numeryczne

In [36]:
def is_numeric_or_castable(series):
    try:
        pd.to_numeric(series)
        return True
    except TypeError:
        print(series)
    except ValueError:
        return False

In [37]:
problematic_columns = [col for col in matches.columns if not is_numeric_or_castable(matches[col])]

if not problematic_columns:
    print("All columns are numeric or can be cast to numeric values.")
else:
    print(f"The following columns are not numeric or contain non-castable values: {problematic_columns}")

The following columns are not numeric or contain non-castable values: ['Date', 'tournament_level', 'indoor_or_outdoor', 'Surface', 'match_id', 'winner_hand', 'loser_hand']


In [38]:
matches['indoor_or_outdoor'].unique()

array(['Outdoor', 'Indoor'], dtype=object)

In [39]:
matches['indoor_or_outdoor'] = matches['indoor_or_outdoor'].map({'Outdoor': 1, 'Indoor': 0})
matches.rename(columns={'indoor_or_outdoor': 'outdoor'}, inplace=True)

In [40]:
matches['Surface'].unique()

array(['Hard', 'Clay', 'Grass'], dtype=object)

In [41]:
matches = pd.get_dummies(matches, columns=['Surface'], prefix='Surface')

In [42]:
matches["tournament_level"].unique()

array(['ATP250', 'Grand Slam', 'ATP500', 'Masters 1000'], dtype=object)

### Tu mi się nie podoba że jest ten level z githuba że jest A dla atp250 i atp500, kiedyś do poprawy, żeby były oddzielne levele dla atp250 i atp500

In [43]:
matches["tournament_level"] = matches["tournament_level"].map({"A": 0, "M": 1, "G": 2})

In [44]:
matches["winner_hand"].unique()
matches["loser_hand"].unique()

array(['R', 'L', 'U'], dtype=object)

In [45]:
matches["winner_hand"] = matches["winner_hand"].map({"R": 1, "L": 0, "U": 0}) # U traktujemy jak lewą, bo nie wiem ilu graczy jest serio oburęcznych
matches.rename(columns={'winner_hand': 'winner_right_handed'}, inplace=True)
matches["loser_hand"] = matches["loser_hand"].map({"R": 1, "L": 0, "U": 0})
matches.rename(columns={'loser_hand': 'loser_right_handed'}, inplace=True)

In [46]:
matches["best_of"].unique()

array([3, 5])

In [47]:
matches["best_of"] = matches["best_of"].map({3: 1, 5: 0})

In [48]:
matches

Unnamed: 0,Date,tournament_level,outdoor,AvgW,AvgL,match_id,winner_right_handed,winner_age,loser_right_handed,loser_age,...,winner_total_losses_tournament_history,loser_total_wins_tournament_history,loser_total_losses_tournament_history,temperature_2m,relative_humidity_2m,windspeed_10m,apparent_temperature,Surface_Clay,Surface_Grass,Surface_Hard
0,2017-12-31,,1,2.96,1.39,Brisbane_2018_200282_105449,1,18.8,1,28.0,...,0.0,0.0,0.0,27.0,77,9.6,30.9,False,False,True
1,2017-12-31,,1,2.17,1.68,Brisbane_2018_105238_106043,1,29.1,1,25.3,...,0.0,0.0,0.0,28.9,68,15.2,31.8,False,False,True
2,2018-01-01,,1,2.20,1.66,Brisbane_2018_104547_124014,0,32.6,1,21.4,...,0.0,0.0,0.0,19.7,52,8.8,18.3,False,False,True
3,2018-01-01,,1,1.24,3.99,Pune_2018_105916_106044,1,25.8,1,25.3,...,0.0,0.0,0.0,24.2,83,6.9,27.6,False,False,True
4,2018-01-01,,1,3.19,1.34,Pune_2018_106120_106210,1,24.9,0,24.4,...,0.0,0.0,0.0,16.2,56,10.9,13.9,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14106,2023-11-10,,0,1.62,2.29,Sofia_2023_105173_200303,0,35.3,1,24.9,...,4.0,3.0,0.0,6.6,88,13.7,3.3,False,False,True
14107,2023-11-10,,0,1.21,4.31,Metz_2023_200005_104926,0,25.3,1,36.4,...,4.0,3.0,0.0,16.1,49,4.6,14.3,False,False,True
14108,2023-11-10,,0,1.46,2.69,Metz_2023_207686_105732,1,22.9,1,32.6,...,0.0,4.0,2.0,16.1,49,4.6,14.3,False,False,True
14109,2023-11-11,,0,1.35,3.19,Metz_2023_200005_207686,0,25.3,1,22.9,...,4.0,4.0,0.0,13.0,57,4.5,11.0,False,False,True


In [49]:
problematic_columns = [col for col in matches.columns if not is_numeric_or_castable(matches[col])]

if not problematic_columns:
    print("All columns are numeric or can be cast to numeric values.")
else:
    print(f"The following columns are not numeric or contain non-castable values: {problematic_columns}")

The following columns are not numeric or contain non-castable values: ['Date', 'match_id']


In [50]:
list(matches.columns)

['Date',
 'tournament_level',
 'outdoor',
 'AvgW',
 'AvgL',
 'match_id',
 'winner_right_handed',
 'winner_age',
 'loser_right_handed',
 'loser_age',
 'best_of',
 'winner_rank',
 'loser_rank',
 'winner_entry_LL',
 'winner_entry_Q',
 'winner_entry_WC',
 'loser_entry_LL',
 'loser_entry_Q',
 'loser_entry_WC',
 'winner_is_seeded',
 'loser_is_seeded',
 'w_ace_avg',
 'l_ace_avg',
 'w_CO_ace_avg',
 'l_CO_ace_avg',
 'w_df_avg',
 'l_df_avg',
 'w_CO_df_avg',
 'l_CO_df_avg',
 'w_2ndIn_avg',
 'l_2ndIn_avg',
 'w_CO_2ndIn_avg',
 'l_CO_2ndIn_avg',
 'winner_1st_serve_in_pct_avg',
 'loser_1st_serve_in_pct_avg',
 'winner_CO_1st_serve_in_pct_avg',
 'loser_CO_1st_serve_in_pct_avg',
 'winner_1st_serve_win_pct_avg',
 'loser_1st_serve_win_pct_avg',
 'winner_CO_1st_serve_win_pct_avg',
 'loser_CO_1st_serve_win_pct_avg',
 'winner_2nd_serve_in_pct_avg',
 'loser_2nd_serve_in_pct_avg',
 'winner_CO_2nd_serve_in_pct_avg',
 'loser_CO_2nd_serve_in_pct_avg',
 'winner_2nd_serve_win_pct_avg',
 'loser_2nd_serve_win_pct_avg

### Chcemy żeby kolumny nazywały się player1/player2 a nie winner/loser

In [51]:
def rename_player_columns(col):
    col_lower = col.lower()  # Convert to lowercase for consistent checks

    # Check if the column starts with "winner" or "w_"
    if col_lower.startswith('winner'):
        return col.replace('Winner', 'player1').replace('winner', 'player1')

    # Check if the column starts with "loser" or "l_"
    elif col_lower.startswith('loser'):
        return col.replace('Loser', 'player2').replace('loser', 'player2')

    # Check if the column ends with "winner"
    elif col_lower.endswith('winner'):
        return f"player1_{col.replace('_winner', '').replace('Winner', '').replace('winner', '')}".strip('_')

    # Check if the column ends with "loser"
    elif col_lower.endswith('loser'):
        return f"player2_{col.replace('_loser', '').replace('Loser', '').replace('loser', '')}".strip('_')

    return col  # Keep other columns unchanged


# Apply renaming
matches.columns = [rename_player_columns(col) for col in list(matches.columns)]

# Create a DataFrame to showcase the new column names
print(list(matches.columns))

['Date', 'tournament_level', 'outdoor', 'AvgW', 'AvgL', 'match_id', 'player1_right_handed', 'player1_age', 'player2_right_handed', 'player2_age', 'best_of', 'player1_rank', 'player2_rank', 'player1_entry_LL', 'player1_entry_Q', 'player1_entry_WC', 'player2_entry_LL', 'player2_entry_Q', 'player2_entry_WC', 'player1_is_seeded', 'player2_is_seeded', 'w_ace_avg', 'l_ace_avg', 'w_CO_ace_avg', 'l_CO_ace_avg', 'w_df_avg', 'l_df_avg', 'w_CO_df_avg', 'l_CO_df_avg', 'w_2ndIn_avg', 'l_2ndIn_avg', 'w_CO_2ndIn_avg', 'l_CO_2ndIn_avg', 'player1_1st_serve_in_pct_avg', 'player2_1st_serve_in_pct_avg', 'player1_CO_1st_serve_in_pct_avg', 'player2_CO_1st_serve_in_pct_avg', 'player1_1st_serve_win_pct_avg', 'player2_1st_serve_win_pct_avg', 'player1_CO_1st_serve_win_pct_avg', 'player2_CO_1st_serve_win_pct_avg', 'player1_2nd_serve_in_pct_avg', 'player2_2nd_serve_in_pct_avg', 'player1_CO_2nd_serve_in_pct_avg', 'player2_CO_2nd_serve_in_pct_avg', 'player1_2nd_serve_win_pct_avg', 'player2_2nd_serve_win_pct_avg', '

In [52]:
matches = matches.rename({"AvgW": 'player1_bet_odds', 'AvgL': 'player2_bet_odds'}, axis=1)

In [53]:
# Convert boolean columns to integers
matches = matches.astype({col: 'int64' for col in matches.select_dtypes(include=['bool']).columns})

### Tworzymy target który jest 1 gdy player1 wygrał i 0 wpp, i ustawiamy narazie wszystko na 1 bo winner został zmienionu na player1

In [54]:
matches["target"] = 1

  matches["target"] = 1


In [55]:
matches = matches.copy()

### Żeby nie było tak że zawsze to player1 jest wygranym to mieszamy w losowo wybranych rzędach staty pomiędzy player1 a player2 i w tych rzędach zmieniamy target z 1 na 0

In [56]:
import numpy as np

In [57]:
# Randomly select rows to swap
np.random.seed(42)  # For reproducibility
rows_to_swap = np.random.choice(matches.index, size=int(len(matches) * 0.5), replace=False)  # Swap 50% of rows

# Swap player1 and player2 stats and change the sign of difference columns for the selected rows
for col in matches.columns:
    if col.startswith('player1_'):
        corresponding_col = col.replace('player1_', 'player2_')
        matches.loc[rows_to_swap, [col, corresponding_col]] = matches.loc[rows_to_swap, [corresponding_col, col]].values
        
    if col.endswith('_diff'): 
        matches.loc[rows_to_swap, col] *= -1

# Update the target column for swapped rows
matches.loc[rows_to_swap, 'target'] = 0  # Player 2 is the winner


In [59]:
print(list(matches.columns))

['Date', 'tournament_level', 'outdoor', 'player1_bet_odds', 'player2_bet_odds', 'match_id', 'player1_right_handed', 'player1_age', 'player2_right_handed', 'player2_age', 'best_of', 'player1_rank', 'player2_rank', 'player1_entry_LL', 'player1_entry_Q', 'player1_entry_WC', 'player2_entry_LL', 'player2_entry_Q', 'player2_entry_WC', 'player1_is_seeded', 'player2_is_seeded', 'w_ace_avg', 'l_ace_avg', 'w_CO_ace_avg', 'l_CO_ace_avg', 'w_df_avg', 'l_df_avg', 'w_CO_df_avg', 'l_CO_df_avg', 'w_2ndIn_avg', 'l_2ndIn_avg', 'w_CO_2ndIn_avg', 'l_CO_2ndIn_avg', 'player1_1st_serve_in_pct_avg', 'player2_1st_serve_in_pct_avg', 'player1_CO_1st_serve_in_pct_avg', 'player2_CO_1st_serve_in_pct_avg', 'player1_1st_serve_win_pct_avg', 'player2_1st_serve_win_pct_avg', 'player1_CO_1st_serve_win_pct_avg', 'player2_CO_1st_serve_win_pct_avg', 'player1_2nd_serve_in_pct_avg', 'player2_2nd_serve_in_pct_avg', 'player1_CO_2nd_serve_in_pct_avg', 'player2_CO_2nd_serve_in_pct_avg', 'player1_2nd_serve_win_pct_avg', 'player2_2

In [34]:
matches.to_csv("../data/matches.csv", index=False)