In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import kstest
import joblib
import json
from sklearn.metrics import accuracy_score, brier_score_loss

In [88]:
# Load the raw match table. Later cells convert `Date` to datetime, collapse the
# player1_/player2_ column pairs into differences and drop unused columns.
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [89]:
# Saved feature selectors for the two models ("nested" and "cros").
# They are used further down as positional column selectors via `.iloc[:, ...]`.
best_features_log_nested = np.load("../logistic_regression/best_models/best_features_log_nested.npy")
best_features_log_cros = np.load("../logistic_regression/best_models/best_features_log_cros.npy")

# Fitted estimators matching the selectors above.
# NOTE(review): joblib.load unpickles arbitrary objects; only load .pkl files from a trusted source.
best_model_log_nested = joblib.load("../logistic_regression/best_models/best_model_log_nested.pkl")
best_model_log_cros = joblib.load("../logistic_regression/best_models/best_model_log_cros.pkl")

In [None]:
# Parse the `Date` column once, store it back on the frame, and preview the
# first rows and the resulting dtype as a sanity check.
parsed_dates = pd.to_datetime(matches['Date'])
matches['Date'] = parsed_dates
print(parsed_dates.head())
print(parsed_dates.dtype)

In [None]:
def combine_player_columns(df):
    """Replace matching ``player1_*`` / ``player2_*`` column pairs with their difference.

    For every column named ``player1_<suffix>`` that has a ``player2_<suffix>``
    counterpart, a column ``diff_<suffix>`` holding ``player1 - player2`` is
    written to a copy of ``df`` and both source columns are removed from that copy.
    ``player1_*`` columns without a counterpart are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the paired columns replaced by ``diff_*`` columns.
    """
    first_prefix = 'player1_'
    result = df.copy()
    consumed = []  # source columns that have been folded into a diff_ column

    first_player_columns = [name for name in df.columns if name.startswith(first_prefix)]
    for first_name in first_player_columns:
        suffix = first_name[len(first_prefix):]
        second_name = f'player2_{suffix}'
        if second_name not in df.columns:
            # No counterpart for the second player: keep the column untouched.
            continue

        result[f'diff_{suffix}'] = df[first_name] - df[second_name]
        consumed.extend((first_name, second_name))

    return result.drop(columns=consumed)


# Collapse the player1_/player2_ pairs into diff_ columns and list the resulting schema.
matches = combine_player_columns(matches)

print("Columns after combining:", matches.columns, sep="\n")

In [92]:
# Remove the columns that are excluded from the feature matrix built below.
# Note: DataFrame.drop raises KeyError for missing labels by default, so this
# cell fails if it is re-run on an already reduced `matches`.
matches = matches.drop(
    columns=[
        'non_CO_uncertainty', 'CO_uncertainty',
        "outdoor", "tournament_level", "best_of", "Round_Num",
        "Surface_Clay", "Surface_Grass", "Surface_Hard",
        "temperature_2m", "relative_humidity_2m", "windspeed_10m", "apparent_temperature",
        "diff_right_handed", "diff_bet_odds",
        "w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg",
        "w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg",
        "w_2ndIn_avg", "l_2ndIn_avg", "w_CO_2ndIn_avg", "l_CO_2ndIn_avg",
    ]
)

In [93]:
def scale_features(X, fit_on=None):
    """Scale every column of ``X`` independently and return the result as a new frame.

    For each column a one-sample Kolmogorov-Smirnov test against ``'norm'`` is
    run on the reference data. Columns with a p-value above 0.05 are scaled
    with ``StandardScaler``; all other columns are scaled with ``MinMaxScaler``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are scaled. It is not modified.
    fit_on : pandas.DataFrame, optional
        Frame on which the KS test is run and the scalers are fitted (for
        example the training data). It must contain every column of ``X``.
        When omitted, ``X`` itself is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which every column holds the scaled values.

    Notes
    -----
    * With ``fit_on=None`` the scaling statistics come from the data being
      scaled, so different subsets of the same data are scaled differently.
      Pass the frame the model was trained on as ``fit_on`` to apply one
      consistent transformation.
    * ``kstest(values, 'norm')`` without ``args`` compares against the standard
      normal distribution (mean 0, standard deviation 1), not against a normal
      distribution fitted to the column.
      NOTE(review): confirm this is the intended criterion; it is kept
      unchanged here so scaler selection stays identical to the original.
    """
    reference = X if fit_on is None else fit_on
    scaled_X = X.copy()
    for col in X.columns:
        if kstest(reference[col], 'norm').pvalue > 0.05:
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()
        # fit + transform on the same frame is equivalent to fit_transform.
        scaler.fit(reference[[col]])
        scaled_X[col] = scaler.transform(X[[col]])
    return scaled_X

## Wyniki dla wszystkich danych zbioru testowego

In [94]:
# The test set is every match dated in 2023.
test_data = matches.loc[matches["Date"].dt.year.eq(2023)]

In [95]:
# Labels first, then the feature matrix: everything except the label, the date and the match id.
y_test_data = test_data['target']
X_test_data = test_data.drop(['target', 'Date', 'match_id'], axis="columns")

In [96]:
# Scale the test features column by column. scale_features fits its scalers on
# the frame it receives, i.e. on this 2023 test frame itself.
# NOTE(review): confirm this matches the scaling the loaded models were trained with.
X_scaled = scale_features(X_test_data)

In [97]:
# Keep only the columns selected for the "nested" model. The selection is
# positional, so it relies on the column order of X_scaled.
X_selected_nested = X_scaled.iloc[:, best_features_log_nested]

In [98]:
# Hard class predictions of the "nested" model on the full 2023 test set.
y_pred_nested = best_model_log_nested.predict(X_selected_nested)

In [99]:
# Column 1 of predict_proba; presumably P(target == 1). Confirm against best_model_log_nested.classes_.
y_pred_nested_proba = best_model_log_nested.predict_proba(X_selected_nested)[:,1]

In [None]:
# "nested" model, full 2023 test set: accuracy uses the hard labels, the other
# two scores use the predicted probabilities.
for label, metric, prediction in (
    ("Accuracy: ", accuracy_score, y_pred_nested),
    ("Brier score: ", brier_score_loss, y_pred_nested_proba),
    ("Log loss: ", log_loss, y_pred_nested_proba),
):
    print(label, metric(y_test_data, prediction))

## Wyniki dla zawodników z pierwszej 50 rankingu

In [103]:
# Rebuild the evaluation frame from the raw CSV, restricted to matches in which
# BOTH players are ranked within the top 50, then keep the 2023 matches as test data.
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")
matches['Date'] = pd.to_datetime(matches['Date'])
# "Top 50" includes rank 50 itself, hence <= rather than <.
# The rank filter has to run before combine_player_columns, which replaces
# player1_rank / player2_rank with a single diff_rank column.
matches = matches[(matches["player1_rank"] <= 50) & (matches["player2_rank"] <= 50)]
matches = combine_player_columns(matches)
matches = matches.drop(
    columns=[
        'non_CO_uncertainty', 'CO_uncertainty',
        "outdoor", "tournament_level", "best_of", "Round_Num",
        "Surface_Clay", "Surface_Grass", "Surface_Hard",
        "temperature_2m", "relative_humidity_2m", "windspeed_10m", "apparent_temperature",
        "diff_right_handed", "diff_bet_odds",
        "w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg",
        "w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg",
        "w_2ndIn_avg", "l_2ndIn_avg", "w_CO_2ndIn_avg", "l_CO_2ndIn_avg",
    ]
)
test_data_rank_50 = matches[matches["Date"].dt.year == 2023]

In [104]:
# Labels first, then the feature matrix for the rank-filtered test set
# (everything except the label, the date and the match id).
y_test_data_rank_50 = test_data_rank_50['target']
X_test_data_rank_50 = test_data_rank_50.drop(['target', 'Date', 'match_id'], axis="columns")

In [105]:
# Scale the rank-filtered features. scale_features fits its scalers on the frame
# it receives, so the statistics here come from this subset only.
# NOTE(review): confirm this matches the scaling the loaded models were trained with.
X_scaled_rank_50 = scale_features(X_test_data_rank_50)

In [106]:
# Positional selection of the "nested" model's columns; relies on the column order of X_scaled_rank_50.
X_selected_rank_50_nested = X_scaled_rank_50.iloc[:, best_features_log_nested]

In [107]:
# Hard class predictions of the "nested" model on the rank-filtered test set.
y_pred_nested_rank_50 = best_model_log_nested.predict(X_selected_rank_50_nested)

In [108]:
# Column 1 of predict_proba; presumably P(target == 1). Confirm against best_model_log_nested.classes_.
y_pred_nested_proba_rank_50 = best_model_log_nested.predict_proba(X_selected_rank_50_nested)[:,1]

In [None]:
# "nested" model, rank-filtered test set: accuracy uses the hard labels, the
# other two scores use the predicted probabilities.
for label, metric, prediction in (
    ("Accuracy: ", accuracy_score, y_pred_nested_rank_50),
    ("Brier score: ", brier_score_loss, y_pred_nested_proba_rank_50),
    ("Log loss: ", log_loss, y_pred_nested_proba_rank_50),
):
    print(label, metric(y_test_data_rank_50, prediction))

## 50% meczów o najniższej niepewności (CO_uncertainty)

In [111]:
# Rebuild the evaluation frame from the raw CSV: take the 2023 matches, keep the
# half with the lowest CO_uncertainty, then keep only matches in which both
# players are ranked within the top 50.
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")
matches['Date'] = pd.to_datetime(matches['Date'])
matches = matches[matches["Date"].dt.year == 2023]
# Both filters below must run before CO_uncertainty and the player rank columns
# are removed further down.
matches = matches.nsmallest(int(len(matches) * 0.5), 'CO_uncertainty')
# "Top 50" includes rank 50 itself, hence <= rather than <.
# NOTE(review): the section heading mentions only uncertainty; confirm that this
# rank filter is intended here and is not a copy-paste leftover.
matches = matches[(matches["player1_rank"] <= 50) & (matches["player2_rank"] <= 50)]
matches = combine_player_columns(matches)
matches = matches.drop(
    columns=[
        'non_CO_uncertainty', 'CO_uncertainty',
        "outdoor", "tournament_level", "best_of", "Round_Num",
        "Surface_Clay", "Surface_Grass", "Surface_Hard",
        "temperature_2m", "relative_humidity_2m", "windspeed_10m", "apparent_temperature",
        "diff_right_handed", "diff_bet_odds",
        "w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg",
        "w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg",
        "w_2ndIn_avg", "l_2ndIn_avg", "w_CO_2ndIn_avg", "l_CO_2ndIn_avg",
    ]
)
test_certain_data = matches

In [112]:
# Labels first, then the feature matrix for the low-uncertainty test set
# (everything except the label, the date and the match id).
y_test_certain_data = test_certain_data['target']
X_test_certain_data = test_certain_data.drop(['target', 'Date', 'match_id'], axis="columns")

In [113]:
# Scale the low-uncertainty features. scale_features fits its scalers on the
# frame it receives, so the statistics here come from this subset only.
# NOTE(review): confirm this matches the scaling the loaded models were trained with.
X_scaled_certain = scale_features(X_test_certain_data)

In [114]:
# Positional selection of the "nested" model's columns; relies on the column order of X_scaled_certain.
X_selected_certain_nested = X_scaled_certain.iloc[:, best_features_log_nested]

In [115]:
# Hard class predictions of the "nested" model on the low-uncertainty test set.
y_pred_nested_certain = best_model_log_nested.predict(X_selected_certain_nested)

In [116]:
# Column 1 of predict_proba; presumably P(target == 1). Confirm against best_model_log_nested.classes_.
y_pred_nested_proba_certain = best_model_log_nested.predict_proba(X_selected_certain_nested)[:,1]

In [None]:
# "nested" model, low-uncertainty test set: accuracy uses the hard labels, the
# other two scores use the predicted probabilities.
for label, metric, prediction in (
    ("Accuracy: ", accuracy_score, y_pred_nested_certain),
    ("Brier score: ", brier_score_loss, y_pred_nested_proba_certain),
    ("Log loss: ", log_loss, y_pred_nested_proba_certain),
):
    print(label, metric(y_test_certain_data, prediction))

## Wyniki dla modelu cros (`best_model_log_cros`) – wszystkie dane zbioru testowego

In [None]:
# Rebuild the unfiltered 2023 test frame from the raw CSV.
# NOTE(review): the "cros" cells that follow read X_scaled and y_test_data, which
# were computed earlier in the notebook. Nothing below reads the frame rebuilt
# here, so this cell looks redundant.
matches = (
    pd.read_csv("../../preparation_before_models/data/matches.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .loc[lambda frame: frame["Date"].dt.year == 2023]
    .pipe(combine_player_columns)
    .drop(
        columns=[
            'non_CO_uncertainty', 'CO_uncertainty',
            "outdoor", "tournament_level", "best_of", "Round_Num",
            "Surface_Clay", "Surface_Grass", "Surface_Hard",
            "temperature_2m", "relative_humidity_2m", "windspeed_10m", "apparent_temperature",
            "diff_right_handed", "diff_bet_odds",
            "w_ace_avg", "l_ace_avg", "w_CO_ace_avg", "l_CO_ace_avg",
            "w_df_avg", "l_df_avg", "w_CO_df_avg", "l_CO_df_avg",
            "w_2ndIn_avg", "l_2ndIn_avg", "w_CO_2ndIn_avg", "l_CO_2ndIn_avg",
        ]
    )
)
test_data = matches

In [118]:
# Columns selected for the "cros" model, taken positionally from X_scaled
# (the scaled full 2023 test set computed earlier in the notebook).
X_selected_cros = X_scaled.iloc[:, best_features_log_cros]

In [119]:
# Hard class predictions of the "cros" model on the full 2023 test set.
y_pred_cros = best_model_log_cros.predict(X_selected_cros)

In [120]:
# Column 1 of predict_proba; presumably P(target == 1). Confirm against best_model_log_cros.classes_.
y_pred_cros_proba = best_model_log_cros.predict_proba(X_selected_cros)[:,1]

In [None]:
# "cros" model, full 2023 test set: accuracy uses the hard labels, the other
# two scores use the predicted probabilities.
for label, metric, prediction in (
    ("Accuracy: ", accuracy_score, y_pred_cros),
    ("Brier score: ", brier_score_loss, y_pred_cros_proba),
    ("Log loss: ", log_loss, y_pred_cros_proba),
):
    print(label, metric(y_test_data, prediction))

## Model cros – zawodnicy z pierwszej 50 rankingu

In [122]:
# Columns selected for the "cros" model, taken positionally from the scaled
# rank-filtered test set computed earlier in the notebook.
X_selected_cros_rank_50 = X_scaled_rank_50.iloc[:, best_features_log_cros]

In [123]:
# Hard class predictions of the "cros" model on the rank-filtered test set.
y_pred_cros_rank_50 = best_model_log_cros.predict(X_selected_cros_rank_50)

In [124]:
# Column 1 of predict_proba; presumably P(target == 1). Confirm against best_model_log_cros.classes_.
y_pred_cros_proba_rank_50 = best_model_log_cros.predict_proba(X_selected_cros_rank_50)[:,1]

In [None]:
# "cros" model, rank-filtered test set: accuracy uses the hard labels, the
# other two scores use the predicted probabilities.
for label, metric, prediction in (
    ("Accuracy: ", accuracy_score, y_pred_cros_rank_50),
    ("Brier score: ", brier_score_loss, y_pred_cros_proba_rank_50),
    ("Log loss: ", log_loss, y_pred_cros_proba_rank_50),
):
    print(label, metric(y_test_data_rank_50, prediction))

## Model cros – 50% meczów o najniższej niepewności

In [126]:
# Columns selected for the "cros" model, taken positionally from the scaled
# low-uncertainty test set computed earlier in the notebook.
X_selected_cros_certain = X_scaled_certain.iloc[:, best_features_log_cros]

In [127]:
# Hard class predictions of the "cros" model on the low-uncertainty test set.
y_pred_cros_certain = best_model_log_cros.predict(X_selected_cros_certain)

In [128]:
# Column 1 of predict_proba; presumably P(target == 1). Confirm against best_model_log_cros.classes_.
y_pred_cros_proba_certain = best_model_log_cros.predict_proba(X_selected_cros_certain)[:,1]

In [None]:
# "cros" model, low-uncertainty test set: accuracy uses the hard labels, the
# other two scores use the predicted probabilities.
for label, metric, prediction in (
    ("Accuracy: ", accuracy_score, y_pred_cros_certain),
    ("Brier score: ", brier_score_loss, y_pred_cros_proba_certain),
    ("Log loss: ", log_loss, y_pred_cros_proba_certain),
):
    print(label, metric(y_test_certain_data, prediction))