In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the match-level dataset; the path is relative to the notebook's working directory.
matches = pd.read_csv("../../preparation_before_models/data/matches.csv")

In [3]:
def combine_player_columns(df):
    """
    Replace each player1_*/player2_* column pair with a single diff_* column.

    For every column named ``player1_<suffix>`` that has a matching
    ``player2_<suffix>`` column, a ``diff_<suffix>`` column holding
    ``player1 - player2`` is added and both source columns are removed.
    Columns without a counterpart are kept unchanged. The input frame is
    not modified; a new DataFrame is returned.
    """
    p1_prefix = 'player1_'

    # First pass: find every complete (player1, player2) pair and its suffix.
    matched = []
    for name in df.columns:
        if not name.startswith(p1_prefix):
            continue
        suffix = name[len(p1_prefix):]
        counterpart = f'player2_{suffix}'
        if counterpart in df.columns:
            matched.append((name, counterpart, suffix))

    # Second pass: append the difference columns to a copy of the input.
    result = df.copy()
    for first, second, suffix in matched:
        result[f'diff_{suffix}'] = df[first] - df[second]

    # The raw per-player columns are now represented by the differences.
    obsolete = list(dict.fromkeys(
        column for first, second, _ in matched for column in (first, second)
    ))
    return result.drop(columns=obsolete)

# Apply the pairwise-difference transform to the loaded matches
combined_matches = combine_player_columns(matches)

print(f"Original DataFrame shape: {matches.shape}")
print(f"Combined DataFrame shape: {combined_matches.shape}")
print("Columns after combining:")
print(combined_matches.columns)
# Rebind `matches` to the combined frame; later cells work on the diff_* features.
# NOTE: re-running this cell will report the already-combined shape as "Original".
matches = combined_matches

Original DataFrame shape: (14111, 83)
Combined DataFrame shape: (14111, 48)
Columns after combining:
Index(['outdoor', 'tournament_level', 'best_of', 'num_CO_matches', 'Round_Num',
       'temperature_2m', 'relative_humidity_2m', 'windspeed_10m',
       'apparent_temperature', 'Surface_Clay', 'Surface_Grass', 'Surface_Hard',
       'target', 'diff_bet_odds', 'diff_right_handed', 'diff_age', 'diff_rank',
       'diff_entry_LL', 'diff_entry_Q', 'diff_entry_WC', 'diff_is_seeded',
       'diff_CO_1st_serve_in_pct_avg', 'diff_1st_serve_in_pct_avg',
       'diff_CO_1st_serve_win_pct_avg', 'diff_1st_serve_win_pct_avg',
       'diff_CO_2nd_serve_win_pct_avg', 'diff_2nd_serve_win_pct_avg',
       'diff_CO_serve_games_win_pct_avg', 'diff_serve_games_win_pct_avg',
       'diff_CO_ace_avg', 'diff_ace_avg', 'diff_CO_df_avg', 'diff_df_avg',
       'diff_CO_1st_serve_return_win_pct_avg',
       'diff_1st_serve_return_win_pct_avg', 'diff_elo', 'diff_surface_elo',
       'diff_blended_elo', 'diff_fatig

### Pomysł na usuwanie skorelowanych kolumn: przyniósł gorsze wyniki

In [4]:
# def remove_correlated_features(data, target_column, threshold=0.9):
#     correlation_matrix = data.corr()
# 
#     # Correlation of each feature with the target
#     target_corr = correlation_matrix[target_column].drop(index=target_column)
# 
#     # Find highly correlated feature pairs
#     corr_pairs = correlation_matrix.abs().unstack().sort_values(ascending=False).reset_index()
#     corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
# 
#     # Filter for high correlations (above threshold)
#     high_corr_pairs = corr_pairs[
#         (corr_pairs["Correlation"] > threshold) & 
#         (corr_pairs["Feature 1"] != corr_pairs["Feature 2"])
#     ]
# 
#     # Remove duplicates (keep one direction of each pair)
#     high_corr_pairs = high_corr_pairs[
#         high_corr_pairs["Feature 1"] < high_corr_pairs["Feature 2"]
#     ]
# 
#     # Features to drop
#     to_drop = set()
#     for _, row in high_corr_pairs.iterrows():
#         feature1, feature2 = row["Feature 1"], row["Feature 2"]
#         # Retain the feature with higher correlation to the target
#         if abs(target_corr[feature1]) >= abs(target_corr[feature2]):
#             to_drop.add(feature2)
#         else:
#             to_drop.add(feature1)
# 
#     # Return DataFrame with reduced features
#     reduced_data = data.drop(columns=to_drop)
#     return reduced_data, list(to_drop)
# 
# threshold = 0.9
# target_column = "target"  
# reduced_data, dropped_features = remove_correlated_features(matches, target_column, threshold)
# 
# print(f"Dropped features: {dropped_features}")
# print(f"Reduced DataFrame shape: {reduced_data.shape}")


In [5]:
# reduced_data = matches.drop(columns=["diff_rank", "diff_elo", "diff_surface_elo"], axis=1)

### Próba zrobienia czegoś z CO i nie-CO; to jest w ogóle bez sensu, ale można jeszcze pomyśleć, co z tym zrobić

In [6]:
# import pandas as pd
# 
# def handle_co_stats(df):
#     co_columns = [
#         'CO_1st_serve_in_p_diff', '1st_serve_in_p_diff',
#         'CO_1st_serve_win_p_diff', '1st_serve_win_p_diff',
#         'CO_2nd_serve_win_p_diff', '2nd_serve_win_p_diff',
#         'CO_serve_games_win_p_diff', 'serve_games_win_p_diff',
#         'CO_avg_ace_per_match_diff', 'avg_ace_per_match_diff',
#         'CO_avg_df_per_match_diff', 'avg_df_per_match_diff'
#     ]
# 
#     # Process pairs of CO and non-CO stats
#     adjusted_columns = {}
#     for i in range(0, len(co_columns), 2):
#         co_col = co_columns[i]
#         non_co_col = co_columns[i + 1]
#         adjusted_col_name = f"adjusted_{non_co_col}"
# 
#         # Calculate adjusted stats
#         df[adjusted_col_name] = df['CO_active'] * (df[co_col] - df[non_co_col]) + df[non_co_col]
# 
#         # Track original columns to drop
#         adjusted_columns[co_col] = adjusted_col_name
#         adjusted_columns[non_co_col] = adjusted_col_name
# 
#     # Drop original CO and non-CO columns
#     df = df.drop(columns=list(adjusted_columns.keys()))
# 
#     return df
# 
# # Apply the transformation
# reduced_data = handle_co_stats(reduced_data)
# 
# print(f"Adjusted DataFrame shape: {reduced_data.shape}")
# print("Adjusted columns:")
# print(reduced_data.columns)

In [7]:
# reduced_data = reduced_data.drop(columns=["CO_active"])

In [10]:
# Inspect the columns available after the player1/player2 pairs were combined
matches.columns

Index(['outdoor', 'tournament_level', 'best_of', 'num_CO_matches', 'Round_Num',
       'temperature_2m', 'relative_humidity_2m', 'windspeed_10m',
       'apparent_temperature', 'Surface_Clay', 'Surface_Grass', 'Surface_Hard',
       'target', 'diff_bet_odds', 'diff_right_handed', 'diff_age', 'diff_rank',
       'diff_entry_LL', 'diff_entry_Q', 'diff_entry_WC', 'diff_is_seeded',
       'diff_CO_1st_serve_in_pct_avg', 'diff_1st_serve_in_pct_avg',
       'diff_CO_1st_serve_win_pct_avg', 'diff_1st_serve_win_pct_avg',
       'diff_CO_2nd_serve_win_pct_avg', 'diff_2nd_serve_win_pct_avg',
       'diff_CO_serve_games_win_pct_avg', 'diff_serve_games_win_pct_avg',
       'diff_CO_ace_avg', 'diff_ace_avg', 'diff_CO_df_avg', 'diff_df_avg',
       'diff_CO_1st_serve_return_win_pct_avg',
       'diff_1st_serve_return_win_pct_avg', 'diff_elo', 'diff_surface_elo',
       'diff_blended_elo', 'diff_fatigue_score', 'diff_h2h_wins', 'diff_home',
       'diff_injury_score', 'diff_win_pct_last_10',
       '

### Splitowanie na test i train

In [25]:
from sklearn.model_selection import train_test_split


# Feature matrix: every column except the label and the columns excluded below.
X = matches.drop(
    columns=[
        'target',
        "outdoor",
        "tournament_level",
        "best_of",
        "Round_Num",
        "num_CO_matches",
        "Surface_Clay",
        "Surface_Grass",
        "Surface_Hard",
        "temperature_2m",
        "relative_humidity_2m",
        "windspeed_10m",
        "apparent_temperature",
        "diff_right_handed",
        "diff_bet_odds",
    ]
)
y = matches['target']

# Stratified 70/30 hold-out split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Report the resulting shapes as a sanity check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (9877, 33)
y_train shape: (9877,)
X_test shape: (4234, 33)
y_test shape: (4234,)


In [26]:
# Coerce every feature column to a numeric dtype; values that cannot be parsed become NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')
# A non-zero count here means missing values exist (pre-existing or introduced by the coercion)
print(X_train.isnull().sum().sum())  # Total NaN in X_train
print(X_test.isnull().sum().sum())  # Total NaN in X_test

print(X_train.dtypes)
print(X_test.dtypes)
# Reindex X_test's columns to X_train's column set and order (left join on the column axis)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

0
0
diff_age                                float64
diff_rank                               float64
diff_entry_LL                             int64
diff_entry_Q                              int64
diff_entry_WC                             int64
diff_is_seeded                            int64
diff_CO_1st_serve_in_pct_avg            float64
diff_1st_serve_in_pct_avg               float64
diff_CO_1st_serve_win_pct_avg           float64
diff_1st_serve_win_pct_avg              float64
diff_CO_2nd_serve_win_pct_avg           float64
diff_2nd_serve_win_pct_avg              float64
diff_CO_serve_games_win_pct_avg         float64
diff_serve_games_win_pct_avg            float64
diff_CO_ace_avg                         float64
diff_ace_avg                            float64
diff_CO_df_avg                          float64
diff_df_avg                             float64
diff_CO_1st_serve_return_win_pct_avg    float64
diff_1st_serve_return_win_pct_avg       float64
diff_elo                            

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV, RFE
import matplotlib.pyplot as plt

In [16]:
import warnings

# Clear every registered warning filter so that no category stays silenced
warnings.resetwarnings()

# Make sure scikit-learn's ConvergenceWarning is reported
# ("default" action: shown once per location that raises it)
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("default", ConvergenceWarning)

In [44]:
# Columns sent through StandardScaler by the preprocessing ColumnTransformer.
# Entries that are commented out are deliberately left out of this list.
Standarize_var = [
    'diff_total_wins_tournament_history',
    'diff_total_losses_tournament_history',
    'diff_fatigue_score',
    'diff_injury_score',
    'diff_1st_serve_return_win_pct_avg',
    'diff_CO_1st_serve_return_win_pct_avg',
    'diff_df_avg',
    'diff_CO_df_avg',
    'diff_ace_avg',
    'diff_CO_ace_avg',
    'diff_serve_games_win_pct_avg',
    'diff_CO_serve_games_win_pct_avg',
    'diff_1st_serve_win_pct_avg',
    'diff_CO_1st_serve_win_pct_avg',
    'diff_1st_serve_in_pct_avg',
    'diff_CO_1st_serve_in_pct_avg',
    'diff_Set_Diff_Tournament',
    'diff_Game_Diff_Tournament',
    # 'diff_right_handed',
    'diff_age',
    'diff_rank',
    # 'diff_entry_LL',
    # 'diff_entry_Q',
    # 'diff_entry_WC',
    # 'diff_is_seeded',
    'diff_h2h_wins',
    # 'diff_home',
    'diff_win_pct_last_10',
    'diff_win_pct_last_10_surface',
    'diff_elo',
    'diff_surface_elo',
    'diff_blended_elo',
    'diff_CO_2nd_serve_win_pct_avg',
    'diff_2nd_serve_win_pct_avg',
]


### Próbowane było PCA ale słabe wyniki

In [28]:
# pca = PCA()
# X_train_pca = pca.fit_transform(X_train)
# # skumulowany procent wariancji
# cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# # liczba komponentów, które wyjaśniają 95% wariancji
# optimal_components = np.argmax(cumulative_variance >= 0.95) + 1
# 
# print(f"Optimal number of components to explain 95% variance: {optimal_components}")
# 
# 
# plt.figure(figsize=(8, 6))
# plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
# plt.axvline(x=optimal_components, color='r', linestyle='--', label=f'{optimal_components} components')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('PCA Explained Variance')
# plt.legend()
# plt.show()

### Próbowane było RFE ale słabe wyniki

In [29]:
# base_model = LogisticRegression(random_state=42, max_iter=1000)
# base_model_2 = XGBClassifier(random_state=42)
# # RFECV
# rfecv = RFECV(estimator=base_model, step=1, cv=5, scoring='accuracy')  
# rfecv.fit(X_train_reg, y_train_reg)
# rfecv_2 = RFECV(estimator=base_model_2, step=1, cv=5, scoring='accuracy')
# rfecv_2.fit(X_train, y_train)
# 
# # optymalna dla regresji logistycznej
# optimal_features = rfecv.n_features_
# print(f"Optimal number of features: {optimal_features}")
# # optymalna dla xgboost
# optimal_features_2 = rfecv_2.n_features_
# print(f"Optimal number of features: {optimal_features_2}")
# 
# 
# plt.figure(figsize=(8, 6))
# plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), rfecv.cv_results_['mean_test_score'], marker='o', linestyle='--')
# plt.axvline(x=optimal_features, color='r', linestyle='--', label=f'{optimal_features} features')
# plt.xlabel('Number of Features Selected')
# plt.ylabel('Cross-Validation Score')
# plt.title('RFECV - Optimal Number of Features for Logistic Regression')
# plt.legend()
# plt.show()
# 
# plt.figure(figsize=(8, 6))
# plt.plot(range(1, len(rfecv_2.cv_results_['mean_test_score']) + 1), rfecv_2.cv_results_['mean_test_score'], marker='o', linestyle='--')
# plt.axvline(x=optimal_features_2, color='r', linestyle='--', label=f'{optimal_features_2} features')
# plt.xlabel('Number of Features Selected')
# plt.ylabel('Cross-Validation Score')
# plt.title('RFECV - Optimal Number of Features for XGBoost')
# plt.legend()
# plt.show()

### Stare próby puszczania modeli

In [30]:
# # RandomizedSearchCV RFE Logistic Regression
# rfe_logreg_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('feature_selection', RFE(estimator=LogisticRegression(max_iter=4000, random_state=42), n_features_to_select=optimal_features)),
#     ('classifier', LogisticRegression(max_iter=4000, random_state=42))
# ])
# 
# param_distributions_logreg = { 
#     'classifier__C': np.logspace(-1, 1, 5),                              
#     'classifier__penalty': ['l2'],
#     'classifier__solver': ['lbfgs', 'liblinear', 'saga']                                         
# }
# 
# random_logreg_search = RandomizedSearchCV(
#     rfe_logreg_pipeline, param_distributions_logreg, 
#     n_iter=200, cv=5, scoring='accuracy', random_state=42, n_jobs=-1
# )
# 
# 
# # RandomizedSearchCV RFE XGBoost
# rfe_xgb_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('feature_selection', RFE(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), n_features_to_select=optimal_features)),
#     ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
# ])
# 
# param_distributions_xgb = { 
#     'classifier__learning_rate': np.linspace(0.01, 0.2, 10),              
#     'classifier__max_depth': [3, 5, 7, 9],                               
#     'classifier__n_estimators': [50, 100, 200, 300]                      
# }
# 
# random_xgb_search = RandomizedSearchCV(
#     rfe_xgb_pipeline, param_distributions_xgb, 
#     n_iter=50, cv=5, scoring='accuracy', random_state=42
# )


In [31]:
# random_logreg_search.fit(X_train_reg, y_train_reg)
# 
# print(f"Best parameters for Logistic Regression with RFE: {random_logreg_search.best_params_}")
# print(f"Best Logistic Regression score: {random_logreg_search.best_score_}")
# print(f"Test score: {random_logreg_search.score(X_test_reg, y_test_reg)}")

In [32]:
# random_xgb_search.fit(X_train, y_train)
# 
# print(f"Best parameters for XGBoost with RFE: {random_xgb_search.best_params_}")
# print(f"Best XGBoost score: {random_xgb_search.best_score_}")
# print(f"Test score: {random_xgb_search.score(X_test, y_test)}")

## Aktualnie najlepszy model log reg

In [45]:
from sklearn.model_selection import  StratifiedKFold

# Preprocessing: standardize the columns listed in Standarize_var
preprocessor = ColumnTransformer(
    transformers=[
        ('standard_scaler', Pipeline([
            ('scaler', StandardScaler()),
        ]), Standarize_var),
    ],
    remainder='passthrough'  # Keep unprocessed features as-is
)

# Logistic regression pipeline: preprocessing followed by the classifier
# (there is no RFE / feature-selection step in this pipeline)
custom_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=4000, random_state=42, fit_intercept=False))
])

# Two sub-grids so that each penalty is only combined with the solvers listed for it.
# Despite the "distributions" name, this is an exhaustive grid: it is passed to GridSearchCV below.
param_distributions_logreg = [
    {
        'classifier__C': np.logspace(-2, 2, 50),
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs'],  # lbfgs supports only l2
        'classifier__class_weight': [None, 'balanced']
    },
    {
        'classifier__C': np.logspace(-2, 2, 50),
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],  # liblinear and saga support l1
        'classifier__class_weight': [None, 'balanced']
    }
]

# Exhaustive grid search (GridSearchCV, not RandomizedSearchCV) scored by accuracy
# over shuffled, stratified 10-fold cross-validation with a fixed seed
random_logreg_search = GridSearchCV(
    custom_pipeline, param_distributions_logreg, 
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), 
    scoring='accuracy', n_jobs=-1
)

In [46]:
# Run the grid search on the training split only
random_logreg_search.fit(X_train, y_train)

# best_params_ / best_score_: the winning parameter set and its mean cross-validated accuracy
print(f"Best parameters for Logistic Regression: {random_logreg_search.best_params_}")
print(f"Best Logistic Regression score: {random_logreg_search.best_score_}")
# Accuracy of the best estimator on the held-out test split
print(f"Test score: {random_logreg_search.score(X_test, y_test)}")

Best parameters for Logistic Regression: {'classifier__C': np.float64(0.054286754393238594), 'classifier__class_weight': 'balanced', 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Best Logistic Regression score: 0.6526274770395711
Test score: 0.6639111950873878
