In [12]:
import warnings
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import time

warnings.filterwarnings('ignore')

df = pd.read_csv('../db/out/wta_s.csv')

df_2023 = df[df['date'].str.contains(
    '2023', na=False) | df['date'].str.contains('2022', na=False)]

df.drop(df_2023.index, inplace=True)

non_numeric_columns = df.select_dtypes(['object']).columns

df_numeric_only = df.drop(non_numeric_columns, axis=1)

df = df_numeric_only

df = df.drop(['match_id'], axis=1)

df = df.fillna(df.median())

df = df.fillna(df.median())


Y = pd.DataFrame(df['y'])
df = df.drop(['y'], axis=1)
X = df
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=45, stratify=Y)

non_numeric_columns = df_2023.select_dtypes(['object']).columns

df_numeric_only = df_2023.drop(non_numeric_columns, axis=1)

df_2023 = df_numeric_only

df_2023 = df_2023.drop(['match_id'], axis=1)

df_2023 = df_2023.fillna(df_2023.median())

y_2023 = pd.DataFrame(df_2023['y'])
x_2023 = df_2023.drop(['y'], axis=1)

In [13]:
# Gradient Boosting Basic parameters

from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

model = HistGradientBoostingClassifier(random_state=45)

result = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

basic_report = classification_report(y_test, y_pred, output_dict=True)
print("Trening")
print(basic_report)
basic_pre = basic_report['weighted avg']['precision']


pred_val = model.predict(x_2023)
basic_report = classification_report(y_2023, pred_val, output_dict=True)
print("Walidacja")
print(basic_report)
basic_pre_val = basic_report['weighted avg']['precision']

Trening
{'0': {'precision': 0.6767270288397049, 'recall': 0.7046089385474861, 'f1-score': 0.6903865891207663, 'support': 1432.0}, '1': {'precision': 0.6969914040114613, 'recall': 0.6687285223367697, 'f1-score': 0.682567520168362, 'support': 1455.0}, 'accuracy': 0.686525805334257, 'macro avg': {'precision': 0.6868592164255831, 'recall': 0.686668730442128, 'f1-score': 0.6864770546445642, 'support': 2887.0}, 'weighted avg': {'precision': 0.6869399370055884, 'recall': 0.686525805334257, 'f1-score': 0.6864459083705937, 'support': 2887.0}}
Walidacja
{'0': {'precision': 0.6473719228210246, 'recall': 0.6794692737430168, 'f1-score': 0.6630323679727428, 'support': 1432.0}, '1': {'precision': 0.6605029585798816, 'recall': 0.6275474349964862, 'f1-score': 0.6436036036036036, 'support': 1423.0}, 'accuracy': 0.6535901926444834, 'macro avg': {'precision': 0.6539374407004531, 'recall': 0.6535083543697515, 'f1-score': 0.6533179857881732, 'support': 2855.0}, 'weighted avg': {'precision': 0.65391674379645

In [14]:
# Gradient Boosting Hyperparameter Tuning
model = HistGradientBoostingClassifier(random_state=45)

param_grid = {
    'learning_rate': list(np.arange(0.1, 1.1, 0.1)),
    'n_estimators': list(range(100, 180, 10)),
    'criterion': ['friedman_mse', 'squared_error'],
    'loss': ['log_loss', 'exponential'],
}


search = GridSearchCV(model, param_grid,
                      cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

result = search.fit(X_train, y_train)

print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Score: 0.6734019730397548
Best Hyperparameters: {'learning_rate': 0.1}


In [4]:
# Gradient Boosting with RFECV

model = GradientBoostingClassifier(
    loss='log_loss', learning_rate=0.1, n_estimators=110, criterion='friedman_mse')

cv = StratifiedKFold(3)

rfecv = RFECV(model, cv=cv, scoring='accuracy', step=1)

rfecv.fit(X_train, y_train)

y_pred = rfecv.predict(X_test)

print('Optimal number of features : %d' % rfecv.n_features_)

ranks = pd.DataFrame(
    rfecv.ranking_, index=X.columns, columns=['Rank'])

print(ranks.sort_values(by='Rank', ascending=True))

selected_features = ranks[ranks['Rank'] == 1].index.values.tolist()

opti_report = classification_report(y_test, y_pred, output_dict=True)
print("Trening")
opti_pre = opti_report['weighted avg']['precision']
opti_f1 = opti_report['weighted avg']['f1-score']
opti_auc = metrics.roc_auc_score(y_test, model.predict_proba(X_test)[::, 1])


pred_val = model.predict(x_2023)
opti_val_report = classification_report(y_2023, pred_val, output_dict=True)
print("Walidacja")
print(opti_val_report)
opti_pre_val = opti_val_report['weighted avg']['precision']
opti_f1_val = opti_val_report['weighted avg']['f1-score']
opti_auc_val = metrics.roc_auc_score(
    y_2023, model.predict_proba(x_2023)[::, 1])

Optimal number of features : 21
                             Rank
games_played                    1
completeness                    1
serve_points_won                1
points_on_return                1
aceDf                           1
double_fault_probability        1
second_won_serve                1
first_won_serve                 1
bp_factor                       1
surface_wins                    1
win_percentage                  1
total_serve_points              1
ace_probability                 1
glicko                          1
rank_points                     1
rank                            1
aceDf_vs                        1
double_fault_probability_vs     1
ace_probability_vs              1
first_won_serve_vs              1
bp_factor_vs                    1
elo                             2
win_percentage_vs               3
ht                              4
second_won_serve_vs             5
Trening


NotFittedError: This GradientBoostingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [10]:
# rfecv.fit(X_train, y_train)

# y_pred = rfecv.predict(X_test)


opti_report = classification_report(y_test, y_pred, output_dict=True)
print("Trening")
opti_pre = opti_report['weighted avg']['precision']
opti_f1 = opti_report['weighted avg']['f1-score']
opti_auc = metrics.roc_auc_score(y_test, rfecv.predict_proba(X_test)[::, 1])


pred_val = rfecv.predict(x_2023)
opti_val_report = classification_report(y_2023, pred_val, output_dict=True)
print("Walidacja")
print(opti_val_report)
opti_pre_val = opti_val_report['weighted avg']['precision']
opti_f1_val = opti_val_report['weighted avg']['f1-score']
opti_auc_val = metrics.roc_auc_score(
    y_2023, rfecv.predict_proba(x_2023)[::, 1])

Trening
Walidacja
{'0': {'precision': 0.6510067114093959, 'recall': 0.6773743016759777, 'f1-score': 0.6639288158795347, 'support': 1432.0}, '1': {'precision': 0.6615384615384615, 'recall': 0.634574841883345, 'f1-score': 0.6477761836441894, 'support': 1423.0}, 'accuracy': 0.6560420315236427, 'macro avg': {'precision': 0.6562725864739287, 'recall': 0.6559745717796613, 'f1-score': 0.655852499761862, 'support': 2855.0}, 'weighted avg': {'precision': 0.6562559865175082, 'recall': 0.6560420315236427, 'f1-score': 0.6558779592522505, 'support': 2855.0}}


In [11]:
import json

filename = 'results.json'

final_results = {
    "prediction_basic": basic_pre,
    "v_prcision_basic": basic_pre_val,
    "prediction_optimized": opti_pre,
    "v_prcision_optimized": opti_pre_val,
    "f1_score_opt": opti_f1,
    "f1_score_opt_val": opti_f1_val,
    "auc_opt": float(opti_auc),
    "auc_opt_val": float(opti_auc_val),
    "selected_features": selected_features,
}


with open(filename, 'r') as file:
    data = json.load(file)


data['Wzmocnienie gradientowe'] = (final_results)


with open('results.json', 'w') as file:
    json.dump(data, file, indent=4)