In [None]:
import warnings
import json
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn import metrics
import numpy as np
import pandas as pd
import time

# Wall-clock reference taken before any data is loaded.
start_time = time.time()

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Raw input table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../db/out/wta_s.csv')

In [None]:
# Hold-out frame: every row whose 'date' text contains "2023" or "2022"
# (rows with a missing date are never selected). Despite its name,
# df_2023 therefore holds both years.
in_2023 = df['date'].str.contains('2023', na=False)
in_2022 = df['date'].str.contains('2022', na=False)
df_2023 = df[in_2023 | in_2022]

# Remove the hold-out rows from df in place so the two frames do not overlap.
df.drop(index=df_2023.index, inplace=True)

In [None]:
# Normalization, step 1: keep only the numeric columns of the training frame.
non_numeric_columns = df.select_dtypes(include=['object']).columns
df_numeric_only = df.drop(columns=non_numeric_columns)

# 'match_id' is dropped here, so it is never used as a feature.
df = df_numeric_only.drop(columns=['match_id'])

# Replace missing values with the median of their column.
df = df.fillna(df.median())


def normalize(dff):
    """Min-max scale every column of ``dff`` independently.

    Each column is mapped with ``(value - min) / (max - min)``, where min and
    max are taken from that same column of ``dff``.

    A constant column (``max == min``) is mapped to 0.0 rather than to
    ``0 / 0 = NaN``; missing values stay missing in every case.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame whose columns all support numeric subtraction and division.

    Returns
    -------
    pandas.DataFrame
        A scaled copy; ``dff`` itself is not modified.
    """
    result = dff.copy()
    for feature_name in dff.columns:
        column = dff[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        # A zero range would turn the whole column into NaN, and a later
        # median fill cannot repair an all-NaN column. Dividing by 1.0
        # instead yields 0.0 for every non-missing value.
        if pd.notna(value_range) and value_range == 0:
            value_range = 1.0
        result[feature_name] = (column - min_value) / value_range
    return result


# Scale the training frame, then fill any NaN left after scaling with the
# median of its (scaled) column.
df = normalize(df).pipe(lambda scaled: scaled.fillna(scaled.median()))

In [None]:
# Apply the numeric-only / drop-id preparation to the 2022/2023 hold-out frame.
non_numeric_columns = df_2023.select_dtypes(include=['object']).columns
df_numeric_only = df_2023.drop(columns=non_numeric_columns)
df_2023 = df_numeric_only.drop(columns=['match_id'])

# NOTE(review): normalize() takes min/max from the frame it is given, so the
# hold-out rows are scaled with their own statistics rather than with those
# of the training frame - confirm this is intended.
df_2023 = normalize(df_2023)

# Here missing values are filled after scaling, with per-column medians.
df_2023 = df_2023.fillna(df_2023.median())

# Target as a one-column frame; every remaining column is a feature.
y_2023 = df_2023['y'].to_frame()
x_2023 = df_2023.drop(columns=['y'])

In [None]:
# training and testing
from sklearn.model_selection import train_test_split

# Separate the target (kept as a one-column frame) from the features.
Y = df['y'].to_frame()
df = df.drop(columns=['y'])
X = df

# 90/10 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=45,
    stratify=Y,
)

In [None]:
# Logistic regression with default hyperparameters
from sklearn.linear_model import LogisticRegression

# Baseline model: only the seed and n_jobs are set explicitly.
model = LogisticRegression(random_state=45, n_jobs=-1)
result = model.fit(X_train, y_train)

# Report on the 10% test split (printed under the "Trening" heading).
y_pred = model.predict(X_test)
basic_report = classification_report(y_test, y_pred, output_dict=True)
print("Trening")
print(basic_report)
basic_pre = basic_report['weighted avg']['precision']

# Report on the 2022/2023 hold-out frame; basic_report is reused for it.
pred_val = model.predict(x_2023)
basic_report = classification_report(y_2023, pred_val, output_dict=True)
print("Walidacja")
print(basic_report)
basic_pre_val = basic_report['weighted avg']['precision']

# ROC AUC from the second predict_proba column, first on the test split,
# then on the hold-out frame.
test_scores = model.predict_proba(X_test)[:, 1]
auc_basic = metrics.roc_auc_score(y_test, test_scores)
print("AUC: ", auc_basic)

holdout_scores = model.predict_proba(x_2023)[:, 1]
auc_val = metrics.roc_auc_score(y_2023, holdout_scores)
print("AUC: ", auc_val)


# cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# # cnf_matrix

# disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix)
# disp.plot()
# plt.ylabel('Wartosci prawdziwe')
# plt.xlabel('Predykcje')
# plt.grid(False)
# plt.show()

# basic_report = classification_report(y_test, y_pred, output_dict=True)

# y_pred_proba = result.predict_proba(X_test)[::, 1]
# fpr_basic, tpr_basic, _ = metrics.roc_curve(y_test,  y_pred_proba)
# auc_basic = metrics.roc_auc_score(y_test, y_pred_proba)
# print("AUC: ", auc_basic)

In [None]:
print(basic_pre_val)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Grid search over the regularization strength C and the solver for an
# L1-penalized logistic regression.
model = LogisticRegression(penalty='l1', random_state=45, n_jobs=-1)

param_grid = {
    # C has to be strictly positive, so the grid starts at 1. The previous
    # range(0, 15) included C=0, and every fit for that candidate could
    # only fail.
    'C': list(range(1, 15, 1)),
    # Only these two solvers are searched; per the scikit-learn docs they
    # are the ones that support penalty='l1'.
    'solver': ['liblinear', 'saga'],
}

# 10-fold cross-validation, candidates ranked by accuracy.
search = GridSearchCV(model, param_grid,
                      cv=10, scoring='accuracy', n_jobs=-1, verbose=1)

result = search.fit(X_train, y_train)

print('Najlepszy wynik: %s' % result.best_score_)
print('Najlepsze hiperparametry: %s' % result.best_params_)

# cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# # cnf_matrix

# disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix)
# disp.plot()
# plt.show()

# y_pred_proba = result.predict_proba(X_test)[::, 1]
# fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
# auc = metrics.roc_auc_score(y_test, y_pred_proba)
# plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
# plt.legend(loc=4)
# plt.show()

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Recursive feature elimination with 3-fold stratified cross-validation,
# removing one feature per step and scoring each subset by accuracy.
model = LogisticRegression(C=6, solver='lbfgs', random_state=45)
cv = StratifiedKFold(3)

rfecv = RFECV(model, cv=cv, scoring='accuracy', step=1, n_jobs=-1)
rfecv.fit(X_train, y_train)

y_pred = rfecv.predict(X_test)

print('Optimal number of features : %d' % rfecv.n_features_)

# ranking_ has one entry per column of the frame that was fitted, so the
# index is taken from X_train itself.
ranks = pd.DataFrame(
    rfecv.ranking_, index=X_train.columns, columns=['Rank'])

print(ranks.sort_values(by='Rank', ascending=True))

# Rank 1 marks the features RFECV kept.
selected_features = ranks[ranks['Rank'] == 1].index.values.tolist()

# Mean cross-validated score against the number of features used.
mean_scores = rfecv.cv_results_["mean_test_score"]
plt.style.use('ggplot')
plt.figure(figsize=(12, 6))
plt.xlabel('Liczba wykorzystanych cech')
# The plotted score is accuracy (scoring='accuracy'), not precision, so the
# axis label says so.
plt.ylabel('Dokladnosc (accuracy)')
plt.plot(range(1, len(mean_scores) + 1), mean_scores, color='#0088ff')
plt.show()

print(classification_report(y_test, y_pred))



In [None]:
# Restrict the feature matrix to the columns ranked 1 by RFECV and redo the
# split with the same seed, test size and stratification as before.
X = df.loc[:, selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=45,
    stratify=Y,
)

In [None]:
# Logistic regression refitted on the RFECV-selected features only.
model = LogisticRegression(C=6, solver='lbfgs', random_state=45)

result = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# ROC curve and AUC on the test split, from the second predict_proba column.
y_pred_proba = result.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Test-split report (printed under the "Trening" heading, like the baseline).
opti_report = classification_report(y_test, y_pred, output_dict=True)
print("Trening")
print(opti_report)
opti_pre = opti_report['weighted avg']['precision']
opti_f1 = opti_report['weighted avg']['f1-score']
opti_auc = auc

# The model was fitted on selected_features only, so the hold-out frame has
# to be reduced to the same columns; passing all of x_2023 makes
# predict/predict_proba fail unless RFECV happened to keep every feature.
x_2023_selected = x_2023[selected_features]

pred_val = model.predict(x_2023_selected)
opti_val_report = classification_report(y_2023, pred_val, output_dict=True)
print("Walidacja")
print(opti_val_report)
opti_pre_val = opti_val_report['weighted avg']['precision']
opti_f1_val = opti_val_report['weighted avg']['f1-score']
opti_auc_val = metrics.roc_auc_score(
    y_2023, model.predict_proba(x_2023_selected)[::, 1])

# auc_basic / auc_val are reassigned here exactly as before: from this point
# on they hold the optimized model's scores, not the baseline's.
auc_basic = opti_auc
print("AUC: ", auc_basic)

auc_val = opti_auc_val
print("AUC: ", auc_val)


# plt.grid(True)
# plt.plot(fpr, tpr, label="Regresja Logistyczna, auc="+str(auc), color='red')
# plt.plot(fpr_basic, tpr_basic, label="Regresja Logistyczna po optymalziacji, auc=" +
#          str(auc_basic), color='blue')
# plt.legend(loc=4)
# plt.show()

In [None]:
import json

# Store this notebook's metrics under the 'Regresja logistyczna' key of the
# results file, keeping every other top-level entry already in it.
filename = 'results.json'

# The key spellings are kept as they are: anything that reads the file
# looks them up by these exact names.
final_results = {
    "prediction_basic": basic_pre,
    "v_prcision_basic": basic_pre_val,
    "prediction_optimized": opti_pre,
    "v_prcision_optimized": opti_pre_val,
    "f1_score_opt": opti_f1,
    "f1_score_opt_val": opti_f1_val,
    "auc_opt": float(opti_auc),
    "auc_opt_val": float(opti_auc_val),
    "selected_features": selected_features,
}

# On a first run the file does not exist yet; start from an empty mapping
# instead of stopping with FileNotFoundError.
try:
    with open(filename, 'r') as file:
        data = json.load(file)
except FileNotFoundError:
    data = {}

data['Regresja logistyczna'] = final_results

# Write back to the same path that was read.
with open(filename, 'w') as file:
    json.dump(data, file, indent=4)