In [None]:
from CCD_implementation import LogRegCCD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, balanced_accuracy_score, f1_score
import numpy as np
import pandas as pd
from preprocessing import split

In [None]:
# Location of the input dataset; set this before running the notebook.
path = ''
# pandas has no generic `pd.read` function, so a format-specific reader is required.
# read_csv is used here; swap in the matching reader (read_parquet, read_excel, ...)
# if the dataset is not a CSV file.
df = pd.read_csv(path)
# Placeholder for the user's own preprocessing. Remember to rename the y column to "target".
X, y, X_train, X_test, y_train, y_test = split(df)

Functions for evaluating and comparing the two algorithms: the custom `LogRegCCD` model and scikit-learn's `LogisticRegression`.

In [None]:
def evaluate_lr_ccd(X_train, y_train, X_test, y_test, max_iter=50, weights = True):
    """Fit a LogRegCCD model on the training data and report its test-set metrics.

    The value returned by ``LogRegCCD.plot`` (called with
    ``measure='balanced accuracy'``) is passed to ``fit`` as ``user_lambda``.
    Test-set scores and the fitted coefficients/intercept are printed.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels used for the reported scores.
        max_iter: Forwarded to ``plot``, ``plot_coefficients`` and ``fit``.
        weights: Forwarded to ``plot``, ``plot_coefficients`` and ``fit``.

    Returns:
        The ``B`` attribute (coefficients) of the fitted model.
    """
    ccd = LogRegCCD()

    # The value returned by plot() is used as the lambda for the final fit.
    chosen_lambda = ccd.plot(
        X=X_train,
        y=y_train,
        max_iter=max_iter,
        weights=weights,
        measure='balanced accuracy',
    )
    ccd.plot_coefficients(X=X_train, y=y_train, max_iter=max_iter, weights=weights)
    print("Best lambda", chosen_lambda)

    ccd.fit(
        X_train,
        y_train,
        max_iter=max_iter,
        weights=weights,
        user_lambda=chosen_lambda,
        fit_intercept=True,
        plots=True,
    )

    test_proba = ccd.predict_proba(X_test)

    # Every score is computed (in this order) before anything is printed.
    test_metrics = {
        "ROC AUC:": ccd.ROC_AUC(y_test, test_proba),
        "Recall-Precision AUC:": ccd.PR_AUC(y_test, test_proba),
        "F-score:": ccd.validate(X_test, y_test, 'F-score'),
        "Balanced Accuracy:": ccd.validate(X_test, y_test, 'balanced accuracy'),
    }

    print("Performance on test set for custom logistic regression:")
    for label, value in test_metrics.items():
        print(label, value)

    print("Coefficient values obtained (custom model):")
    print("Coefficients:", ccd.B)
    print("Intercept:", ccd.B0)

    return ccd.B

def evaluate_lr_sklearn(X_train, y_train, X_test, y_test):
    """Fit scikit-learn's LogisticRegression without a penalty and report test metrics.

    Prints ROC AUC, average precision, F1 and balanced accuracy on the test
    set, followed by the fitted coefficients and intercept.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels used for the reported scores.

    Returns:
        The fitted estimator's ``coef_`` array.
    """
    sk_model = LogisticRegression(penalty=None)
    sk_model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    test_proba = sk_model.predict_proba(X_test)[:, 1]
    test_labels = sk_model.predict(X_test)

    # Score-based metrics take probabilities; label-based metrics take hard predictions.
    test_metrics = {
        "ROC AUC:": roc_auc_score(y_test, test_proba),
        "Recall-Precision AUC:": average_precision_score(y_test, test_proba),
        "F-score:": f1_score(y_test, test_labels),
        "Balanced Accuracy:": balanced_accuracy_score(y_test, test_labels),
    }

    print("Performance on test set for sklearn logistic regression:")
    for label, value in test_metrics.items():
        print(label, value)

    print("Coefficient values obtained (sklearn):")
    print("Coefficients:", sk_model.coef_)
    print("Intercept:", sk_model.intercept_)
    return sk_model.coef_

In [None]:
def compare_coefficients(coef_ccd, coef_sk, epsilon=0.01):
    """Print summary statistics comparing two sets of model coefficients.

    Both inputs are flattened to 1-D before any statistic is computed, so a
    flat vector and a ``(1, n_features)`` array (the layout of scikit-learn's
    ``coef_`` for a binary problem) are summarised in exactly the same way.
    Without the flattening, a norm taken along ``axis=0`` collapses a 1-D
    vector to a single L2 norm but yields per-coefficient magnitudes for a
    ``(1, n)`` array, making the two printed averages incomparable.

    Args:
        coef_ccd: Coefficients of the custom CCD model (any array-like shape).
        coef_sk: Coefficients of the scikit-learn model (any array-like shape).
        epsilon: Magnitude threshold below which a coefficient counts as small.

    Returns:
        None. The statistics are printed.
    """
    coef1 = np.ravel(np.asarray(coef_ccd))
    coef2 = np.ravel(np.asarray(coef_sk))

    # The norm of a single scalar coefficient is its absolute value, so the
    # "average norm" is the mean absolute coefficient.
    abs_coef1 = np.abs(coef1)
    abs_coef2 = np.abs(coef2)
    avg_norm1 = np.mean(abs_coef1)
    avg_norm2 = np.mean(abs_coef2)
    small_coef1 = np.sum(abs_coef1 < epsilon)
    small_coef2 = np.sum(abs_coef2 < epsilon)

    print(f"Average norm of coefficients from model ccd: {avg_norm1:.4f}")
    print(f"Average norm of coefficients from model sk: {avg_norm2:.4f}")
    print(f"Number of coefficients smaller than {epsilon} in model ccd: {small_coef1}")
    print(f"Number of coefficients smaller than {epsilon} in model sk: {small_coef2}")

In [None]:
# Evaluate the custom CCD model and keep its coefficients for the comparison cell.
coef_ccd = evaluate_lr_ccd(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Evaluate the scikit-learn baseline and keep its coefficients for the comparison cell.
coef_sk = evaluate_lr_sklearn(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Compare the two coefficient sets, counting magnitudes below 0.1 as small.
compare_coefficients(coef_ccd=coef_ccd, coef_sk=coef_sk, epsilon=0.1)

In [None]:
# Fit a LogRegCCD with default settings on the training split and report its
# F-score on the test split. The labels passed to fit() must come from the same
# split as the features (y_train, not y_test).
model = LogRegCCD()
# fit() is called as its own statement, as in evaluate_lr_ccd, so this cell does
# not depend on fit() returning the model instance.
model.fit(X_train, y_train)
model.validate(X_test, y_test, measure="F-score")