## logistic regression code for the webapp

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import os

# Preview every feature name the bundled dataset offers, so the hand-picked
# feature list defined further down can be checked against the real names.
cancer_temp = load_breast_cancer()
available_feature_names = cancer_temp.feature_names
print("Available feature names from dataset:", available_feature_names)

# --- Tunable settings for the split search ---
N_ITERATIONS = 50     # number of random train/test splits to try
MODEL_MAX_ITER = 200  # iteration cap passed to LogisticRegression(max_iter=...)

# Columns the model is trained on; each entry is used verbatim as a DataFrame
# column label, so it must match a dataset feature name exactly.
best_feature_names = [
    "radius error",
    "worst texture",
    "worst area",
    "worst smoothness",
    "worst concave points",
]
print("\nUsing these features for the model: " + str(best_feature_names))


# Load the dataset and build a single DataFrame: one column per feature plus
# the dataset's target values in a trailing 'target' column.
breast_cancer_dataset = load_breast_cancer()
df = pd.DataFrame(
    breast_cancer_dataset.data,
    columns=breast_cancer_dataset.feature_names,
).assign(target=breast_cancer_dataset.target)

# --- Model inputs: the chosen feature columns (in list order) and the target ---
X_selected = df.loc[:, best_feature_names]
y_selected = df["target"]


# chris's approach: fit one model per random 75:25 split and keep the one with
# the highest accuracy on that split's own test portion.
#
# NOTE(review): the winning split is chosen by the same test accuracy that is
# then reported, so best_accuracy_overall is an optimistic figure rather than
# an unbiased estimate of how the saved model performs on unseen data.
SEARCH_SEED = 42                  # fixes the sequence of split seeds so a rerun selects the same model
RANDOM_STATE_UPPER_BOUND = 10000  # split seeds are drawn from [0, RANDOM_STATE_UPPER_BOUND)

best_model_overall = None
best_scaler_overall = None  # the scaler is saved with the model for the webapp (joblib)
best_accuracy_overall = 0.0
best_random_state_overall = 0

# A dedicated seeded generator keeps the search reproducible without touching
# NumPy's global random state.
split_seed_rng = np.random.default_rng(SEARCH_SEED)

print(f"\nStarting search for the best model over {N_ITERATIONS} random splits...")

for i in range(N_ITERATIONS):
    # int(...) turns the NumPy integer into a plain Python int for sklearn and for printing.
    current_random_state = int(split_seed_rng.integers(0, RANDOM_STATE_UPPER_BOUND))

    # Split data into 75:25 for this iteration
    X_train_iter, X_test_iter, y_train_iter, y_test_iter = train_test_split(
        X_selected, y_selected, train_size=0.75, random_state=current_random_state
    )

    # Scale data: the scaler is FIT ONLY on X_train_iter for this specific split,
    # then the same fitted scaler transforms the test rows.
    scaler_iter = StandardScaler()
    X_train_scaled_iter = scaler_iter.fit_transform(X_train_iter)
    X_test_scaled_iter = scaler_iter.transform(X_test_iter)

    model_iter = LogisticRegression(max_iter=MODEL_MAX_ITER, solver='liblinear', random_state=current_random_state)
    model_iter.fit(X_train_scaled_iter, y_train_iter)

    # Evaluate the model on the test data for this iteration
    y_pred_iter = model_iter.predict(X_test_scaled_iter)
    accuracy_iter = accuracy_score(y_test_iter, y_pred_iter)

    # Progress line every 10th iteration only, to keep the output short.
    if (i + 1) % 10 == 0:
        print(f"Iteration {i+1}/{N_ITERATIONS}, Random State: {current_random_state}, Accuracy: {accuracy_iter:.4f}")

    # The first candidate is always accepted, so a model is retained even if
    # every split scores 0.0; afterwards only a strictly better score replaces it.
    if best_model_overall is None or accuracy_iter > best_accuracy_overall:
        best_accuracy_overall = accuracy_iter
        best_model_overall = model_iter
        best_scaler_overall = scaler_iter  # keep the scaler fit on this split's training data
        best_random_state_overall = current_random_state
        print(f"  -> New best accuracy found: {best_accuracy_overall:.4f} with random state {best_random_state_overall}")


# --- Report the outcome of the split search ---
print("\n--- Search Complete ---")
print(f"Best Overall Accuracy: {best_accuracy_overall:.6f}")
print(f"Achieved with Random State for train_test_split: {best_random_state_overall}")
# Compare against None explicitly: "no model yet" is represented by None, and
# the truth value of an arbitrary object is not a reliable stand-in for that.
if best_model_overall is not None:
    print(f"Best Model Coefficients: {best_model_overall.coef_}")
    print(f"Best Model Intercept: {best_model_overall.intercept_}")
else:
    print("No model was successfully trained.")

# --- Persist the best model, its scaler, and the feature names for the webapp ---
# Explicit None checks: both placeholders start as None, and an object's truth
# value is not a reliable "has been set" signal.
if best_model_overall is not None and best_scaler_overall is not None:
    # Relative path: resolved against the current working directory
    # (presumably the notebook's folder - confirm when running from elsewhere).
    SAVE_PATH = "saved_models/"
    os.makedirs(SAVE_PATH, exist_ok=True)

    # Filenames for this specific "best logistic regression" model.
    feature_names_filename_save = os.path.join(SAVE_PATH, "best_logreg_feature_names.joblib")
    scaler_filename_save = os.path.join(SAVE_PATH, "best_logreg_scaler.joblib")
    model_filename_save = os.path.join(SAVE_PATH, "best_logreg_model.joblib")

    # (object to dump, destination file, label for the confirmation message).
    # The feature-name list is stored in the same order used to select the
    # training columns.
    artifacts_to_save = [
        (best_feature_names, feature_names_filename_save, "Feature names for the best model"),
        (best_scaler_overall, scaler_filename_save, "Scaler for the best model"),
        (best_model_overall, model_filename_save, "Best logistic regression model"),
    ]
    for artifact, artifact_path, artifact_label in artifacts_to_save:
        joblib.dump(artifact, artifact_path)
        print(f"{artifact_label} saved to: {artifact_path}")

    print("\n--- Model, Scaler, and Feature Names Saving Complete ---")
else:
    print("\nSkipping saving as no best model was found (this shouldn't happen if data loads correctly).")


Available feature names from dataset: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Using these features for the model: ['radius error', 'worst texture', 'worst area', 'worst smoothness', 'worst concave points']

Starting search for the best model over 50 random splits...
  -> New best accuracy found: 0.9720 with random state 6281
  -> New best accuracy found: 0.9860 with random state 6405
  -> New best accuracy found: 0.9930 with random state 4102
Iteration 10/50, Random State: 8413, 