In [26]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Import project modules
from utils.data_loader import load_split, prepare_features_target
from validation.validate_model import cross_validate_model

# Confirmation message; it is only reached if none of the imports in this cell raised.
# The check mark is written as the real U+2713 character instead of its mis-decoded byte sequence.
print("✓ All imports successful!")


✓ All imports successful!


In [27]:
# Read the 'train' split from dataset/splits and separate it into a feature
# matrix and the 'Fault' target using the project's data-loading helpers.
train_df = load_split('train', data_dir='dataset/splits')
X_train, y_train = prepare_features_target(train_df, target_col='Fault')

# Report the feature-matrix shape plus the class balance, first as absolute
# counts and then as proportions. A single print call with a newline separator
# emits exactly the same text as three consecutive prints.
print(
    f"Training data shape: {X_train.shape}",
    f"Target distribution:\n{y_train.value_counts()}",
    f"\nOriginal class distribution:\n{y_train.value_counts(normalize=True)}",
    sep="\n",
)


Training data shape: (941, 9)
Target distribution:
Fault
0    652
1    289
Name: count, dtype: int64

Original class distribution:
Fault
0    0.69288
1    0.30712
Name: proportion, dtype: float64


In [28]:
# K-fold cross-validation (k=5) of a logistic-regression baseline.
#
# Resampling and scaling are delegated to cross_validate_model through the
# use_smote / apply_scaling flags. The intent is that, inside every fold,
# SMOTE and StandardScaler are fitted on the training portion only, so the
# validation portion is scored as untouched original data.
# NOTE(review): that per-fold behaviour is implemented in
# validation/validate_model.py, not here — confirm it there.

from sklearn.linear_model import LogisticRegression

# Estimator handed to the helper, which is expected to train it once per fold.
cv_model = LogisticRegression(
    C=1.0,
    max_iter=1000,
    solver='lbfgs',
    random_state=42
)

# The original (non-augmented) training features and labels are passed in;
# any oversampling is left to the helper.
cv_results = cross_validate_model(
    model=cv_model,
    X=X_train,
    y=y_train,
    cv=5,  # number of folds
    metrics=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    random_state=42,
    use_smote=True,  # request SMOTE on the training part of each fold
    apply_scaling=True  # request StandardScaler per fold
)

# The check mark is written as the real U+2713 character instead of its
# mis-decoded byte sequence.
print("✓ Cross-validation completed!")
print(f"\nNumber of folds: {cv_results['cv_folds']}")


✓ Cross-validation completed!

Number of folds: 5


In [29]:
# Display the cross-validation results: a summary table (mean / std / min / max
# per metric) followed by one listing per fold.
#
# NOTE(review): in the recorded run accuracy and recall are identical in every
# fold, which is what support-weighted recall reduces to — confirm the averaging
# mode used by cross_validate_model before reading recall as a positive-class metric.
fold_results = cv_results['fold_results']
# One list of metric names drives both sections, so they cannot drift apart.
metric_names = list(cv_results['mean'].keys())

print("=" * 60)
# k is read from the results rather than hard-coded, so the title stays
# correct if the number of folds is changed in the cross-validation cell.
print(f"K-FOLD CROSS-VALIDATION RESULTS (k={cv_results['cv_folds']})")
print("=" * 60)
print(f"\n{'Metric':<15} {'Mean':<12} {'Std':<12} {'Min':<12} {'Max':<12}")
print("-" * 60)

for metric in metric_names:
    mean_val = cv_results['mean'][metric]
    std_val = cv_results['std'][metric]
    # Min and max are not stored in cv_results, so derive them from the folds.
    metric_values = [fold[metric] for fold in fold_results]
    min_val = np.min(metric_values)
    max_val = np.max(metric_values)
    print(f"{metric:<15} {mean_val:<12.4f} {std_val:<12.4f} {min_val:<12.4f} {max_val:<12.4f}")

print("\n" + "=" * 60)
print("PER-FOLD RESULTS:")
print("=" * 60)
for fold_result in fold_results:
    print(f"\nFold {fold_result['fold']}:")
    for metric in metric_names:
        # Guard kept so a fold that lacks a metric is skipped instead of raising.
        if metric in fold_result:
            print(f"  {metric}: {fold_result[metric]:.4f}")


K-FOLD CROSS-VALIDATION RESULTS (k=5)

Metric          Mean         Std          Min          Max         
------------------------------------------------------------
accuracy        0.5143       0.0281       0.4894       0.5661      
precision       0.5857       0.0362       0.5458       0.6490      
recall          0.5143       0.0281       0.4894       0.5661      
f1              0.5323       0.0273       0.5080       0.5827      
roc_auc         0.5361       0.0492       0.4829       0.6242      

PER-FOLD RESULTS:

Fold 1:
  accuracy: 0.5661
  precision: 0.6490
  recall: 0.5661
  f1: 0.5827
  roc_auc: 0.6242

Fold 2:
  accuracy: 0.4947
  precision: 0.5600
  recall: 0.4947
  f1: 0.5148
  roc_auc: 0.4829

Fold 3:
  accuracy: 0.5000
  precision: 0.5458
  recall: 0.5000
  f1: 0.5167
  roc_auc: 0.5020

Fold 4:
  accuracy: 0.5213
  precision: 0.5989
  recall: 0.5213
  f1: 0.5394
  roc_auc: 0.5228

Fold 5:
  accuracy: 0.4894
  precision: 0.5750
  recall: 0.4894
  f1: 0.5080
  roc_auc: 

In [30]:
# Generate the cross-validation report through the project's reporting helper.
from results.report_cross_validation import report_cross_validation

# The helper returns a summary object and the path of the report it wrote.
cv_summary, cv_report_path = report_cross_validation(
    cv_results=cv_results,
    model_name="logistic_regression",
    output_path=None,  # presumably lets the helper choose the destination — confirm in the helper
    title="Logistic Regression Cross-Validation Report"
)

# The check mark and page icon are written as the real Unicode characters
# instead of their mis-decoded byte sequences.
print("\n✓ Cross-validation report generated!")
print(f"📄 Report saved to: {cv_report_path}")



✓ Cross-validation report generated!
📄 Report saved to: /home/ari/Collage/04-Forth_Year/Preimer_Semestre/AM/Final_Proj/Machine-Learning-Project/src/results/logistic_regression_cv_report_20251119_013406.md
