In [36]:
cd

C:\Users\santiago1


In [34]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.metrics import accuracy_score, classification_report
from sklearn.inspection import permutation_importance

# Define directories
main_dir = r"C:\Users\santiago1\Documents\Santiago\Project\Datasets\55_Datasets (Frequency)\Training-Testing"
secondary_dir = r"C:\Users\santiago1\Documents\Santiago\Project\Datasets\55_Datasets (Frequency)\Validation"
output_file = r"C:\Users\santiago1\Documents\Santiago\Project\Datasets\results_summary.csv"

# Models to test
models = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Define k for K-Fold
k = 10

# Initialize results DataFrame
results_df = pd.DataFrame(columns=['File', 'Model', 'Validation_Method', 'Accuracy', 'Feature_Importance'])

# Get total file count for progress tracking
total_files = len([file for file in os.listdir(main_dir) if file.endswith('.csv')])
processed_files = 0

# Loop over each file in main directory
for file_name in os.listdir(main_dir):
    if file_name.endswith('.csv'):
        # Print directory information for tracking
        print(f"\n{'='*40}")
        print(f"Main Directory File: {os.path.join(main_dir, file_name)}")
        
        # Load data
        file_path = os.path.join(main_dir, file_name)
        data = pd.read_csv(file_path)
        
        # Skip files with fewer than 10 rows
        if len(data) < 10:
            print(f"Skipping {file_name} as it has fewer than 10 rows.")
            processed_files += 1
            continue
        
        X = data.drop(['Cell Name'], axis=1)
        y = data['P/F']
        
        print(f"\nProcessing file: {file_name}\n")
        
        # K-Fold Cross-Validation
        print("K-Fold Cross-Validation Results:")
        kfold = KFold(n_splits=k, shuffle=True, random_state=42)
        
        for model_name, model in models.items():
            print(f"\nModel: {model_name} with K-Fold")
            kfold_accuracies = []

            for train_idx, test_idx in kfold.split(X):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                kfold_accuracies.append(accuracy)
                
            avg_kfold_accuracy = sum(kfold_accuracies) / k
            perm_importances_kfold = permutation_importance(model, X, y, n_repeats=10, random_state=42).importances_mean
            
            print(f"Average K-Fold Accuracy for {model_name}: {avg_kfold_accuracy}")

            results_df = pd.concat([results_df, pd.DataFrame([{
                'File': file_name,
                'Model': model_name,
                'Validation_Method': 'KFold',
                'Accuracy': avg_kfold_accuracy,
                'Feature_Importance': perm_importances_kfold
            }])], ignore_index=True)
        
        # LOOCV
        print("\nLeave-One-Out Cross-Validation Results:")
        loo = LeaveOneOut()
        
        for model_name, model in models.items():
            print(f"\nModel: {model_name} with LOOCV")
            loo_accuracies = []

            for train_idx, test_idx in loo.split(X):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                loo_accuracies.append(accuracy)
                
            # Print LOOCV average accuracy
            avg_loo_accuracy = sum(loo_accuracies) / len(loo_accuracies)
            perm_importances_loo = permutation_importance(model, X, y, n_repeats=10, random_state=42).importances_mean
            print(f"Average LOOCV Accuracy for {model_name}: {avg_loo_accuracy}")
            
            # Append LOOCV results to DataFrame
            results_df = pd.concat([results_df, pd.DataFrame([{
                'File': file_name,
                'Model': model_name,
                'Validation_Method': 'LOOCV',
                'Accuracy': avg_loo_accuracy,
                'Feature_Importance': perm_importances_loo
            }])], ignore_index=True)
        
        # Load prediction data from secondary directory
        secondary_file_path = os.path.join(secondary_dir, file_name)
        print(f"\nSecondary Directory File for Prediction: {secondary_file_path}")
        
        if os.path.exists(secondary_file_path):
            prediction_data = pd.read_csv(secondary_file_path)
            X_pred = prediction_data.drop(['Cell Name'], axis=1)
            y_pred_actual = prediction_data['P/F']
            
            # Make predictions for both K-Fold and LOOCV models
            print("\nPrediction Results:")
            for model_name, model in models.items():
                print(f"\nModel: {model_name} Predictions")
                
                # Train on full dataset for prediction testing
                model.fit(X, y)

                # kFold Trained Model Prediction
                y_pred_kfold = model.predict(X_pred)
                kfold_prediction_accuracy = accuracy_score(y_pred_actual, y_pred_kfold)
                print(f"K-Fold Prediction Accuracy for {model_name}: {kfold_prediction_accuracy}")
                
                # Predictions for LOOCV trained model
                y_pred_loocv = model.predict(X_pred)
                loocv_prediction_accuracy = accuracy_score(y_pred_actual, y_pred_loocv)
                print(f"LOOCV Prediction Accuracy for {model_name}: {loocv_prediction_accuracy}")

                
                # Print Classification Reports
                print("\nClassification Report (K-Fold):")
                print(classification_report(y_pred_actual, y_pred_kfold))
                
                print("\nClassification Report (LOOCV):")
                print(classification_report(y_pred_actual, y_pred_loocv))

                
                # Append prediction results to DataFrame
                results_df = pd.concat([results_df, pd.DataFrame([{
                    'File': file_name,
                    'Model': model_name,
                    'Validation_Method': 'kFold Prediction',
                    'Accuracy': kfold_prediction_accuracy
                }])], ignore_index=True)
                
                results_df = pd.concat([results_df, pd.DataFrame([{
                    'File': file_name,
                    'Model': model_name,
                    'Validation_Method': 'LOOCV Prediction',
                    'Accuracy': loocv_prediction_accuracy
                }])], ignore_index=True)
        
        # Update and print progress
        processed_files += 1
        progress = (processed_files / total_files) * 100
        print(f"\nProgress: {progress:.2f}%")

    print("\nIteration complete. Moving to the next file...\n")

# Save all results to a CSV file
results_df.to_csv(output_file, index=False)
print("\nAll results have been saved to:", output_file)


Main Directory File: C:\Users\santiago1\Documents\Santiago\Project\Datasets\55_Datasets (Frequency)\Training-Testing\1.0_Hz_df.csv

Processing file: 1.0_Hz_df.csv

K-Fold Cross-Validation Results:

Model: Random Forest with K-Fold
Average K-Fold Accuracy for Random Forest: 1.0

Model: Gradient Boosting with K-Fold


  results_df = pd.concat([results_df, pd.DataFrame([{


Average K-Fold Accuracy for Gradient Boosting: 1.0

Leave-One-Out Cross-Validation Results:

Model: Random Forest with LOOCV
Average LOOCV Accuracy for Random Forest: 1.0

Model: Gradient Boosting with LOOCV
Average LOOCV Accuracy for Gradient Boosting: 1.0

Secondary Directory File for Prediction: C:\Users\santiago1\Documents\Santiago\Project\Datasets\55_Datasets (Frequency)\Validation\1.0_Hz_df.csv

Progress: 3.23%

Iteration complete. Moving to the next file...


Main Directory File: C:\Users\santiago1\Documents\Santiago\Project\Datasets\55_Datasets (Frequency)\Training-Testing\1.25_Hz_df.csv

Processing file: 1.25_Hz_df.csv

K-Fold Cross-Validation Results:

Model: Random Forest with K-Fold
Average K-Fold Accuracy for Random Forest: 1.0

Model: Gradient Boosting with K-Fold
Average K-Fold Accuracy for Gradient Boosting: 1.0

Leave-One-Out Cross-Validation Results:

Model: Random Forest with LOOCV
Average LOOCV Accuracy for Random Forest: 1.0

Model: Gradient Boosting with LOOCV
Ave

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\santiago1\\Documents\\Santiago\\Project\\Datasets\\results_summary.csv'