In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

In [2]:
def train_random_forest_classifier(df, include_ratios=False, output_dir='plots', cv_folds=5,
                                   test_size=0.2, n_estimators=100, random_state=42):
    """
    Train a Random Forest classifier that predicts whether an instance is a timeout
    (censored=1) or completed (censored=0), then report and persist its results.

    Workflow:
    1. Select every column of ``df`` as a feature except identifiers, final-outcome
       columns, the target, and (unless ``include_ratios``) the ``*_ratio_1..5`` columns.
    2. Split the rows into a stratified train/test partition and fit the forest on the
       training part only.
    3. Report accuracy, precision, recall, F1 and the confusion matrix on the held-out part.
    4. Run ``cv_folds``-fold cross-validation (accuracy) over the full feature matrix.
    5. Write three CSV files into ``output_dir``: test-set predictions, feature importance,
       and predictions for every row of ``df``.

    Note: the "all data" predictions include the rows the model was fitted on, so that file
    is not an unbiased performance estimate.

    Parameters:
    - df (pandas.DataFrame): Preprocessed DataFrame with solver features and censored column.
    - include_ratios (bool): If True, includes ratio features; if False, excludes them (default: False).
    - output_dir (str): Directory to save predictions and feature importance (default: 'plots').
    - cv_folds (int): Number of cross-validation folds (default: 5).
    - test_size (float): Fraction of rows held out for the test set (default: 0.2).
    - n_estimators (int): Number of trees in the forest (default: 100).
    - random_state (int): Seed used for both the split and the forest (default: 42).

    Returns:
    - dict: keys 'accuracy', 'precision_timeout', 'recall_timeout', 'f1_timeout',
      'confusion_matrix', 'cv_accuracy', 'cv_accuracy_std' and 'feature_importance'.
      Predictions are written to CSV, not returned.
    - None: if any step raises; the error message is printed instead of propagated.
    """
    try:
        # Validate input early so failures produce a readable message.
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"Input 'df' must be a pandas DataFrame, got {type(df)}")
        print(f"Input DataFrame type: {type(df)}")
        if 'censored' not in df.columns:
            raise ValueError("Input 'df' must contain a 'censored' column holding the 0/1 target")

        # Feature selection: drop identifiers, final-outcome columns and the target itself.
        exclude_cols = {'filename', 'final_expandEvts', 'stop_iter', 'final_maxStackDepth', 'censored'}
        if not include_ratios:
            for prefix in ('expandEvts_ratio', 'pruneBacktrackEvts_ratio'):
                exclude_cols.update(f'{prefix}_{i}' for i in range(1, 6))
        features = [col for col in df.columns if col not in exclude_cols]
        print(f"Selected features ({len(features)}): {features}")
        if not features:
            raise ValueError("No feature columns remain after applying the exclusion list")

        # Print class distribution
        print("\nClass Distribution (censored):")
        class_counts = df['censored'].value_counts()
        print(f"Completed (censored=0): {class_counts.get(0, 0)} instances")
        print(f"Timeout (censored=1): {class_counts.get(1, 0)} instances")

        # Prepare data; stratify so both partitions keep the class ratio.
        X = df[features]
        y = df['censored'].astype(int)  # Ensure binary labels (0 or 1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # Train Random Forest Classifier (fitted on the training partition only).
        print("\nTraining Random Forest Classifier on all data...")
        rf_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
        rf_classifier.fit(X_train, y_train)

        # Evaluate on test set; the positive class is the timeout label (1).
        y_pred = rf_classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=1)
        recall = recall_score(y_test, y_pred, pos_label=1)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        # labels=[0, 1] fixes the layout: rows are true classes, columns are predictions.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

        print(f"Test Set Accuracy: {accuracy:.4f}")
        print(f"Test Set Precision (timeout): {precision:.4f}")
        print(f"Test Set Recall (timeout): {recall:.4f}")
        print(f"Test Set F1-Score (timeout): {f1:.4f}")
        print("\nConfusion Matrix:")
        print(f"[[True Neg (Completed)={cm[0,0]}, False Pos={cm[0,1]}]")
        print(f" [False Neg={cm[1,0]}, True Pos (Timeout)={cm[1,1]}]]")

        # Cross-validation over the full matrix; cross_val_score fits clones of the
        # estimator, so the model fitted above is left untouched.
        print("\nPerforming cross-validation...")
        cv_scores = cross_val_score(rf_classifier, X, y, cv=cv_folds, scoring='accuracy')
        cv_accuracy = cv_scores.mean()
        print(f"Cross-Validation Accuracy: {cv_accuracy:.4f} (+/- {cv_scores.std() * 2:.4f})")

        # Feature importance
        importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf_classifier.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\nFeature Importance (Top 10):")
        print(importance.head(10))

        # Save predictions and feature importance
        os.makedirs(output_dir, exist_ok=True)
        test_predictions_path = os.path.join(output_dir, 'predictions_classifier_test.csv')
        importance_path = os.path.join(output_dir, 'feature_importance_classifier.csv')
        all_predictions_path = os.path.join(output_dir, 'predictions_classifier_all.csv')

        pd.DataFrame({
            'y_test': y_test,
            'y_pred': y_pred
        }).to_csv(test_predictions_path, index=False)
        importance.to_csv(importance_path, index=False)
        print(f"Saved test set predictions to {output_dir}/predictions_classifier_test.csv")
        print(f"Saved feature importance to {output_dir}/feature_importance_classifier.csv")

        # Predict on all data for completeness (includes rows seen during training).
        y_pred_all = rf_classifier.predict(X)
        pd.DataFrame({
            'y_test': y,
            'y_pred': y_pred_all
        }).to_csv(all_predictions_path, index=False)
        print(f"Saved all-data predictions to {output_dir}/predictions_classifier_all.csv")

        return {
            'accuracy': accuracy,
            'precision_timeout': precision,
            'recall_timeout': recall,
            'f1_timeout': f1,
            'confusion_matrix': cm,
            'cv_accuracy': cv_accuracy,
            'cv_accuracy_std': cv_scores.std(),
            'feature_importance': importance,
        }

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None
        # rather than aborting the notebook cell.
        print(f"Error in train_random_forest_classifier: {e}")
        return None

# Load the preprocessed table from the working directory and fit the classifier
# with its default settings; the returned metrics dict is the cell's displayed output.
df = pd.read_excel(io="structured_data.xlsx")
train_random_forest_classifier(df=df)

Input DataFrame type: <class 'pandas.core.frame.DataFrame'>
Selected features (60): ['num_stackdepth3_logs', 'evts_1', 'expandEvts_1', 'pruneBacktrackEvts_1', 'backtrackEvts_1', 'strengthenEvts_1', 'maxStackDepth_1', 'evts_2', 'expandEvts_2', 'pruneBacktrackEvts_2', 'backtrackEvts_2', 'strengthenEvts_2', 'maxStackDepth_2', 'evts_3', 'expandEvts_3', 'pruneBacktrackEvts_3', 'backtrackEvts_3', 'strengthenEvts_3', 'maxStackDepth_3', 'avg_evts', 'max_evts', 'avg_expandEvts', 'max_expandEvts', 'avg_pruneBacktrackEvts', 'max_pruneBacktrackEvts', 'evts_4', 'expandEvts_4', 'pruneBacktrackEvts_4', 'backtrackEvts_4', 'strengthenEvts_4', 'maxStackDepth_4', 'evts_5', 'expandEvts_5', 'pruneBacktrackEvts_5', 'backtrackEvts_5', 'strengthenEvts_5', 'maxStackDepth_5', 'n', 'k', 'total_sum', 'variance', 'skewness', 'max_num', 'min_num', 'avg_subset_sum', 'max_to_avg_ratio', 'range_to_avg_ratio', 'coef_of_variation', 'diff_evts_2', 'diff_expandEvts_2', 'diff_pruneBacktrackEvts_2', 'diff_evts_3', 'diff_exp

{'accuracy': 0.9855072463768116,
 'precision_timeout': 1.0,
 'recall_timeout': 0.9736842105263158,
 'f1_timeout': 0.9866666666666667,
 'confusion_matrix': array([[62,  0],
        [ 2, 74]]),
 'cv_accuracy': 0.9695652173913043,
 'cv_accuracy_std': 0.0356179876157812,
 'feature_importance':                       feature  importance
 20                   max_evts    0.117263
 23     avg_pruneBacktrackEvts    0.116067
 19                   avg_evts    0.087929
 24     max_pruneBacktrackEvts    0.077664
 21             avg_expandEvts    0.068835
 7                      evts_2    0.064734
 48                diff_evts_2    0.057257
 22             max_expandEvts    0.050304
 50  diff_pruneBacktrackEvts_2    0.049256
 49          diff_expandEvts_2    0.047179
 8                expandEvts_2    0.039309
 9        pruneBacktrackEvts_2    0.037398
 37                          n    0.025702
 0        num_stackdepth3_logs    0.024127
 14               expandEvts_3    0.022217
 15       pruneBacktra