In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os

def _relative_errors(y_true, y_pred):
    """Return |y_true - y_pred| / y_true for the entries whose true value is non-zero."""
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    nonzero = actual != 0  # zero targets would divide by zero, so they are skipped
    return np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])


def _normalize(value, scale):
    """Return value / scale, or NaN when scale is zero."""
    return value / scale if scale != 0 else np.nan


def _save_diagnostic_plots(output_dir, y_test, y_pred_test, y_all, y_pred_all, importance, top_n=10):
    """Write predicted-vs-actual scatter plots and a top-feature importance chart to output_dir."""
    scatter_specs = (
        (y_test, y_pred_test, 'Censored=0 Test Set: Predicted vs Actual', 'scatter_censored_test.png'),
        (y_all, y_pred_all, 'All Data: Predicted vs Actual', 'scatter_all.png'),
    )
    for actual, predicted, title, file_name in scatter_specs:
        actual = np.asarray(actual, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(actual, predicted, alpha=0.5)
        # Diagonal reference line: points on it are perfect predictions.
        low = min(actual.min(), predicted.min())
        high = max(actual.max(), predicted.max())
        ax.plot([low, high], [low, high], 'r--', label='Ideal (y = x)')
        ax.set_xlabel('Actual final_expandEvts')
        ax.set_ylabel('Predicted final_expandEvts')
        ax.set_title(title)
        ax.legend()
        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, file_name))
        plt.close(fig)  # release the figure so repeated calls do not accumulate memory

    # Reverse so the most important feature ends up at the top of the horizontal bar chart.
    top = importance.head(top_n).iloc[::-1]
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(top['feature'], top['importance'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Censored=0 Feature Importance (Top {top_n})')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'feature_importance_censored.png'))
    plt.close(fig)


def train_random_forest_censored_train_all_validate(df, include_ratios=False, save_plots=False, output_dir='plots', cv_folds=5):
    """
    Trains a Random Forest model on censored=0 data to predict final_expandEvts and validates on all data
    to assess generalizability. Uses relative error metrics (MAPE, MdAPE, Normalized RMSE) and cross-validation.

    The input DataFrame is copied, so the caller's object is never modified. Every column that is not
    explicitly excluded (including 'censored' and the derived diff_* columns) is used as a feature.
    The "all data" evaluation scores the fitted model on every row of df, which includes the rows the
    model was trained on.

    Prediction and feature-importance CSV files are always written to output_dir, regardless of save_plots.

    Parameters:
    - df (pandas.DataFrame): Input DataFrame with solver features, ratio features, and final_expandEvts.
      Must contain 'censored' and 'final_expandEvts'; 'k' and 'num_stackdepth3_logs' are required
      whenever any per-log column (e.g. 'evts_1') is present.
    - include_ratios (bool): If True, includes ratio features; if False, excludes them (default: False).
    - save_plots (bool): If True, saves scatter and feature importance plots to output_dir.
    - output_dir (str): Directory to save plots and results (default: 'plots').
    - cv_folds (int): Number of cross-validation folds (default: 5).

    Returns:
    - dict: Contains relative error metrics, CV scores, feature importance, and predictions for censored=0 test set and all data.
    - None: If there are no censored=0 rows, or if any error occurs (the error is printed, not raised).
    """
    try:
        # Validate input
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"Input 'df' must be a pandas DataFrame, got {type(df)}")
        print(f"Input DataFrame type: {type(df)}")
        print(f"Input columns: {df.columns.tolist()}")

        # Fail early with a readable message instead of a bare KeyError further down.
        missing = [col for col in ('censored', 'final_expandEvts') if col not in df]
        if missing:
            raise ValueError(f"Missing required column(s): {missing}")

        # Create a copy to avoid modifying the input DataFrame
        df = df.copy()

        # Add log difference features (difference between consecutive log snapshots).
        # These are computed before imputation, so a missing snapshot yields a diff of 0.
        print("Adding log difference features...")
        for i in range(2, 6):
            for feature in ['evts', 'expandEvts', 'pruneBacktrackEvts']:
                col_curr = f'{feature}_{i}'
                col_prev = f'{feature}_{i-1}'
                if col_curr in df and col_prev in df:
                    df[f'diff_{feature}_{i}'] = (df[col_curr] - df[col_prev]).fillna(0)
        print(f"Columns after log difference features: {df.columns.tolist()}")

        # Ensure missing log features are imputed
        print("Imputing missing log features...")
        log_features = ['evts', 'expandEvts', 'pruneBacktrackEvts', 'backtrackEvts', 'strengthenEvts', 'maxStackDepth']
        for i in range(1, 6):
            present = [f'{feature}_{i}' for feature in log_features if f'{feature}_{i}' in df]
            if not present:
                continue
            missing = [col for col in ('k', 'num_stackdepth3_logs') if col not in df]
            if missing:
                raise ValueError(f"Missing required column(s) for imputation: {missing}")
            # Rows with k == 3, or with fewer than i recorded logs, get 0 in place of NaN.
            # The mask depends only on i, so it is built once per log index.
            fill_rows = (df['k'] == 3) | (df['num_stackdepth3_logs'] < i)
            for col in present:
                df.loc[fill_rows, col] = df.loc[fill_rows, col].fillna(0)
        print(f"Columns after imputation: {df.columns.tolist()}")

        # Feature selection
        exclude_cols = ['filename', 'final_expandEvts', 'stop_iter', 'final_maxStackDepth']
        if not include_ratios:
            exclude_cols.extend([f'expandEvts_ratio_{i}' for i in range(1, 6)])
            exclude_cols.extend([f'pruneBacktrackEvts_ratio_{i}' for i in range(1, 6)])
        features = [col for col in df.columns if col not in exclude_cols]
        print(f"Selected features ({len(features)}): {features}")

        df_censored = df[df['censored'] == 0]

        # Print target statistics
        print("\nTarget Statistics (final_expandEvts):")
        mean_censored = df_censored['final_expandEvts'].mean()
        std_censored = df_censored['final_expandEvts'].std()
        mean_all = df['final_expandEvts'].mean()
        std_all = df['final_expandEvts'].std()
        print(f"Mean (censored=0): {mean_censored:.2f}")
        print(f"Std Dev (censored=0): {std_censored:.2f}")
        print(f"Mean (all data): {mean_all:.2f}")
        print(f"Std Dev (all data): {std_all:.2f}")

        def evaluate(y_true, y_pred, mean_y):
            """Return (rmse, normalized rmse, MAPE %, MdAPE %) for one set of predictions."""
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            rel = _relative_errors(y_true, y_pred)
            if rel.size > 0:
                mape, mdape = np.mean(rel) * 100, np.median(rel) * 100
            else:
                mape = mdape = np.nan
            return rmse, _normalize(rmse, mean_y), mape, mdape

        # Train on censored=0 data
        print("\nTraining Random Forest on censored=0 data...")
        if df_censored.empty:
            print("Warning: No censored=0 instances found. Exiting.")
            return None

        X_censored = df_censored[features]
        y_censored = df_censored['final_expandEvts']
        X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_censored, y_censored, test_size=0.2, random_state=42)

        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_c, y_train_c)

        # Evaluate on censored=0 test set (normalized by the mean over all censored=0 rows)
        y_pred_c = rf_model.predict(X_test_c)
        rmse_censored, norm_rmse_censored, mape_censored, mdape_censored = evaluate(y_test_c, y_pred_c, mean_censored)
        # cross_val_score is run over the full censored=0 set, not just the training split.
        cv_scores = cross_val_score(rf_model, X_censored, y_censored, cv=cv_folds, scoring='neg_root_mean_squared_error')
        cv_norm_rmse_censored = _normalize(-cv_scores.mean(), mean_censored)
        print(f"Censored=0 Test Set RMSE: {rmse_censored:.4f}")
        print(f"Censored=0 Test Set Normalized RMSE: {norm_rmse_censored:.4%}")
        print(f"Censored=0 Test Set MAPE: {mape_censored:.2f}%")
        print(f"Censored=0 Test Set MdAPE: {mdape_censored:.2f}%")
        print(f"Censored=0 CV Normalized RMSE: {cv_norm_rmse_censored:.4%}")

        importance_censored = pd.DataFrame({
            'feature': X_censored.columns,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\nCensored=0 Feature Importance (Top 10):")
        print(importance_censored.head(10))

        # Validate on all data (this includes the rows used for training above)
        print("\nValidating on all data...")
        X_all = df[features]
        y_all = df['final_expandEvts']
        y_pred_all = rf_model.predict(X_all)
        rmse_all, norm_rmse_all, mape_all, mdape_all = evaluate(y_all, y_pred_all, mean_all)
        print(f"All Data RMSE: {rmse_all:.4f}")
        print(f"All Data Normalized RMSE: {norm_rmse_all:.4%}")
        print(f"All Data MAPE: {mape_all:.2f}%")
        print(f"All Data MdAPE: {mdape_all:.2f}%")

        # Save predictions and feature importance
        os.makedirs(output_dir, exist_ok=True)
        pd.DataFrame({
            'y_test': y_test_c,
            'y_pred': y_pred_c
        }).to_csv(os.path.join(output_dir, 'predictions_censored_test.csv'), index=False)
        pd.DataFrame({
            'y_test': y_all,
            'y_pred': y_pred_all
        }).to_csv(os.path.join(output_dir, 'predictions_all.csv'), index=False)
        importance_censored.to_csv(os.path.join(output_dir, 'feature_importance_censored.csv'), index=False)
        print(f"Saved censored=0 test predictions to {output_dir}/predictions_censored_test.csv")
        print(f"Saved all-data predictions to {output_dir}/predictions_all.csv")
        print(f"Saved feature importance to {output_dir}/feature_importance_censored.csv")

        # Optional plots, as promised by the save_plots parameter
        if save_plots:
            _save_diagnostic_plots(output_dir, y_test_c, y_pred_c, y_all, y_pred_all, importance_censored)
            print(f"Saved scatter and feature importance plots to {output_dir}")

        return {
            'rmse_censored_test': rmse_censored,
            'norm_rmse_censored_test': norm_rmse_censored,
            'mape_censored_test': mape_censored,
            'mdape_censored_test': mdape_censored,
            'cv_norm_rmse_censored': cv_norm_rmse_censored,
            'rmse_all': rmse_all,
            'norm_rmse_all': norm_rmse_all,
            'mape_all': mape_all,
            'mdape_all': mdape_all,
            'feature_importance': importance_censored,
            'y_test_censored': y_test_c,
            'y_pred_censored': y_pred_c,
            'y_test_all': y_all,
            'y_pred_all': y_pred_all
        }

    except Exception as e:
        # Best-effort contract: callers get None on any failure (including the
        # validation errors raised above); the message is printed rather than raised.
        print(f"Error in train_random_forest_censored_train_all_validate: {e}")
        return None

In [6]:
# Load the structured solver dataset from the working directory.
df = pd.read_excel(io="structured_data.xlsx")
# Run the experiment with the default settings spelled out: ratio features
# excluded, results written under 'plots', 5-fold cross-validation.
results = train_random_forest_censored_train_all_validate(
    df,
    include_ratios=False,
    save_plots=False,
    output_dir='plots',
    cv_folds=5,
)
# Bare expression so the notebook displays the returned metrics dict.
results

Input DataFrame type: <class 'pandas.core.frame.DataFrame'>
Input columns: ['filename', 'num_stackdepth3_logs', 'evts_1', 'expandEvts_1', 'pruneBacktrackEvts_1', 'backtrackEvts_1', 'strengthenEvts_1', 'maxStackDepth_1', 'evts_2', 'expandEvts_2', 'pruneBacktrackEvts_2', 'backtrackEvts_2', 'strengthenEvts_2', 'maxStackDepth_2', 'evts_3', 'expandEvts_3', 'pruneBacktrackEvts_3', 'backtrackEvts_3', 'strengthenEvts_3', 'maxStackDepth_3', 'censored', 'final_expandEvts', 'final_maxStackDepth', 'stop_iter', 'avg_evts', 'max_evts', 'avg_expandEvts', 'max_expandEvts', 'avg_pruneBacktrackEvts', 'max_pruneBacktrackEvts', 'evts_4', 'expandEvts_4', 'pruneBacktrackEvts_4', 'backtrackEvts_4', 'strengthenEvts_4', 'maxStackDepth_4', 'evts_5', 'expandEvts_5', 'pruneBacktrackEvts_5', 'backtrackEvts_5', 'strengthenEvts_5', 'maxStackDepth_5', 'n', 'k', 'total_sum', 'variance', 'skewness', 'max_num', 'min_num', 'avg_subset_sum', 'max_to_avg_ratio', 'range_to_avg_ratio', 'coef_of_variation', 'expandEvts_ratio_

{'rmse_censored_test': 66180820.15411146,
 'norm_rmse_censored_test': 1.2433506574572482,
 'mape_censored_test': 21.988389124038925,
 'mdape_censored_test': 19.561849377825446,
 'cv_norm_rmse_censored': 1.4217348699589045,
 'rmse_all': 276647912.3957079,
 'norm_rmse_all': 0.3634576272452347,
 'mape_all': 19.772928858500563,
 'mdape_all': 20.423839422148387,
 'feature_importance':                    feature  importance
 24  avg_pruneBacktrackEvts    0.290642
 22          avg_expandEvts    0.174435
 20                avg_evts    0.152380
 25  max_pruneBacktrackEvts    0.094161
 21                max_evts    0.061632
 ..                     ...         ...
 5         strengthenEvts_1    0.000000
 4          backtrackEvts_1    0.000000
 3     pruneBacktrackEvts_1    0.000000
 1                   evts_1    0.000000
 2             expandEvts_1    0.000000
 
 [61 rows x 2 columns],
 'y_test_censored': 672             9
 656             7
 223         23367
 9             713
 77      48302432