In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams
import seaborn as sns
import matplotlib.font_manager as fm
from scipy import stats
from scipy.stats import linregress
import re
import os

def extract_level_number(level_str):
    """Extract numeric level from level string - improved version"""
    import re
    
    if pd.isna(level_str):
        return None
    
    # Convert to string if not already
    level_str = str(level_str).lower().strip()
    
    # Try different patterns
    patterns = [
        r'level\s*(\d+)',  # level0, level 0, level_0
        r'lvl\s*(\d+)',    # lvl0, lvl 0
        r'^(\d+)$',        # just a number
        r'(\d+)',          # any number in the string
    ]
    
    for pattern in patterns:
        match = re.search(pattern, level_str)
        if match:
            return int(match.group(1))
    
    return None

def parse_value_with_error(value_str):
    """
    Parse a string like '15.59 ± 0.94' and return (value, error)
    If no error is found, return (value, 0.0)
    """
    if pd.isna(value_str):
        return None, None
    
    value_str = str(value_str).strip()
    
    # Look for pattern: number ± number
    pattern = r'(-?\d+\.?\d*)\s*[±]\s*(\d+\.?\d*)'
    match = re.search(pattern, value_str)
    
    if match:
        value = float(match.group(1))
        error = float(match.group(2))
        return value, error
    else:
        # Try to extract just the number if no ± found
        number_pattern = r'(-?\d+\.?\d*)'
        number_match = re.search(number_pattern, value_str)
        if number_match:
            value = float(number_match.group(1))
            return value, 0.0
        else:
            return None, None

def create_merged_lfer_plots(csv_path, save_paths=None, error_scale_factor=0.3):
    """
    Create merged LFER plots: one with error bars and one without, with single R²
    
    Parameters:
    csv_path (str): Path to the CSV file
    save_paths (list): List of directory paths where the plot should be saved
    error_scale_factor (float): Scale factor for error bars
    """
    
    # Load data
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} rows from CSV")
    
    # Show all column names to help debug
    print(f"\nAll columns in CSV:")
    for i, col in enumerate(df.columns):
        print(f"  {i}: '{col}'")
    
    # Check for raw data columns that might contain error information
    raw_columns = [col for col in df.columns if 'raw' in col.lower()]
    print(f"\nFound raw data columns: {raw_columns}")
    
    # Parse raw data columns to extract values and errors
    if 'dG_star_raw' in df.columns:
        print("Parsing dG_star_raw column...")
        df[['dg_star_parsed', 'dg_star_stderr']] = df['dG_star_raw'].apply(
            lambda x: pd.Series(parse_value_with_error(x))
        )
    
    if 'dG0_raw' in df.columns:
        print("\nParsing dG0_raw column...")
        df[['dg0_parsed', 'dg0_stderr']] = df['dG0_raw'].apply(
            lambda x: pd.Series(parse_value_with_error(x))
        )
    
    # Standardize column names - now prioritize parsed values
    column_mapping = {
        'Mean_dG_star': 'dg_star_mean',
        'Mean_dG0': 'dg0_mean', 
        'Mutation': 'mutation',
        'mean_dg_star': 'dg_star_mean',
        'mean_dg0': 'dg0_mean',
    }
    
    for old_name, new_name in column_mapping.items():
        if old_name in df.columns:
            print(f"Renaming: '{old_name}' -> '{new_name}'")
            df = df.rename(columns={old_name: new_name})
    
    # Decide which values to use - prefer parsed values from raw data
    if 'dg_star_parsed' in df.columns and not df['dg_star_parsed'].isna().all():
        df['dg_star'] = df['dg_star_parsed']
        print("Using parsed dG_star values from raw data")
    elif 'dg_star_mean' in df.columns:
        df['dg_star'] = df['dg_star_mean']
        print("Using mean dG_star values")
    else:
        print("ERROR: No dG_star column found!")
        return None, None, None
    
    if 'dg0_parsed' in df.columns and not df['dg0_parsed'].isna().all():
        df['dg0'] = df['dg0_parsed']
        print("Using parsed dG0 values from raw data")
    elif 'dg0_mean' in df.columns:
        df['dg0'] = df['dg0_mean']
        print("Using mean dG0 values")
    else:
        print("ERROR: No dG0 column found!")
        return None, None, None
    
    # Check for error columns
    has_dg_star_error = 'dg_star_stderr' in df.columns and not df['dg_star_stderr'].isna().all()
    has_dg0_error = 'dg0_stderr' in df.columns and not df['dg0_stderr'].isna().all()
    
    print(f"\nError bar availability:")
    print(f"  ΔG‡ standard error: {'Available' if has_dg_star_error else 'Not found'}")
    print(f"  ΔG° standard error: {'Available' if has_dg0_error else 'Not found'}")
    
    # Extract level numbers
    df['Level_Numeric'] = df['Level'].apply(extract_level_number)
    
    # Clean data - include error columns if they exist
    required_cols = ['dg_star', 'dg0', 'mutation', 'Level_Numeric']
    df_clean = df.dropna(subset=required_cols).copy()
    
    # Handle missing error values by setting them to 0
    if has_dg_star_error:
        df_clean['dg_star_stderr'] = df_clean['dg_star_stderr'].fillna(0)
    else:
        df_clean['dg_star_stderr'] = 0
        
    if has_dg0_error:
        df_clean['dg0_stderr'] = df_clean['dg0_stderr'].fillna(0)
    else:
        df_clean['dg0_stderr'] = 0
    
    # Apply error scaling factor
    if has_dg_star_error:
        df_clean['dg_star_stderr_scaled'] = df_clean['dg_star_stderr'] * error_scale_factor
    else:
        df_clean['dg_star_stderr_scaled'] = 0
        
    # No horizontal error bars - only vertical
    df_clean['dg0_stderr_scaled'] = 0
    
    print(f"After cleaning: {len(df_clean)} rows remaining")
    
    # Get unique levels that actually exist
    unique_levels = sorted(df_clean['Level_Numeric'].unique())
    print(f"Numeric levels found: {unique_levels}")
    
    # Define distinct colors for levels
    level_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', 
                   '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    
    n_levels = len(unique_levels)
    if n_levels == 1:
        level_alphas = [0.7]
        level_sizes = [100]
        level_colors_mapped = [level_colors[0]]
    else:
        level_alphas = np.linspace(0.6, 0.9, n_levels)
        level_sizes = np.linspace(80, 120, n_levels)
        level_colors_mapped = [level_colors[i % len(level_colors)] for i in range(n_levels)]
    
    # Create level to visual mapping
    level_visual_map = {}
    for i, level in enumerate(unique_levels):
        level_visual_map[level] = {
            'alpha': level_alphas[i],
            'size': level_sizes[i],
            'color': level_colors_mapped[i]
        }
    
    # Calculate SINGLE R² for all data (merged)
    slope, intercept, r_value, p_value, std_err = linregress(df_clean['dg0'], df_clean['dg_star'])
    r_squared = r_value**2
    
    # Create merged subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8), facecolor='white')
    
    # Plot 1: With error bars
    for level in unique_levels:
        level_data = df_clean[df_clean['Level_Numeric'] == level]
        
        # Check if we have meaningful error values
        has_meaningful_dg_star_err = (level_data['dg_star_stderr_scaled'] > 0).any()
        
        # Plot error bars first (behind points)
        if has_meaningful_dg_star_err:
            yerr = level_data['dg_star_stderr_scaled']
            xerr = None  # No horizontal error bars
            
            ax1.errorbar(level_data['dg0'], level_data['dg_star'], 
                       xerr=xerr, yerr=yerr,
                       fmt='none',
                       ecolor=level_visual_map[level]['color'], 
                       alpha=0.5,
                       capsize=2,
                       capthick=1,
                       elinewidth=1,
                       zorder=2)
        
        # Plot the actual data points on top
        ax1.scatter(level_data['dg0'], level_data['dg_star'], 
                   c=level_visual_map[level]['color'], 
                   s=level_visual_map[level]['size'],
                   alpha=level_visual_map[level]['alpha'],
                   marker='o',
                   edgecolors='black', linewidth=0.8,
                   label=f'Level {level}',
                   zorder=5)
    
    # Plot trend line for error bar plot
    x_range = np.linspace(df_clean['dg0'].min(), df_clean['dg0'].max(), 100)
    y_trend = slope * x_range + intercept
    ax1.plot(x_range, y_trend, 'r-', linewidth=2.5, alpha=0.9, zorder=3)
    
    # Add R² to error bar plot
    ax1.text(0.05, 0.95, f'R² = {r_squared:.3f}', transform=ax1.transAxes, 
            verticalalignment='top', fontsize=14, 
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))
    
    ax1.set_xlabel('ΔG° (kcal/mol)', fontweight='bold', fontsize=16)
    ax1.set_ylabel('ΔG‡ (kcal/mol)', fontweight='bold', fontsize=16)
    ax1.set_title('LFER Plot with Error Bars', fontweight='bold', fontsize=18)
    ax1.grid(True, alpha=0.3)
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    ax1.tick_params(axis='both', which='major', labelsize=14)
    
    # Plot 2: Without error bars (simple scatter)
    for level in unique_levels:
        level_data = df_clean[df_clean['Level_Numeric'] == level]
        
        ax2.scatter(level_data['dg0'], level_data['dg_star'], 
                   c=level_visual_map[level]['color'], 
                   s=level_visual_map[level]['size'],
                   alpha=level_visual_map[level]['alpha'],
                   marker='o',
                   edgecolors='black', linewidth=0.8,
                   label=f'Level {level}',
                   zorder=5)
    
    # Plot the SAME trend line for scatter plot (same R²)
    ax2.plot(x_range, y_trend, 'r-', linewidth=2.5, alpha=0.9, zorder=3)
    
    # Add the SAME R² to scatter plot
    ax2.text(0.05, 0.95, f'R² = {r_squared:.3f}', transform=ax2.transAxes, 
            verticalalignment='top', fontsize=14, 
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))
    
    ax2.set_xlabel('ΔG° (kcal/mol)', fontweight='bold', fontsize=16)
    ax2.set_ylabel('ΔG‡ (kcal/mol)', fontweight='bold', fontsize=16)
    ax2.set_title('LFER Plot (Scatter Only)', fontweight='bold', fontsize=18)
    ax2.grid(True, alpha=0.3)
    ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    ax2.tick_params(axis='both', which='major', labelsize=14)
    
    # Add overall title with single R²
    plt.suptitle(f'Linear Free Energy Relationship - Combined R² = {r_squared:.3f}', 
                fontsize=20, fontweight='bold', y=0.98)
    
    plt.tight_layout()
    
    # Save the plots if save_paths are provided
    if save_paths:
        for save_path in save_paths:
            # Create directory if it doesn't exist
            os.makedirs(save_path, exist_ok=True)
            
            # Define filenames
            png_filename = "merged_LFER_plots.png"
            pdf_filename = "merged_LFER_plots.pdf"
            
            full_png_path = os.path.join(save_path, png_filename)
            full_pdf_path = os.path.join(save_path, pdf_filename)
            
            # Save with high quality
            fig.savefig(full_png_path, dpi=300, bbox_inches='tight', 
                       facecolor='white', edgecolor='none')
            fig.savefig(full_pdf_path, dpi=300, bbox_inches='tight', 
                       facecolor='white', edgecolor='none')
            
            print(f"Merged plots saved to: {full_png_path}")
            print(f"PDF saved to: {full_pdf_path}")
    
    plt.show()
    
    # Print comprehensive summary
    print(f"\n--- MERGED PLOTS SUMMARY ---")
    print(f"Single R² value calculated from all data: {r_squared:.3f}")
    print(f"Levels in your data: {unique_levels}")
    print(f"Total data points: {len(df_clean)}")
    print(f"Linear regression equation: ΔG‡ = {slope:.3f} × ΔG° + {intercept:.3f}")
    print(f"p-value: {p_value:.6f}")
    
    # Error bar summary
    print(f"\n--- ERROR BAR CONFIGURATION ---")
    print(f"Left plot: WITH error bars (vertical only)")
    print(f"Right plot: WITHOUT error bars (simple scatter)")
    print(f"Both plots share the SAME R² value: {r_squared:.3f}")
    
    return fig, (ax1, ax2), df_clean

# Example usage with merged plots:
if __name__ == "__main__":
    csv_path = "/home/hp/results/MOUSE/distance_analysis_data.csv"
    save_paths = [
        "/home/hp/nayanika/github/GPX6/figures",
        "/home/hp/nayanika/github/Article-GPX6-EVB/Figures"
    ]
    
    # Create merged plots with single R²
    print("Creating merged LFER plots with single R²...")
    fig, axes, data = create_merged_lfer_plots(csv_path, save_paths=save_paths, 
                                              error_scale_factor=0.3)

Creating merged LFER plots with single R²...


FileNotFoundError: [Errno 2] No such file or directory: '/home/hp/results/MOUSE/distance_analysis_data.csv'