In [15]:
import pandas as pd
import re
import os

def parse_targeted_files_corrected():
    """Corrected parser with accurate metric extraction"""
    all_results = []
    
    # Look for files with your naming pattern
    for filename in os.listdir('../results_txt'):
        if any(x in filename for x in ['n=', 'pi1=', 'pi2=']) or filename.endswith('.log'):
            try:
                with open(filename, 'r') as f:
                    content = f.read()
                
                # Extract parameters
                n = re.search(r'n=(\d+)', content)
                pi1 = re.search(r'pi1=([\d.]+)', content)
                pi2 = re.search(r'pi2=([\d.]+)', content)
                
                if n and pi1 and pi2:
                    unique_id = f"n={n.group(1)}_pi1={pi1.group(1)}_pi2={pi2.group(1)}"
                    
                    # Parse metrics for each model
                    models_data = parse_all_models_corrected(content, unique_id, n.group(1), pi1.group(1), pi2.group(1), filename)
                    all_results.extend(models_data)
                    
                    print(f"Processed: {filename}")
                    
            except Exception as e:
                print(f"Error with {filename}: {e}")
    
    # Create and save summary
    if all_results:
        df = pd.DataFrame(all_results)
        
        # Reorder columns for better readability
        column_order = ['unique_id', 'n', 'pi1', 'pi2', 'model', 'file', 
                       'rate_MSE', 'rate_MAE', 'rate_R2',
                       'count_MSE', 'count_MAE', 'count_R2', 'count_Accuracy']
        
        # Only include columns that actually exist in the data
        final_columns = [col for col in column_order if col in df.columns]
        df = df[final_columns]
        
        df.to_csv('summary.csv', index=False)
        print(f"\nSaved summary with {len(all_results)} model entries to summary.csv")
        
        # Display sample to verify no duplicates
        print("\nSample verification (showing first 2 experiments):")
        sample_df = df[df['unique_id'].isin(df['unique_id'].unique()[:2])]
        print(sample_df.to_string(index=False))
    else:
        print("No files processed!")

def parse_all_models_corrected(content, unique_id, n, pi1, pi2, filename):
    """Parse metrics for all models with corrected parsing"""
    models_data = []
    
    # Parse ZKIPModel_EM
    zkip_data = parse_zkp_model_corrected(content, unique_id, n, pi1, pi2, filename)
    if zkip_data:
        models_data.append(zkip_data)
    
    # Parse ZkICMP
    zkicmp_data = parse_zkicmp_model_corrected(content, unique_id, n, pi1, pi2, filename)
    if zkicmp_data:
        models_data.append(zkicmp_data)
    
    # Parse HistGradientBoostingRegressor
    hgb_data = parse_hgb_model_corrected(content, unique_id, n, pi1, pi2, filename)
    if hgb_data:
        models_data.append(hgb_data)
    
    # Parse PoissonRandomForestRegressor
    prf_data = parse_prf_model_corrected(content, unique_id, n, pi1, pi2, filename)
    if prf_data:
        models_data.append(prf_data)
    
    return models_data

def parse_zkp_model_corrected(content, unique_id, n, pi1, pi2, filename):
    """Parse ZKIPModel_EM metrics with corrected parsing"""
    if 'ZKIPModel_EM' not in content:
        return None
    
    model_data = {
        'unique_id': unique_id,
        'n': n,
        'pi1': pi1,
        'pi2': pi2,
        'model': 'ZKIPModel_EM',
        'file': filename
    }
    
    # Extract ZKIPModel_EM section
    zkip_section = re.search(r'ZKIPModel_EM.*?------------------------------(.*?)(?=------------------------------|$)', content, re.DOTALL)
    if zkip_section:
        section_content = zkip_section.group(1)
        
        # Expected value predictions
        expected_section = re.search(r'Expected value prediction:(.*?)Mode prediction:', section_content, re.DOTALL)
        if expected_section:
            expected_text = expected_section.group(1)
            model_data['rate_MSE'] = extract_metric_specific(expected_text, 'MSE')
            model_data['rate_MAE'] = extract_metric_specific(expected_text, 'MAE')
            model_data['rate_R2'] = extract_metric_specific(expected_text, 'r\^2')
        
        # Mode predictions
        mode_section = re.search(r'Mode prediction:(.*?)(?=Performance metrics:|$)', section_content, re.DOTALL)
        if mode_section:
            mode_text = mode_section.group(1)
            model_data['count_MSE'] = extract_metric_specific(mode_text, 'MSE')
            model_data['count_MAE'] = extract_metric_specific(mode_text, 'MAE')
            model_data['count_R2'] = extract_metric_specific(mode_text, 'r\^2')
            model_data['count_Accuracy'] = extract_metric_specific(mode_text, 'Accuracy')
    
    return model_data

def parse_zkicmp_model_corrected(content, unique_id, n, pi1, pi2, filename):
    """Parse ZkICMP metrics with corrected parsing"""
    if 'ZkICMP' not in content:
        return None
    
    model_data = {
        'unique_id': unique_id,
        'n': n,
        'pi1': pi1,
        'pi2': pi2,
        'model': 'ZkICMP',
        'file': filename
    }
    
    # Extract ZkICMP section
    zkicmp_section = re.search(r'ZkICMP.*?------------------------------(.*?)(?=------------------------------|$)', content, re.DOTALL)
    if zkicmp_section:
        section_content = zkicmp_section.group(1)
        
        # Rate predictions
        rate_match = re.search(r'Rate predictions.*?MSE:\s*([\d.-]+).*?MAE:\s*([\d.-]+).*?r\^2:\s*([\d.-]+)', section_content, re.DOTALL)
        if rate_match:
            model_data['rate_MSE'] = float(rate_match.group(1))
            model_data['rate_MAE'] = float(rate_match.group(2))
            model_data['rate_R2'] = float(rate_match.group(3))
        
        # Count predictions
        count_match = re.search(r'Count predictions:.*?MSE:\s*([\d.-]+).*?MAE:\s*([\d.-]+).*?r\^2:\s*([\d.-]+).*?Accuracy:\s*([\d.-]+)', section_content, re.DOTALL)
        if count_match:
            model_data['count_MSE'] = float(count_match.group(1))
            model_data['count_MAE'] = float(count_match.group(2))
            model_data['count_R2'] = float(count_match.group(3))
            model_data['count_Accuracy'] = float(count_match.group(4))
    
    return model_data

def parse_hgb_model_corrected(content, unique_id, n, pi1, pi2, filename):
    """Parse HistGradientBoostingRegressor metrics with corrected parsing"""
    if 'HistGradientBoostingRegressor' not in content:
        return None
    
    model_data = {
        'unique_id': unique_id,
        'n': n,
        'pi1': pi1,
        'pi2': pi2,
        'model': 'HistGradientBoostingRegressor',
        'file': filename
    }
    
    # Extract HistGradientBoostingRegressor section
    hgb_section = re.search(r'HistGradientBoostingRegressor.*?------------------------------(.*?)(?=------------------------------|$)', content, re.DOTALL)
    if hgb_section:
        section_content = hgb_section.group(1)
        
        # Rate predictions
        rate_match = re.search(r'Rate predictions.*?MSE:\s*([\d.-]+).*?MAE:\s*([\d.-]+).*?R²:\s*([\d.-]+)', section_content, re.DOTALL)
        if rate_match:
            model_data['rate_MSE'] = float(rate_match.group(1))
            model_data['rate_MAE'] = float(rate_match.group(2))
            model_data['rate_R2'] = float(rate_match.group(3))
        
        # Count predictions
        count_match = re.search(r'Count predictions.*?MSE:\s*([\d.-]+).*?MAE:\s*([\d.-]+).*?R²:\s*([\d.-]+).*?Accuracy:\s*([\d.-]+)', section_content, re.DOTALL)
        if count_match:
            model_data['count_MSE'] = float(count_match.group(1))
            model_data['count_MAE'] = float(count_match.group(2))
            model_data['count_R2'] = float(count_match.group(3))
            model_data['count_Accuracy'] = float(count_match.group(4))
    
    return model_data

def parse_prf_model_corrected(content, unique_id, n, pi1, pi2, filename):
    """Parse PoissonRandomForestRegressor metrics with corrected parsing"""
    if 'PoissonRandomForestRegressor' not in content:
        return None
    
    model_data = {
        'unique_id': unique_id,
        'n': n,
        'pi1': pi1,
        'pi2': pi2,
        'model': 'PoissonRandomForestRegressor',
        'file': filename
    }
    
    # Extract PoissonRandomForestRegressor section
    prf_section = re.search(r'PoissonRandomForestRegressor.*?------------------------------(.*?)(?=------------------------------|$)', content, re.DOTALL)
    if prf_section:
        section_content = prf_section.group(1)
        
        # Rate predictions
        rate_match = re.search(r'Rate predictions.*?MSE:\s*([\d.-]+).*?MAE:\s*([\d.-]+).*?R²:\s*([\d.-]+)', section_content, re.DOTALL)
        if rate_match:
            model_data['rate_MSE'] = float(rate_match.group(1))
            model_data['rate_MAE'] = float(rate_match.group(2))
            model_data['rate_R2'] = float(rate_match.group(3))
        
        # Count predictions
        count_match = re.search(r'Count predictions.*?MSE:\s*([\d.-]+).*?MAE:\s*([\d.-]+).*?R²:\s*([\d.-]+).*?Accuracy:\s*([\d.-]+)', section_content, re.DOTALL)
        if count_match:
            model_data['count_MSE'] = float(count_match.group(1))
            model_data['count_MAE'] = float(count_match.group(2))
            model_data['count_R2'] = float(count_match.group(3))
            model_data['count_Accuracy'] = float(count_match.group(4))
    
    return model_data

def extract_metric_specific(text, metric_name):
    """Extract a specific metric from text with better precision"""
    pattern = rf'{metric_name}:\s*([\d.-]+)'
    match = re.search(pattern, text)
    return float(match.group(1)) if match else None

if __name__ == "__main__":
    parse_targeted_files_corrected()

FileNotFoundError: [Errno 2] No such file or directory: '../results_txt'