In [1]:
# Text File Data Extraction to Pandas DataFrame

import pandas as pd
import re
import os
from pathlib import Path

# Define extraction patterns and column mappings
EXTRACTION_PATTERNS = [
    {
        'pattern': r"Target Class:\s*\[(.*?)\]",
        'column': 'Tumor type',
        'description': 'Extract tumor type from Target Class'
    },
    {
        'pattern': r"Best parameters: \{[^}]*'C': ([^,}]+)",
        'column': 'C',
        'description': 'Extract C parameter from Best parameters',
        'convert_type': float
    },
    {
        'pattern': r"Best parameters: \{[^}]*'l1_ratio': ([^,}]+)",
        'column': 'l1_ratio',
        'description': 'Extract l1_ratio from Best parameters',
        'convert_type': float
    },
    {
        'pattern': r"Number of significant proteins: (\d+)",
        'column': 'Proteins selected',
        'description': 'Extract number of significant proteins',
        'convert_type': int
    },
    {
        'pattern': r"Selected hyperparameter C: ([^\s;]+)",
        'column': 'Ridge HP',
        'description': 'Extract Ridge hyperparameter C',
        'convert_type': float
    },
    {
        'pattern': r"Train MCC Score: ([\d\.]+ ± [\d\.]+)",
        'column': 'MCC Train',
        'description': 'Extract Train MCC Score with standard deviation'
    },
    {
        'pattern': r"MCC test: ([\d\.]+)",
        'column': 'MCC Test',
        'description': 'Extract MCC test score',
        'convert_type': float
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'TN',
        'description': 'Extract True Negatives from confusion matrix',
        'convert_type': int,
        'group': 1  # First capture group
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'FP',
        'description': 'Extract False Positives from confusion matrix',
        'convert_type': int,
        'group': 2  # Second capture group
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'FN',
        'description': 'Extract False Negatives from confusion matrix',
        'convert_type': int,
        'group': 3  # Third capture group
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'TP',
        'description': 'Extract True Positives from confusion matrix',
        'convert_type': int,
        'group': 4  # Fourth capture group
    }
]

def extract_data_from_text(text, patterns):
    """
    Extract data from text using defined patterns
    
    Args:
        text (str): Input text to process
        patterns (list): List of pattern dictionaries
    
    Returns:
        dict: Extracted data with column names as keys
    """
    extracted_data = {}
    
    for pattern_info in patterns:
        pattern = pattern_info['pattern']
        column = pattern_info['column']
        convert_type = pattern_info.get('convert_type', str)
        
        # In the extract_data_from_text function, modify the matching section:
        match = re.search(pattern, text)
        if match:
            group_num = pattern_info.get('group', 1)  # Default to group 1
            value = match.group(group_num).strip()
            
            # Convert type if specified
            if convert_type != str:
                try:
                    value = convert_type(value)
                except (ValueError, TypeError):
                    print(f"Warning: Could not convert '{value}' to {convert_type} for column '{column}'")
            
            extracted_data[column] = value
        else:
            extracted_data[column] = None
            print(f"Warning: Pattern for '{column}' not found in text")
    
    return extracted_data

def process_single_file(file_path, patterns):
    """
    Process a single text file and extract data
    
    Args:
        file_path (str): Path to the text file
        patterns (list): List of extraction patterns
    
    Returns:
        dict: Extracted data from the file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        print(f"Processing file: {file_path}")
        extracted_data = extract_data_from_text(text, patterns)
        
        # Add filename for reference
        extracted_data['Source_File'] = os.path.basename(file_path)
        
        return extracted_data
    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found")
        return None
    except Exception as e:
        print(f"Error processing file '{file_path}': {str(e)}")
        return None

def process_multiple_files(file_paths, patterns):
    """
    Process multiple text files and combine into DataFrame
    
    Args:
        file_paths (list): List of file paths to process
        patterns (list): List of extraction patterns
    
    Returns:
        pandas.DataFrame: Combined data from all files
    """
    all_data = []
    
    for file_path in file_paths:
        file_data = process_single_file(file_path, patterns)
        if file_data:
            all_data.append(file_data)
    
    if all_data:
        df = pd.DataFrame(all_data)
        return df
    else:
        print("No data extracted from any files")
        return pd.DataFrame()

# Example usage:

# Method 1: Process files from a directory
def process_files_from_directory(directory_path, file_extension='.txt'):
    """
    Process all files with specified extension from a directory
    """
    directory = Path(directory_path)
    if not directory.exists():
        print(f"Directory '{directory_path}' does not exist")
        return pd.DataFrame()
    
    file_paths = list(directory.glob(f'*{file_extension}'))
    if not file_paths:
        print(f"No {file_extension} files found in '{directory_path}'")
        return pd.DataFrame()
    
    print(f"Found {len(file_paths)} {file_extension} files")
    return process_multiple_files(file_paths, EXTRACTION_PATTERNS)

# Method 2: Process specific files
def process_specific_files(file_list):
    """
    Process a specific list of files
    """
    return process_multiple_files(file_list, EXTRACTION_PATTERNS)


In [2]:
df_from_directory = process_files_from_directory('.', '.txt')

if not df_from_directory.empty:
    print("\nDataFrame created successfully!")
    print(f"Shape: {df_from_directory.shape}")
    print("\nColumn names:")
    print(df_from_directory.columns.tolist())
    print("\nFirst few rows:")
    print(df_from_directory.head())
    
    # Display data types
    print("\nData types:")
    print(df_from_directory.dtypes)
    
    # Check for missing values
    print("\nMissing values:")
    print(df_from_directory.isnull().sum())
else:
    print("No DataFrame created - no files processed successfully")


# Save to CSV (optional)
if not df_from_directory.empty:
    output_filename = 'extracted_data.csv'
    df_from_directory.to_csv(output_filename, index=False)
    print(f"\nData saved to '{output_filename}'")

# Display final DataFrame
print("\n=== Final DataFrame ===")
if not df_from_directory.empty:
    display(df_from_directory)
else:
    print("No data to display")



Found 78 .txt files
Processing file: classifier_output_20250918_134953.txt
Processing file: classifier_output_20250918_154822.txt
Processing file: classifier_output_20250917_160250.txt
Processing file: classifier_output_20250827_233030.txt
Processing file: classifier_output_20250918_140106.txt
Processing file: classifier_output_20250830_002448.txt
Processing file: classifier_output_20250825_054832.txt
Processing file: classifier_output_20250918_102312.txt
Processing file: classifier_output_20250827_093352.txt
Processing file: classifier_output_20250831_225940.txt
Processing file: classifier_output_20250828_132708.txt
Processing file: classifier_output_20250918_125306.txt
Processing file: classifier_output_20250918_151532.txt
Processing file: classifier_output_20250825_141316.txt
Processing file: classifier_output_20250918_101643.txt
Processing file: classifier_output_20250827_122602.txt
Processing file: classifier_output_20250918_132107.txt
Processing file: classifier_output_20250825_0

Unnamed: 0,Tumor type,C,l1_ratio,Proteins selected,Ridge HP,MCC Train,MCC Test,TN,FP,FN,TP,Source_File
0,"'MEL', 'UM'",,,,1.0,0.6792 ± 0.0962,0.784243,452.0,4.0,1.0,9.0,classifier_output_20250918_134953.txt
1,"'PLEMESO', 'PEMESO'",,,,0.1,0.8783 ± 0.1115,0.893459,461.0,1.0,0.0,4.0,classifier_output_20250918_154822.txt
2,'ACYC',,,,1.0,0.9340 ± 0.0501,0.879690,442.0,5.0,0.0,18.0,classifier_output_20250917_160250.txt
3,'SFT',2.0,0.3,118.0,0.1,0.9461 ± 0.1270,0.721299,438.0,13.0,0.0,15.0,classifier_output_20250827_233030.txt
4,'MFH',,,,1.0,0.3143 ± 0.0664,0.212453,402.0,51.0,6.0,7.0,classifier_output_20250918_140106.txt
...,...,...,...,...,...,...,...,...,...,...,...,...
73,"'SDCA', 'MYEC'",5.0,0.3,229.0,1.0,0.5385 ± 0.1477,,380.0,1.0,7.0,0.0,classifier_output_20250825_054353.txt
74,'MRLS',,,,0.1,0.9279 ± 0.0716,0.879994,457.0,2.0,0.0,7.0,classifier_output_20250918_144222.txt
75,'DDLS',,,,0.1,0.9499 ± 0.0562,0.924813,459.0,1.0,0.0,6.0,classifier_output_20250918_115515.txt
76,'PANET',,,,1.0,0.4668 ± 0.1322,0.379045,438.0,22.0,1.0,5.0,classifier_output_20250918_153157.txt
