In [3]:
# Text File Data Extraction to Pandas DataFrame

import pandas as pd
import re
import os
from pathlib import Path

# Define extraction patterns and column mappings
EXTRACTION_PATTERNS = [
    {
        'pattern': r"Target Class: \['([^']+)'\]",
        'column': 'Tumor type',
        'description': 'Extract tumor type from Target Class'
    },
    {
        'pattern': r"Best parameters: \{[^}]*'C': ([^,}]+)",
        'column': 'C',
        'description': 'Extract C parameter from Best parameters',
        'convert_type': float
    },
    {
        'pattern': r"Best parameters: \{[^}]*'l1_ratio': ([^,}]+)",
        'column': 'l1_ratio',
        'description': 'Extract l1_ratio from Best parameters',
        'convert_type': float
    },
    {
        'pattern': r"Number of significant proteins: (\d+)",
        'column': 'Proteins selected',
        'description': 'Extract number of significant proteins',
        'convert_type': int
    },
    {
        'pattern': r"Selected hyperparameter C: ([^\s;]+)",
        'column': 'Ridge HP',
        'description': 'Extract Ridge hyperparameter C',
        'convert_type': float
    },
    {
        'pattern': r"Train MCC Score: ([\d\.]+ ± [\d\.]+)",
        'column': 'MCC Train',
        'description': 'Extract Train MCC Score with standard deviation'
    },
    {
        'pattern': r"MCC test: ([\d\.]+)",
        'column': 'MCC Test',
        'description': 'Extract MCC test score',
        'convert_type': float
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'TN',
        'description': 'Extract True Negatives from confusion matrix',
        'convert_type': int,
        'group': 1  # First capture group
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'FP',
        'description': 'Extract False Positives from confusion matrix',
        'convert_type': int,
        'group': 2  # Second capture group
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'FN',
        'description': 'Extract False Negatives from confusion matrix',
        'convert_type': int,
        'group': 3  # Third capture group
    },
    {
        'pattern': r"Confusion Matrix:\s+TN \| FP\s+\[\[(\d+)\s+(\d+)\]\s+\[\s*(\d+)\s+(\d+)\]\]\s+FN \| TP",
        'column': 'TP',
        'description': 'Extract True Positives from confusion matrix',
        'convert_type': int,
        'group': 4  # Fourth capture group
    }
]

def extract_data_from_text(text, patterns):
    """
    Extract data from text using defined patterns
    
    Args:
        text (str): Input text to process
        patterns (list): List of pattern dictionaries
    
    Returns:
        dict: Extracted data with column names as keys
    """
    extracted_data = {}
    
    for pattern_info in patterns:
        pattern = pattern_info['pattern']
        column = pattern_info['column']
        convert_type = pattern_info.get('convert_type', str)
        
        # In the extract_data_from_text function, modify the matching section:
        match = re.search(pattern, text)
        if match:
            group_num = pattern_info.get('group', 1)  # Default to group 1
            value = match.group(group_num).strip()
            
            # Convert type if specified
            if convert_type != str:
                try:
                    value = convert_type(value)
                except (ValueError, TypeError):
                    print(f"Warning: Could not convert '{value}' to {convert_type} for column '{column}'")
            
            extracted_data[column] = value
        else:
            extracted_data[column] = None
            print(f"Warning: Pattern for '{column}' not found in text")
    
    return extracted_data

def process_single_file(file_path, patterns):
    """
    Process a single text file and extract data
    
    Args:
        file_path (str): Path to the text file
        patterns (list): List of extraction patterns
    
    Returns:
        dict: Extracted data from the file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        print(f"Processing file: {file_path}")
        extracted_data = extract_data_from_text(text, patterns)
        
        # Add filename for reference
        extracted_data['Source_File'] = os.path.basename(file_path)
        
        return extracted_data
    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found")
        return None
    except Exception as e:
        print(f"Error processing file '{file_path}': {str(e)}")
        return None

def process_multiple_files(file_paths, patterns):
    """
    Process multiple text files and combine into DataFrame
    
    Args:
        file_paths (list): List of file paths to process
        patterns (list): List of extraction patterns
    
    Returns:
        pandas.DataFrame: Combined data from all files
    """
    all_data = []
    
    for file_path in file_paths:
        file_data = process_single_file(file_path, patterns)
        if file_data:
            all_data.append(file_data)
    
    if all_data:
        df = pd.DataFrame(all_data)
        return df
    else:
        print("No data extracted from any files")
        return pd.DataFrame()

# Example usage:

# Method 1: Process files from a directory
def process_files_from_directory(directory_path, file_extension='.txt'):
    """
    Process all files with specified extension from a directory
    """
    directory = Path(directory_path)
    if not directory.exists():
        print(f"Directory '{directory_path}' does not exist")
        return pd.DataFrame()
    
    file_paths = list(directory.glob(f'*{file_extension}'))
    if not file_paths:
        print(f"No {file_extension} files found in '{directory_path}'")
        return pd.DataFrame()
    
    print(f"Found {len(file_paths)} {file_extension} files")
    return process_multiple_files(file_paths, EXTRACTION_PATTERNS)

# Method 2: Process specific files
def process_specific_files(file_list):
    """
    Process a specific list of files
    """
    return process_multiple_files(file_list, EXTRACTION_PATTERNS)


In [4]:

# Example: Process all txt files in current directory
print("=== Processing files from current directory ===")
df_from_directory = process_files_from_directory('.', '.txt')

if not df_from_directory.empty:
    print("\nDataFrame created successfully!")
    print(f"Shape: {df_from_directory.shape}")
    print("\nColumn names:")
    print(df_from_directory.columns.tolist())
    print("\nFirst few rows:")
    print(df_from_directory.head())
    
    # Display data types
    print("\nData types:")
    print(df_from_directory.dtypes)
    
    # Check for missing values
    print("\nMissing values:")
    print(df_from_directory.isnull().sum())
else:
    print("No DataFrame created - no files processed successfully")

# Example: Process specific files (uncomment and modify as needed)
"""
specific_files = [
    'file1.txt',
    'file2.txt', 
    'file3.txt'
]
df_specific = process_specific_files(specific_files)
"""

# Save to CSV (optional)
if not df_from_directory.empty:
    output_filename = 'extracted_data.csv'
    df_from_directory.to_csv(output_filename, index=False)
    print(f"\nData saved to '{output_filename}'")

# Display final DataFrame
print("\n=== Final DataFrame ===")
if not df_from_directory.empty:
    display(df_from_directory)
else:
    print("No data to display")

# Function to test patterns on sample text
def test_patterns_on_sample():
    """
    Test the extraction patterns on sample text
    """
    sample_text = """
    Target Class: ['ACYC']
    Best parameters: {'C': 2, 'l1_ratio': 0.5}
    Number of significant proteins: 20
    Selected hyperparameter C: 10
    Train MCC Score: 0.7104 ± 0.0940
    MCC test: 0.591327405378887
    """
    
    print("=== Testing patterns on sample text ===")
    print("Sample text:")
    print(sample_text)
    print("\nExtracted data:")
    
    extracted = extract_data_from_text(sample_text, EXTRACTION_PATTERNS)
    for key, value in extracted.items():
        print(f"{key}: {value} (type: {type(value).__name__})")

# Uncomment to test patterns
# test_patterns_on_sample()

=== Processing files from current directory ===
Found 26 .txt files
Processing file: classifier_output_20250827_233030.txt
Processing file: classifier_output_20250825_054832.txt
Processing file: classifier_output_20250827_093352.txt
Processing file: classifier_output_20250825_141316.txt
Processing file: classifier_output_20250827_122602.txt
Processing file: classifier_output_20250825_012355.txt
Processing file: classifier_output_20250827_200430.txt
Processing file: classifier_output_20250825_100824.txt
Processing file: classifier_output_20250824_184947.txt
Processing file: classifier_output_20250825_020729.txt
Processing file: classifier_output_20250827_230051.txt
Processing file: classifier_output_20250825_081559.txt
Processing file: classifier_output_20250824_223632.txt
Processing file: classifier_output_20250824_221205.txt
Processing file: classifier_output_20250826_165607.txt
Processing file: ttc_pip_requirements.txt
Processing file: classifier_output_20250828_023405.txt
Processing

Unnamed: 0,Tumor type,C,l1_ratio,Proteins selected,Ridge HP,MCC Train,MCC Test,TN,FP,FN,TP,Source_File
0,SFT,2.0,0.3,118.0,0.1,0.9461 ± 0.1270,0.721299,438.0,13.0,0.0,15.0,classifier_output_20250827_233030.txt
1,CHDM,1.0,0.3,48.0,0.1,0.9630 ± 0.0200,0.92209,437.0,3.0,1.0,25.0,classifier_output_20250825_054832.txt
2,ACYC,2.0,0.5,20.0,10.0,0.7104 ± 0.0940,0.591327,426.0,22.0,2.0,16.0,classifier_output_20250827_093352.txt
3,DSRCT,5.0,0.3,339.0,0.1,1.0000 ± 0.0000,1.0,459.0,0.0,0.0,7.0,classifier_output_20250825_141316.txt
4,COAD,2.0,0.5,26.0,10.0,0.3378 ± 0.1566,0.243284,354.0,30.0,1.0,3.0,classifier_output_20250827_122602.txt
5,SDCA,5.0,0.3,208.0,1.0,0.5117 ± 0.2109,0.4408,381.0,2.0,3.0,2.0,classifier_output_20250825_012355.txt
6,SYNS,3.0,0.3,265.0,1.0,0.9463 ± 0.0458,0.976917,443.0,0.0,1.0,22.0,classifier_output_20250827_200430.txt
7,PAAD,5.0,0.3,213.0,1.0,0.3866 ± 0.1988,0.46681,459.0,1.0,4.0,2.0,classifier_output_20250825_100824.txt
8,ARMS,5.0,0.3,183.0,1.0,0.9424 ± 0.0457,0.877439,452.0,1.0,2.0,11.0,classifier_output_20250824_184947.txt
9,ACC,5.0,0.5,224.0,0.1,0.9397 ± 0.0466,0.917842,453.0,2.0,0.0,11.0,classifier_output_20250825_020729.txt
