In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed
trait = "Coronary_artery_disease"
cohort = "GSE86216"

# Input locations: the trait-level directory and the cohort directory inside it
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output locations: linked data, gene data, clinical data, and the shared
# cohort-info JSON all live under the same per-trait output directory
_out_root = f"../../output/preprocess/{trait}"
out_data_file = f"{_out_root}/{cohort}.csv"
out_gene_data_file = f"{_out_root}/gene_data/{cohort}.csv"
out_clinical_data_file = f"{_out_root}/clinical_data/{cohort}.csv"
json_path = f"{_out_root}/cohort_info.json"


### Step 1: Initial Data Loading

In [None]:
from tools.preprocess import *
# Resolve the SOFT file and the series matrix file for this cohort
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes used to pull series-level background text and the
# per-sample clinical rows out of the matrix file
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical frame as the unique values found in each row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces so the trait/age/gender rows can be chosen by inspection
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
import os
import json
import pandas as pd
from typing import Optional, Callable, Dict, Any

# 1. Gene Expression Data Availability
# The background describes transcriptomic profiling of PBMCs and the sample
# characteristics list "cell type: PBMC", so expression data is taken as present.
is_gene_available = True

# 2. Variable Availability
# 2.1 Trait: the sample characteristics carry no explicit disease variable.
# Row 1 ("treatment") is used as a proxy instead, because every patient in the
# study has multivessel CAD and is on statin treatment.
trait_row = 1

# Neither age nor gender appears in the sample characteristics.
age_row = gender_row = None

# 2.2 Data Type Conversion Functions
def convert_trait(value):
    """
    Convert a "treatment: ..." sample characteristic to a binary trait value.

    Args:
        value: Raw cell content, expected to look like "<field>: <label>".

    Returns:
        0 when the label contains "control", 1 when it contains
        "rosuvastatin", and None for anything else (including cells that are
        not strings or that have no "<field>:" prefix).
    """
    # Missing cells arrive as None/NaN; `":" in value` would raise TypeError
    # on them, so treat every non-string as "no label".
    if not isinstance(value, str) or ":" not in value:
        return None

    label = value.split(":", 1)[1].strip().lower()

    if "control" in label:
        return 0
    if "rosuvastatin" in label:
        return 1
    return None

def convert_age(value) -> None:
    """Placeholder converter: age is not recorded for this cohort, so every input maps to None."""
    return None

def convert_gender(value) -> None:
    """Placeholder converter: gender is not recorded for this cohort, so every input maps to None."""
    return None

# 3. Save Metadata
# The trait counts as available exactly when a trait row was selected above.
is_trait_available = trait_row is not None

# Record the initial (non-final) availability flags for this cohort.
validate_and_save_cohort_info(
    is_final=False,
    info_path=json_path,
    cohort=cohort,
    is_trait_available=is_trait_available,
    is_gene_available=is_gene_available,
)

# 4. Clinical Feature Extraction
# Use the per-sample `clinical_data` frame loaded in Step 1. Rebuilding a frame
# from the unique-values dictionary would lose the sample-to-value association
# (its rows would be "i-th distinct value", not samples) and would introduce
# NaN padding cells, so the extracted trait would not describe real samples.
if trait_row is not None:
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Preview the data
    preview = preview_df(selected_clinical_df)
    print("Clinical Data Preview:")
    print(preview)

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)

    # Save clinical data to CSV (index kept: it carries the feature names)
    selected_clinical_df.to_csv(out_clinical_data_file)


### Step 3: Dataset Analysis and Clinical Feature Extraction

In [None]:
import pandas as pd
import numpy as np
import os
import re
import json
import gzip
from typing import Optional, Callable, Dict, Any, List

# Function to parse GEO series matrix file
def parse_geo_matrix(file_path):
    """Parse a gzipped GEO series matrix file into series text and clinical data.

    Args:
        file_path: Path to a gzip-compressed, tab-separated series matrix file.

    Returns:
        A tuple ``(series_text, clinical_data)``:
        - ``series_text``: all lines starting with ``!Series_``, stripped and
          joined with newlines.
        - ``clinical_data``: a DataFrame with one row per
          ``!Sample_characteristics_ch1`` line and one column per sample,
          preceded by a ``characteristic`` column holding ``Row_<i>`` labels.
          Sample columns are named after the ``!Sample_geo_accession`` values
          when that line is present and matches the sample count, otherwise
          ``Sample_1``, ``Sample_2``, ... The frame is empty when the file has
          no characteristics values.
    """
    def _fields(raw_line):
        # Drop the leading tag, then remove the double quotes wrapped around
        # each value. Only the line terminator is stripped from the line so
        # that trailing empty fields keep their position.
        return [cell.strip().strip('"') for cell in raw_line.rstrip('\r\n').split('\t')[1:]]

    series_lines = []
    accessions = []
    characteristic_rows = []

    # Open the gzipped file
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            if line.startswith('!Series_'):
                series_lines.append(line.strip())
            elif line.startswith('!Sample_geo_accession'):
                accessions = _fields(line)
            elif line.startswith('!Sample_characteristics_ch1'):
                characteristic_rows.append(_fields(line))

    clinical_data = pd.DataFrame()
    n_samples = max((len(row) for row in characteristic_rows), default=0)
    if n_samples:
        # Prefer the real sample accessions; fall back to positional names
        # when they are missing or inconsistent with the number of values.
        if len(accessions) == n_samples and len(set(accessions)) == n_samples:
            sample_names = accessions
        else:
            sample_names = [f"Sample_{i + 1}" for i in range(n_samples)]

        # Pad short rows so a ragged file yields missing cells, not an error.
        padded_rows = [row + [None] * (n_samples - len(row)) for row in characteristic_rows]
        clinical_data = pd.DataFrame(padded_rows, columns=sample_names)
        # Add row index as first column
        clinical_data.insert(0, 'characteristic', [f"Row_{i}" for i in range(len(padded_rows))])

    return '\n'.join(series_lines), clinical_data

# Expected location of the series matrix file inside the cohort directory
matrix_file_path = os.path.join(in_cohort_dir, "GSE86216_series_matrix.txt.gz")
print(f"Loading data from: {matrix_file_path}")

# Without the matrix file nothing can be extracted: record both data kinds as
# unavailable and stop here.
if not os.path.exists(matrix_file_path):
    print("Series matrix file not found!")
    is_gene_available = False
    is_trait_available = False
    validate_and_save_cohort_info(
        is_final=False,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=is_gene_available,
        is_trait_available=is_trait_available
    )
    import sys
    sys.exit(0)

# Parse the matrix file and show a short preview of what was read
sample_info, clinical_data = parse_geo_matrix(matrix_file_path)
print("Sample Info Preview:")
print(sample_info[:500])
print("\nClinical Data Shape:", clinical_data.shape)
if not clinical_data.empty:
    print("Clinical Data Preview:")
    print(clinical_data.head())

# 1. Gene expression availability: assumed, since a series matrix file was found
is_gene_available = True

# 2. List the distinct values of every clinical row (column 0 is the row label
# and is skipped) so the trait/age/gender rows can be recognised
if not clinical_data.empty:
    print("\nUnique values in clinical data rows:")
    for row_idx in range(clinical_data.shape[0]):
        unique_values = clinical_data.iloc[row_idx, 1:].unique()
        print(f"Row {row_idx}: Unique values: {unique_values}")

# 2.1 Identify rows for trait, age, and gender
trait_row = None
age_row = None
gender_row = None

# Scan every row; a row is selected when its lower-cased, space-joined values
# contain one of the keywords. A later matching row replaces an earlier one.
# NOTE(review): these are plain substring tests, so e.g. "age" also matches
# words such as "stage" - confirm the selected rows against the printed values.
for row_idx in range(clinical_data.shape[0]):
    row_values = [str(cell) for cell in clinical_data.iloc[row_idx, 1:]]
    row_text = ' '.join(row_values).lower()

    # Trait-related wording (Coronary_artery_disease)
    mentions_trait = any(
        term in row_text
        for term in ["cad", "coronary", "artery", "disease", "case", "control", "diagnosis"]
    )
    if mentions_trait:
        trait_row = row_idx
        print(f"Found trait information in row {row_idx}")

    # Age wording
    if "age" in row_text:
        age_row = row_idx
        print(f"Found age information in row {row_idx}")

    # Gender wording
    mentions_gender = any(term in row_text for term in ["gender", "sex", "male", "female"])
    if mentions_gender:
        gender_row = row_idx
        print(f"Found gender information in row {row_idx}")

# 2.2 Define conversion functions

def extract_value(text):
    """Return the stripped text after the first colon of a string.

    Strings without a colon, and non-string inputs, are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    _, colon, remainder = text.partition(':')
    return remainder.strip() if colon else text

# Whole-word patterns: plain substring tests would read "unknown"/"abnormal"
# as "no" (control) and "non-CAD" as "cad" (case).
_CONTROL_PATTERN = re.compile(r"\b(?:non[-\s]?cad|controls?|no|negative|healthy|normal)\b")
_CASE_PATTERN = re.compile(r"\b(?:cad|coronary artery disease|cases?|yes|positive|patients?)\b")


def convert_trait(value):
    """Convert trait value to binary (0 for control, 1 for case).

    The text after the first colon (or the whole text when there is no colon)
    is lower-cased and matched on whole words.

    Args:
        value: Raw cell content such as "disease state: CAD".

    Returns:
        0 for control wording ("control", "no", "negative", "healthy",
        "normal", "non-CAD"), 1 for case wording ("cad", "coronary artery
        disease", "case", "yes", "positive", "patient"), and None when the
        value is missing or matches neither group.
    """
    if value is None or pd.isna(value):
        return None

    text = str(value)
    _, colon, remainder = text.partition(':')
    label = (remainder.strip() if colon else text).lower()

    # Control wording is tested first so that negated forms such as "non-CAD"
    # or "no CAD" are not captured by the "cad" case keyword.
    if _CONTROL_PATTERN.search(label):
        return 0
    if _CASE_PATTERN.search(label):
        return 1
    # NOTE(review): labels naming only a treatment (e.g. a drug) match neither
    # group and come back as None - confirm this is intended for the cohort.
    return None

def convert_age(value):
    """Convert age value to continuous.

    Args:
        value: Raw cell content such as "age: 54" or "age: 61.5 years".

    Returns:
        The first number found after the first colon (or in the whole text
        when there is no colon) as a float, or None when the value is missing
        or contains no digits.
    """
    if value is None or pd.isna(value):
        return None

    text = str(value)
    _, colon, remainder = text.partition(':')
    text = remainder.strip() if colon else text

    # The pattern only matches digits with an optional decimal part, so the
    # float() conversion needs no exception guard.
    number = re.search(r'\d+(?:\.\d+)?', text)
    return float(number.group(0)) if number else None

# Whole-word patterns: the single letters "f"/"m" must not match inside other
# words (e.g. "not specified", "missing").
_FEMALE_PATTERN = re.compile(r"\b(?:females?|f|wom[ae]n)\b")
_MALE_PATTERN = re.compile(r"\b(?:males?|m|m[ae]n)\b")


def convert_gender(value):
    """Convert gender value to binary (0 for female, 1 for male).

    The text after the first colon (or the whole text when there is no colon)
    is lower-cased and matched on whole words.

    Args:
        value: Raw cell content such as "gender: Female" or "Sex: M".

    Returns:
        0 for female wording, 1 for male wording, and None when the value is
        missing or matches neither.
    """
    if value is None or pd.isna(value):
        return None

    text = str(value)
    _, colon, remainder = text.partition(':')
    label = (remainder.strip() if colon else text).lower()

    if _FEMALE_PATTERN.search(label):
        return 0
    if _MALE_PATTERN.search(label):
        return 1
    return None

# 3. Save metadata - initial filtering
# The trait counts as available exactly when the scan above selected a row.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    info_path=json_path,
    cohort=cohort,
    is_trait_available=is_trait_available,
    is_gene_available=is_gene_available,
)

# 4. Clinical Feature Extraction (only when a trait row was identified)
if trait_row is not None:
    # The 'characteristic' column holds the Row_<i> labels added by
    # parse_geo_matrix, not sample values, so it is removed before extraction.
    # NOTE(review): assumes geo_select_clinical_features treats every remaining
    # column as one sample - confirm against tools.preprocess.
    sample_clinical_df = clinical_data.drop(columns=['characteristic'], errors='ignore')

    # Extract clinical features
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=sample_clinical_df,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age if age_row is not None else None,
        gender_row=gender_row,
        convert_gender=convert_gender if gender_row is not None else None
    )

    # Preview the extracted clinical features
    preview = preview_df(selected_clinical_df)
    print("\nSelected Clinical Features Preview:")
    print(preview)

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)

    # Save with the index, as the other clinical-data writes in this file do;
    # dropping it would lose the row labels of the extracted features.
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Saved clinical data to {out_clinical_data_file}")


### Step 4: Gene Data Extraction

In [None]:
# 1. Resolve the SOFT file and the series matrix file for this cohort
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Start from the assumption that gene data is available; any failure below
# switches the flag off.
is_gene_available = True

# Position of the "!series_matrix_table_begin" line, if the file has one
found_marker = False
marker_row = None
try:
    with gzip.open(matrix_file, 'rt') as file:
        marker_row = next(
            (idx for idx, text in enumerate(file) if "!series_matrix_table_begin" in text),
            None,
        )
    found_marker = marker_row is not None

    if found_marker:
        print(f"Found the matrix table marker at line {marker_row}")
    else:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False

    # With the marker present, extract the expression table via the library
    if is_gene_available:
        try:
            gene_data = get_genetic_data(matrix_file)

            if gene_data.shape[0] > 0:
                print(f"Gene data shape: {gene_data.shape}")
                # Show the first 20 gene/probe identifiers
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
            else:
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
        except Exception as e:
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False

    # On failure, print a window of the file to help diagnose the problem:
    # the lines around the marker when it exists, otherwise the first 10 lines.
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            if marker_row is None:
                first_line, last_line = 0, 9
            else:
                first_line, last_line = marker_row - 2, marker_row + 10
            with gzip.open(matrix_file, 'rt') as file:
                for idx, text in enumerate(file):
                    if idx > last_line:
                        break
                    if idx >= first_line:
                        print(f"Line {idx}: {text.strip()[:100]}...")
        except Exception as e2:
            print(f"Error examining file: {e2}")

except Exception as e:
    print(f"Error processing file: {e}")
    is_gene_available = False

# Update validation information if gene data extraction failed
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    # Keep the trait decision made by the clinical analysis instead of forcing
    # it to False: a gene-extraction failure says nothing about the trait, and
    # a trait row was in fact selected for this cohort.
    is_trait_available = trait_row is not None
    validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path,
                                 is_gene_available=is_gene_available, is_trait_available=is_trait_available)


### Step 5: Gene Identifier Review

In [None]:
# The row identifiers printed in the previous step start with "ILMN_", i.e.
# they are Illumina microarray probe IDs rather than human gene symbols.
# The probe IDs therefore have to be mapped to gene symbols before the
# expression data can be interpreted at gene level.

requires_gene_mapping = True


### Step 6: Gene Annotation

In [None]:
# 1. Load the gene annotation table from the cohort's SOFT file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Show the available columns and a short preview so the identifier and
# gene-symbol columns can be picked out
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=3))

# Inspect the ID / Symbol pair more closely when both columns exist
print("\nExamining ID and Symbol columns format (first 3 rows):")
if all(col in gene_annotation.columns for col in ('ID', 'Symbol')):
    id_column = gene_annotation['ID']
    symbol_column = gene_annotation['Symbol']
    head_rows = range(min(3, len(gene_annotation)))

    for i in head_rows:
        print(f"Row {i}: ID={id_column.iloc[i]}")
        print(f"Symbol: {symbol_column.iloc[i]}")

    # How many rows actually carry a symbol
    non_null_symbols = symbol_column.notna().sum()
    total_rows = len(gene_annotation)
    print(f"\nSymbol column completeness: {non_null_symbols}/{total_rows} rows ({non_null_symbols/total_rows:.2%})")

    # Try the symbol extractor on the first few non-missing entries
    print("\nAttempting to extract gene symbols from the first few rows:")
    for i in head_rows:
        symbol_cell = symbol_column.iloc[i]
        if pd.notna(symbol_cell):
            symbols = extract_human_gene_symbols(str(symbol_cell))
            print(f"Row {i} extracted symbols: {symbols}")


### Step 7: Gene Identifier Mapping

In [None]:
# 1. Reload the file paths and the gene annotation table
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Annotation columns holding the probe identifier and the gene symbol
prob_col = 'ID'
gene_col = 'Symbol'

# Build the probe -> gene mapping from those two columns
mapping_df = get_gene_mapping(gene_annotation, prob_col, gene_col)
print(f"\nGene mapping dataframe shape: {mapping_df.shape}")
print("Mapping preview (first 5 rows):")
print(preview_df(mapping_df, n=5))

# Probe-level expression matrix
gene_data = get_genetic_data(matrix_file)
print(f"Gene expression data shape before mapping: {gene_data.shape}")
print("First 5 probes before mapping:")
print(gene_data.index[:5].tolist())

# 3. Convert probe-level measurements into gene-level expression
print("\nConverting probe-level measurements to gene expression data...")
mapped_gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Gene expression data shape after mapping: {mapped_gene_data.shape}")
print("First 5 genes after mapping:")
print(mapped_gene_data.index[:5].tolist())

# Row counts before and after, as a quick check that the mapping did something
num_probes_before = gene_data.shape[0]
num_genes_after = mapped_gene_data.shape[0]
print(f"\nBefore mapping: {num_probes_before} unique probe IDs")
print(f"After mapping: {num_genes_after} unique gene symbols")

# Persist the gene-level expression matrix
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
mapped_gene_data.to_csv(out_gene_data_file)
print(f"\nSaved gene expression data to {out_gene_data_file}")


### Step 8: Data Normalization and Linking

In [None]:
# 1. Normalize gene symbols in the gene expression data
try:
    # Make sure the directory exists
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)

    # Normalize the gene-level matrix produced by the mapping step
    # (`mapped_gene_data`). `gene_data` still holds the probe-level matrix,
    # whose index contains probe IDs rather than gene symbols.
    print(f"Gene data shape before normalization: {mapped_gene_data.shape}")

    # Apply normalization to gene symbols
    normalized_gene_data = normalize_gene_symbols_in_index(mapped_gene_data)
    print(f"Gene data shape after normalization: {normalized_gene_data.shape}")

    # Save the normalized gene data
    normalized_gene_data.to_csv(out_gene_data_file)
    print(f"Normalized gene data saved to {out_gene_data_file}")

    # From here on `gene_data` refers to the normalized, gene-level matrix
    gene_data = normalized_gene_data
    is_gene_available = True
except Exception as e:
    print(f"Error normalizing gene data: {e}")
    is_gene_available = False

# 2. Load clinical data, reusing the trait/age/gender rows and the converters
# defined in the earlier clinical-analysis steps
is_trait_available = trait_row is not None

if not is_trait_available:
    # No trait row was identified, so there is nothing to extract
    print(f"No trait data ({trait}) available in this dataset based on previous analysis.")
else:
    try:
        # Re-read the per-sample clinical frame from the matrix file
        soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
        background_info, clinical_data = get_background_and_clinical_data(matrix_file)

        # Extract clinical features
        clinical_features = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            gender_row=gender_row,
            convert_gender=convert_gender,
            age_row=age_row,
            convert_age=convert_age
        )

        print(f"Extracted clinical data shape: {clinical_features.shape}")
        print("Preview of clinical data (first 5 samples):")
        print(clinical_features.iloc[:, :5])

        # Persist the extracted clinical features
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        clinical_features.to_csv(out_clinical_data_file)
        print(f"Clinical data saved to {out_clinical_data_file}")
    except Exception as e:
        print(f"Error extracting clinical data: {e}")
        is_trait_available = False

# 3. Link clinical and genetic data if both are available
def _record_unusable_cohort(note):
    """Save a final cohort record that marks the dataset as biased with no data.

    Reads the current module-level availability flags at call time.
    """
    validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=is_gene_available,
        is_trait_available=is_trait_available,
        is_biased=True,  # Treated as unusable
        df=pd.DataFrame(),  # Empty dataframe for metadata
        note=note
    )


if is_trait_available and is_gene_available:
    try:
        # Debug the column names to ensure they match
        print(f"Gene data columns (first 5): {gene_data.columns[:5].tolist()}")
        print(f"Clinical data columns (first 5): {clinical_features.columns[:5].tolist()}")

        # Check for common sample IDs
        common_samples = set(gene_data.columns).intersection(clinical_features.columns)
        print(f"Found {len(common_samples)} common samples between gene and clinical data")

        if len(common_samples) > 0:
            # Link the clinical and genetic data
            linked_data = geo_link_clinical_genetic_data(clinical_features, gene_data)
            print(f"Initial linked data shape: {linked_data.shape}")

            # Debug the trait values before handling missing values
            print("Preview of linked data (first 5 rows, first 5 columns):")
            print(linked_data.iloc[:5, :5])

            # Handle missing values
            linked_data = handle_missing_values(linked_data, trait)
            print(f"Linked data shape after handling missing values: {linked_data.shape}")

            if linked_data.shape[0] > 0:
                # Check for bias in trait and demographic features
                is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

                # Validate the data quality and save cohort info. The note
                # describes this cohort: the trait is a proxy taken from the
                # treatment row, not an explicit disease label.
                note = (
                    "Trait values are a proxy derived from treatment status; the sample "
                    "characteristics contain no explicit coronary artery disease label."
                )
                is_usable = validate_and_save_cohort_info(
                    is_final=True,
                    cohort=cohort,
                    info_path=json_path,
                    is_gene_available=is_gene_available,
                    is_trait_available=is_trait_available,
                    is_biased=is_biased,
                    df=linked_data,
                    note=note
                )

                # Save the linked data if it's usable
                if is_usable:
                    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
                    linked_data.to_csv(out_data_file)
                    print(f"Linked data saved to {out_data_file}")
                else:
                    print("Data not usable for the trait study - not saving final linked data.")
            else:
                print("After handling missing values, no samples remain.")
                _record_unusable_cohort("No valid samples after handling missing values.")
        else:
            print("No common samples found between gene expression and clinical data.")
            _record_unusable_cohort("No common samples between gene expression and clinical data.")
    except Exception as e:
        print(f"Error linking or processing data: {e}")
        # Assume the data is unusable if any step above failed
        _record_unusable_cohort(f"Error in data processing: {str(e)}")
else:
    # We can't proceed with linking if either trait or gene data is missing
    print("Cannot proceed with data linking due to missing trait or gene data.")
    _record_unusable_cohort(
        "Trait or gene expression data is unavailable for this cohort, so no linked dataset was produced."
    )