In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait being studied and the GEO series being preprocessed
trait = "Coronary_artery_disease"
cohort = "GSE54975"

# Input locations: the trait-level folder and the cohort folder nested inside it
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"../../input/GEO/{trait}/{cohort}"

# Output locations: linked data, gene-level data, clinical data, and the shared cohort registry
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort folder
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Pull series-level background lines and per-sample clinical lines out of the matrix file
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Collect the unique values found in each row of the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each section under its heading so the next step can inspect it
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Lipid-induced epigenomic changes in human macrophages identify a coronary artery disease associated variant that regulates PPAP2B expression through altered C/EBP-beta binding"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"Refer to individual Series"
Sample Characteristics Dictionary:
{0: ['background: European', 'background: Nepalese'], 1: ['cell type: Macrophage'], 2: ['agent: oxLDL', 'agent: control buffer']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene expression data availability
# The series title and sample characteristics describe a macrophage study linked to
# coronary artery disease, so gene expression data is assumed to be present here.
is_gene_available = True

# 2. Variable availability
# 2.1 Rows of the sample characteristics dictionary printed in Step 1:
#   0 -> background, 1 -> cell type, 2 -> agent
# None of these rows records disease status, age, or gender directly.
# The agent row (oxLDL vs control buffer) is therefore used as the trait row.
# NOTE(review): 'agent' is a treatment label, not a CAD diagnosis - confirm that
# using it as the trait proxy is acceptable for this cohort.
trait_row, age_row, gender_row = 2, None, None

# 2.2 Data Type Conversion

# For trait (CAD status), we'll convert from the agent field:
# "agent: oxLDL" indicates treated/case sample (1)
# "agent: control buffer" indicates control (0)
def convert_trait(value):
    """Turn an 'agent: ...' characteristic string into a binary trait label.

    Matching is case-insensitive. Returns 1 when the text following the
    'agent:' tag mentions oxLDL, 0 when it mentions 'control', and None for
    non-string input, strings without an 'agent:' tag, or unrecognized agents.
    """
    if not isinstance(value, str):
        return None

    lowered = value.lower().strip()
    if 'agent:' not in lowered:
        return None

    # Text that follows the first 'agent:' tag (up to the next tag, if any)
    agent = lowered.split('agent:')[1].strip()

    # Order matters: oxLDL is checked before 'control'
    for keyword, label in (('oxldl', 1), ('control', 0)):
        if keyword in agent:
            return label
    return None

# Age conversion function (not used but defined for completeness)
def convert_age(value):
    """Placeholder age converter: ignores its input and always yields None."""
    return

# Gender conversion function (not used but defined for completeness)
def convert_gender(value):
    """Placeholder gender converter: ignores its input and always yields None."""
    return

# 3. Save Metadata
# Trait data counts as available exactly when a characteristics row was chosen for it
is_trait_available = trait_row is not None

# Initial (non-final) usability record for this cohort
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Only attempted when a trait row was identified above
if trait_row is not None:
    try:
        # clinical_data comes from the initial data loading step
        selected_clinical_df = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            age_row=age_row,
            convert_age=convert_age,
            gender_row=gender_row,
            convert_gender=convert_gender
        )

        # Preview the extracted clinical features
        print("Clinical Features Preview:")
        preview = preview_df(selected_clinical_df)
        print(preview)

        # Create the destination folder first: without this, a missing directory
        # makes to_csv raise, and the handler below would report it as a
        # clinical-extraction failure and mark the trait as unavailable.
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        selected_clinical_df.to_csv(out_clinical_data_file)
        print(f"Clinical data saved to {out_clinical_data_file}")

    except Exception as e:
        # Any failure here is treated as "trait data not usable" for later steps
        print(f"Error in clinical feature extraction: {e}")
        is_trait_available = False


Clinical Features Preview:
{'GSM1321503': [1.0], 'GSM1321504': [0.0], 'GSM1321505': [1.0], 'GSM1321506': [0.0], 'GSM1321507': [1.0], 'GSM1321508': [0.0], 'GSM1321509': [1.0], 'GSM1321510': [0.0], 'GSM1321511': [1.0], 'GSM1321512': [1.0], 'GSM1321513': [0.0], 'GSM1321514': [0.0]}
Clinical data saved to ../../output/preprocess/Coronary_artery_disease/clinical_data/GSE54975.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Start from the assumption that gene data is present; any failure below clears the flag
is_gene_available = True

# 2. Scan the gzipped matrix file for the line that opens the expression table
found_marker = False
marker_row = None
try:
    with gzip.open(matrix_file, 'rt') as file:
        for i, line in enumerate(file):
            if "!series_matrix_table_begin" in line:
                found_marker = True
                marker_row = i
                print(f"Found the matrix table marker at line {i}")
                break

    if not found_marker:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False

    # 3. If the marker was found, extract the gene data with the library helper
    if is_gene_available:
        try:
            gene_data = get_genetic_data(matrix_file)

            if gene_data.shape[0] == 0:
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
            else:
                print(f"Gene data shape: {gene_data.shape}")
                # Print the first 20 gene/probe identifiers
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
        except Exception as e:
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False

    # 4. On failure, dump a few lines of the file to help diagnose the problem
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            with gzip.open(matrix_file, 'rt') as file:
                if marker_row is not None:
                    # Show the window from 2 lines before to 10 lines after the marker
                    for i, line in enumerate(file):
                        if i >= marker_row - 2 and i <= marker_row + 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        if i > marker_row + 10:
                            break
                else:
                    # No marker: show the first 10 lines of the file instead
                    for i, line in enumerate(file):
                        if i < 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        else:
                            break
        except Exception as e2:
            print(f"Error examining file: {e2}")

except Exception as e:
    print(f"Error processing file: {e}")
    is_gene_available = False

# 5. Record the failure in the cohort metadata.
# Only the gene-availability flag is changed by this step: is_trait_available keeps
# the value decided during clinical feature extraction, because a failed gene
# extraction says nothing about the clinical annotations.
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path,
                                 is_gene_available=is_gene_available, is_trait_available=is_trait_available)


SOFT file: ../../input/GEO/Coronary_artery_disease/GSE54975/GSE54975_family.soft.gz
Matrix file: ../../input/GEO/Coronary_artery_disease/GSE54975/GSE54975-GPL10558_series_matrix.txt.gz
Found the matrix table marker at line 71
Gene data shape: (47231, 12)
First 20 gene/probe identifiers:
['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209', 'ILMN_1651210', 'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229', 'ILMN_1651230', 'ILMN_1651232', 'ILMN_1651235', 'ILMN_1651236', 'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253', 'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651262']


### Step 4: Gene Identifier Review

In [5]:
# The row identifiers printed in the previous step all have the form ILMN_xxxxxxx, i.e. Illumina
# probe IDs (the matrix file name carries the platform accession GPL10558).
# They are probe identifiers rather than human gene symbols, so a probe-to-symbol mapping step is needed.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Pull the probe annotation table out of the SOFT file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Show the available columns and a few rows so the identifier and symbol columns can be chosen
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=3))

# Look at the candidate mapping columns side by side
print("\nExamining mapping information (first 5 rows):")
if {'ID', 'Symbol'}.issubset(gene_annotation.columns):
    probe_ids = gene_annotation['ID']
    symbols = gene_annotation['Symbol']

    for row_number, (probe_id, symbol) in enumerate(zip(probe_ids.iloc[:5], symbols.iloc[:5])):
        print(f"Row {row_number}: ID={probe_id}, Symbol={symbol}")

    # How many annotation rows actually carry a symbol
    filled = symbols.notna().sum()
    row_count = len(gene_annotation)
    print(f"\nSymbol column completeness: {filled}/{row_count} rows ({filled/row_count:.2%})")

    # Summarize which columns will drive the mapping
    print("\n".join((
        "\nColumns identified for gene mapping:",
        "- 'ID': Contains Illumina probe IDs (e.g., ILMN_*)",
        "- 'Symbol': Contains gene symbols",
    )))



Gene annotation preview:
Columns in gene annotation: ['ID', 'Species', 'Source', 'Search_Key', 'Transcript', 'ILMN_Gene', 'Source_Reference_ID', 'RefSeq_ID', 'Unigene_ID', 'Entrez_Gene_ID', 'GI', 'Accession', 'Symbol', 'Protein_Product', 'Probe_Id', 'Array_Address_Id', 'Probe_Type', 'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation', 'Probe_Coordinates', 'Cytoband', 'Definition', 'Ontology_Component', 'Ontology_Process', 'Ontology_Function', 'Synonyms', 'Obsolete_Probe_Id', 'GB_ACC']
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050'], 'Species': [nan, nan, nan], 'Source': [nan, nan, nan], 'Search_Key': [nan, nan, nan], 'Transcript': [nan, nan, nan], 'ILMN_Gene': [nan, nan, nan], 'Source_Reference_ID': [nan, nan, nan], 'RefSeq_ID': [nan, nan, nan], 'Unigene_ID': [nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan], 'GI': [nan, nan, nan], 'Accession': [nan, nan, nan], 'Symbol': ['phage_lambda_genome', 'phage_lambda_genome', 'phage_lambda_genome:low'], 'Protein_Product':

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Build the probe-to-gene mapping from the annotation table.
#    Per the annotation preview: 'ID' holds the Illumina probe IDs that index the
#    expression matrix, and 'Symbol' holds the gene symbols to map onto.
gene_mapping = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='Symbol')

# Check the mapping dataframe
print(f"Gene mapping dataframe shape: {gene_mapping.shape}")
print("Gene mapping preview:")
print(preview_df(gene_mapping, n=5))

# 2. Re-read the probe-level expression matrix so the mapping starts from the raw data
gene_data_raw = get_genetic_data(matrix_file)
print(f"Gene expression data shape (before mapping): {gene_data_raw.shape}")

# 3. Apply the mapping to convert probe-level measurements to gene-level data
gene_data = apply_gene_mapping(gene_data_raw, gene_mapping)

# Check the processed gene data
print(f"Gene expression data shape (after mapping): {gene_data.shape}")
print("Gene data preview (first 5 genes):")
print(preview_df(gene_data, n=5))

# 4. Save the gene-level data. The destination folder is created first, because
#    this is the first write to that location and to_csv does not create directories.
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Gene mapping dataframe shape: (44837, 2)
Gene mapping preview:
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050', 'ILMN_1343052', 'ILMN_1343059'], 'Gene': ['phage_lambda_genome', 'phage_lambda_genome', 'phage_lambda_genome:low', 'phage_lambda_genome:low', 'thrB']}
Gene expression data shape (before mapping): (47231, 12)
Gene expression data shape (after mapping): (21372, 12)
Gene data preview (first 5 genes):
{'GSM1321503': [14.725556807, 21.877370881, 22.014179287, 28.66824101, 8.37253827], 'GSM1321504': [14.699442151, 21.844561159, 22.021686406, 28.699002439, 8.579392591], 'GSM1321505': [14.689998565, 22.035022234, 22.092588802999998, 28.604531031, 8.075103553], 'GSM1321506': [14.768962991999999, 21.647306364000002, 21.625944806, 28.692870384, 8.293847264], 'GSM1321507': [14.825603059999999, 21.810194296, 21.507765248, 28.665816932, 7.943434895], 'GSM1321508': [14.879285424999999, 21.974036685, 21.563995386000002, 28.804042170000002, 8.219964587], 'GSM1321509': [14.620800768, 21

Gene expression data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE54975.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Save the gene data, link it with the clinical data, and record the final cohort verdict.
# NOTE(review): the messages below mention normalization, but this cell never calls a
# normalization routine on gene_data - confirm whether that step is intentionally skipped.


def _log_unusable_cohort(note, gene_available):
    """Write a final cohort record flagged as biased, with an empty dataframe attached."""
    validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=gene_available,
        is_trait_available=is_trait_available,
        is_biased=True,  # Treated as biased because the cohort cannot be used
        df=pd.DataFrame(),
        note=note
    )


try:
    # Make sure the gene-data output folder exists
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)

    if gene_data.shape[0] > 0:
        # Gene data has rows: save it, then try to link it with the clinical data
        print(f"Gene data shape before normalization: {gene_data.shape}")

        gene_data.to_csv(out_gene_data_file)
        print(f"Gene data saved to {out_gene_data_file}")

        if not is_trait_available:
            # Nothing to link against without trait information
            _log_unusable_cohort(
                "Cannot link data because trait information is not available.",
                is_gene_available,
            )
        else:
            # Reload the clinical features written during clinical feature extraction
            clinical_features = pd.read_csv(out_clinical_data_file, index_col=0)
            print(f"Loaded clinical data shape: {clinical_features.shape}")

            # Combine clinical and gene-level data
            linked_data = geo_link_clinical_genetic_data(clinical_features, gene_data)
            print(f"Initial linked data shape: {linked_data.shape}")

            # Handle missing values
            linked_data = handle_missing_values(linked_data, trait)
            print(f"Linked data shape after handling missing values: {linked_data.shape}")

            if linked_data.shape[0] == 0:
                print("After handling missing values, no samples remain.")
                _log_unusable_cohort(
                    "No valid samples after handling missing values.",
                    is_gene_available,
                )
            else:
                # Check the trait and demographic features for bias
                is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

                # Final validation; the return value says whether the cohort is usable
                is_usable = validate_and_save_cohort_info(
                    is_final=True,
                    cohort=cohort,
                    info_path=json_path,
                    is_gene_available=is_gene_available,
                    is_trait_available=is_trait_available,
                    is_biased=is_biased,
                    df=linked_data,
                    note="Successfully processed gene expression data for coronary artery disease."
                )

                # The linked table is written only for usable cohorts
                if is_usable:
                    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
                    linked_data.to_csv(out_data_file)
                    print(f"Linked data saved to {out_data_file}")
                else:
                    print("Data not usable for trait study - not saving final linked data.")
    else:
        # Gene data has no rows: flag gene data as unavailable and record the cohort as unusable
        print("WARNING: Gene data is empty after normalization in previous step.")
        print("This appears to be miRNA data rather than gene expression data.")

        is_gene_available = False
        _log_unusable_cohort(
            "Dataset appears to contain miRNA data rather than gene expression data. Gene symbols could not be normalized.",
            is_gene_available,
        )

        print("Dataset marked as unusable due to lack of valid gene expression data.")
except Exception as e:
    print(f"Error in data processing: {e}")

    # Any error marks the cohort as unusable, with gene data recorded as unavailable
    _log_unusable_cohort(f"Error during normalization or linking: {str(e)}", False)

Gene data shape before normalization: (21372, 12)
Gene data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE54975.csv
Loaded clinical data shape: (1, 12)
Initial linked data shape: (12, 21373)


Linked data shape after handling missing values: (12, 21373)
For the feature 'Coronary_artery_disease', the least common label is '1.0' with 6 occurrences. This represents 50.00% of the dataset.
The distribution of the feature 'Coronary_artery_disease' in this dataset is fine.



Linked data saved to ../../output/preprocess/Coronary_artery_disease/GSE54975.csv
