In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Coronary_artery_disease"
cohort = "GSE234398"

# Input paths
in_trait_dir = "../../input/GEO/Coronary_artery_disease"
in_cohort_dir = "../../input/GEO/Coronary_artery_disease/GSE234398"

# Output paths
out_data_file = "../../output/preprocess/Coronary_artery_disease/GSE234398.csv"
out_gene_data_file = "../../output/preprocess/Coronary_artery_disease/gene_data/GSE234398.csv"
out_clinical_data_file = "../../output/preprocess/Coronary_artery_disease/clinical_data/GSE234398.csv"
json_path = "../../output/preprocess/Coronary_artery_disease/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Analysis of  gene expression of LPS-stimulated monocyte from CAD patients"
!Series_summary	"Data for the publication 'Identification of a Gene Network Driving the Attenuated Monocyte Response to Lipopolysaccharide of Hypertensive Coronary Artery Disease Patients'."
!Series_summary	"Dissection of the impact of CVD risk factors on monocyte phenotype at the gene expression level, and in particular on their response to trauma and infection response."
!Series_summary	"For any questions about the dataset, please contact Erik Biessen‘s Lab, Department of Pathology, Cardiovascular Research Institute Maastricht (CARIM), Maastricht University Medical Center, Maastricht, Netherlands"
!Series_overall_design	"Total RNA obtained from LPS stimulated monocytes of CAD patients."
Sample Characteristics Dictionary:
{0: ['cell type: monocytes'], 1: ['Sex: male', 'Sex: female'], 2: ['age: 78', 'age: 50', 'age: 67', 'age: 74', 'age: 60', 'age: 72', 'age: 73', 'age: 77'

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# Based on the background information, this dataset contains gene expression data from LPS-stimulated monocytes
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# Trait: From the background info, this dataset is specifically about CAD patients
# We can use row 0 to represent all samples as having CAD (even though it doesn't explicitly state CAD)
trait_row = 0  # All samples are CAD patients as per the background information

# Age: This is available in row 2
age_row = 2

# Gender: This is available in row 1
gender_row = 1

# 2.2 Data Type Conversion
def convert_trait(value):
    # All samples are CAD patients according to the background info
    return 1  # Binary: 1 = has CAD

def convert_age(value):
    if ":" in value:
        value = value.split(":", 1)[1].strip()
    try:
        return float(value)  # Convert to continuous numeric value
    except (ValueError, TypeError):
        return None

def convert_gender(value):
    if ":" in value:
        value = value.split(":", 1)[1].strip().lower()
    if value == "male":
        return 1
    elif value == "female":
        return 0
    else:
        return None

# 3. Save Metadata - initial filtering
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
if trait_row is not None:
    # Extract clinical features
    clinical_selected = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the data
    preview = preview_df(clinical_selected)
    print("Clinical Data Preview:")
    print(preview)
    
    # Save to file
    clinical_selected.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Clinical Data Preview:
{'GSM7466724': [1.0, 78.0, 1.0], 'GSM7466725': [1.0, 78.0, 1.0], 'GSM7466726': [1.0, 50.0, 0.0], 'GSM7466727': [1.0, 67.0, 0.0], 'GSM7466728': [1.0, 74.0, 0.0], 'GSM7466729': [1.0, 60.0, 1.0], 'GSM7466730': [1.0, 72.0, 1.0], 'GSM7466731': [1.0, 67.0, 1.0], 'GSM7466732': [1.0, 67.0, 1.0], 'GSM7466733': [1.0, 73.0, 1.0], 'GSM7466734': [1.0, 77.0, 0.0], 'GSM7466735': [1.0, 78.0, 0.0], 'GSM7466736': [1.0, 56.0, 1.0], 'GSM7466737': [1.0, 51.0, 1.0], 'GSM7466738': [1.0, 78.0, 1.0], 'GSM7466739': [1.0, 66.0, 1.0], 'GSM7466740': [1.0, 65.0, 1.0], 'GSM7466741': [1.0, 51.0, 1.0], 'GSM7466742': [1.0, 63.0, 0.0], 'GSM7466743': [1.0, 60.0, 0.0], 'GSM7466744': [1.0, 71.0, 1.0], 'GSM7466745': [1.0, 57.0, 0.0], 'GSM7466746': [1.0, 73.0, 1.0], 'GSM7466747': [1.0, 75.0, 0.0], 'GSM7466748': [1.0, 72.0, 0.0], 'GSM7466749': [1.0, 74.0, 0.0], 'GSM7466750': [1.0, 64.0, 0.0], 'GSM7466751': [1.0, 39.0, 1.0], 'GSM7466752': [1.0, 78.0, 0.0], 'GSM7466753': [1.0, 57.0, 1.0], 'GSM7466754': [1

### Step 3: Gene Data Extraction

In [4]:
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Set gene availability flag
is_gene_available = True  # Initially assume gene data is available

# First check if the matrix file contains the expected marker
found_marker = False
marker_row = None
try:
    with gzip.open(matrix_file, 'rt') as file:
        for i, line in enumerate(file):
            if "!series_matrix_table_begin" in line:
                found_marker = True
                marker_row = i
                print(f"Found the matrix table marker at line {i}")
                break
    
    if not found_marker:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False
        
    # If marker was found, try to extract gene data
    if is_gene_available:
        try:
            # Try using the library function
            gene_data = get_genetic_data(matrix_file)
            
            if gene_data.shape[0] == 0:
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
            else:
                print(f"Gene data shape: {gene_data.shape}")
                # Print the first 20 gene/probe identifiers
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
        except Exception as e:
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False
    
    # If gene data extraction failed, examine file content to diagnose
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            with gzip.open(matrix_file, 'rt') as file:
                # Print lines around the marker if found
                if marker_row is not None:
                    for i, line in enumerate(file):
                        if i >= marker_row - 2 and i <= marker_row + 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        if i > marker_row + 10:
                            break
                else:
                    # If marker not found, print first 10 lines
                    for i, line in enumerate(file):
                        if i < 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        else:
                            break
        except Exception as e2:
            print(f"Error examining file: {e2}")
        
except Exception as e:
    print(f"Error processing file: {e}")
    is_gene_available = False

# Update validation information if gene data extraction failed
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    # Update the validation record since gene data isn't available
    is_trait_available = False  # We already determined trait data isn't available in step 2
    validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path,
                                 is_gene_available=is_gene_available, is_trait_available=is_trait_available)


SOFT file: ../../input/GEO/Coronary_artery_disease/GSE234398/GSE234398_family.soft.gz
Matrix file: ../../input/GEO/Coronary_artery_disease/GSE234398/GSE234398_series_matrix.txt.gz
Found the matrix table marker at line 63


Gene data shape: (47231, 50)
First 20 gene/probe identifiers:
['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209', 'ILMN_1651210', 'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229', 'ILMN_1651230', 'ILMN_1651232', 'ILMN_1651235', 'ILMN_1651236', 'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253', 'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651262']


### Step 4: Gene Identifier Review

In [5]:
# Examining the gene identifiers
# The identifiers with prefix 'ILMN_' are Illumina probe IDs, not human gene symbols
# These are probe IDs from Illumina microarray platforms and need to be mapped to gene symbols

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=3))

# Examine the ID and Symbol columns that appear to contain the mapping information
print("\nExamining mapping information (first 5 rows):")
if 'ID' in gene_annotation.columns and 'Symbol' in gene_annotation.columns:
    for i in range(min(5, len(gene_annotation))):
        print(f"Row {i}: ID={gene_annotation['ID'].iloc[i]}, Symbol={gene_annotation['Symbol'].iloc[i]}")
    
    # Check the quality and completeness of the mapping
    non_null_symbols = gene_annotation['Symbol'].notna().sum()
    total_rows = len(gene_annotation)
    print(f"\nSymbol column completeness: {non_null_symbols}/{total_rows} rows ({non_null_symbols/total_rows:.2%})")
    
    # Identify the columns needed for gene mapping
    print("\nColumns identified for gene mapping:")
    print("- 'ID': Contains Illumina probe IDs (e.g., ILMN_*)")
    print("- 'Symbol': Contains gene symbols")
else:
    print("Error: Required mapping columns ('ID' and/or 'Symbol') not found in annotation data.")
    print("Available columns:", gene_annotation.columns.tolist())



Gene annotation preview:
Columns in gene annotation: ['ID', 'Species', 'Source', 'Search_Key', 'Transcript', 'ILMN_Gene', 'Source_Reference_ID', 'RefSeq_ID', 'Unigene_ID', 'Entrez_Gene_ID', 'GI', 'Accession', 'Symbol', 'Protein_Product', 'Probe_Id', 'Array_Address_Id', 'Probe_Type', 'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation', 'Probe_Coordinates', 'Cytoband', 'Definition', 'Ontology_Component', 'Ontology_Process', 'Ontology_Function', 'Synonyms', 'Obsolete_Probe_Id', 'GB_ACC']
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050'], 'Species': [nan, nan, nan], 'Source': [nan, nan, nan], 'Search_Key': [nan, nan, nan], 'Transcript': [nan, nan, nan], 'ILMN_Gene': [nan, nan, nan], 'Source_Reference_ID': [nan, nan, nan], 'RefSeq_ID': [nan, nan, nan], 'Unigene_ID': [nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan], 'GI': [nan, nan, nan], 'Accession': [nan, nan, nan], 'Symbol': ['phage_lambda_genome', 'phage_lambda_genome', 'phage_lambda_genome:low'], 'Protein_Product':

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Based on the gene annotation data, I see that 'ID' contains Illumina probe IDs (like ILMN_*) which match 
# the gene identifiers in the gene expression data, and 'Symbol' contains the gene symbols we need to map to.

# 2. Extract the two columns from gene annotation for mapping
# Get the SOFT and matrix files again (for consistency with previous steps)
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Extract the gene annotation data - already done in previous step
# gene_annotation = get_gene_annotation(soft_file)

# Get the gene mapping dataframe using the library function
gene_mapping = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='Symbol')
print(f"Gene mapping dataframe shape: {gene_mapping.shape}")
print("First 5 rows of gene mapping:")
print(gene_mapping.head())

# Extract gene expression data - already done in previous step
gene_expression = get_genetic_data(matrix_file)
print(f"Gene expression data shape: {gene_expression.shape}")

# 3. Apply gene mapping to convert probe-level measurements to gene expression data
gene_data = apply_gene_mapping(gene_expression, gene_mapping)
print(f"Gene data after mapping shape: {gene_data.shape}")
print("First 10 gene symbols after mapping:")
print(gene_data.index[:10].tolist())

# Save the processed gene data
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Gene mapping dataframe shape: (44837, 2)
First 5 rows of gene mapping:
             ID                     Gene
0  ILMN_1343048      phage_lambda_genome
1  ILMN_1343049      phage_lambda_genome
2  ILMN_1343050  phage_lambda_genome:low
3  ILMN_1343052  phage_lambda_genome:low
4  ILMN_1343059                     thrB


Gene expression data shape: (47231, 50)
Gene data after mapping shape: (21372, 50)
First 10 gene symbols after mapping:
['A1BG', 'A1CF', 'A26C3', 'A2BP1', 'A2LD1', 'A2M', 'A2ML1', 'A3GALT2', 'A4GALT', 'A4GNT']


Gene expression data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE234398.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Attempt to load gene data and handle possible issues with normalization
try:
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    
    # Check if gene_data (from previous step) has any content
    if gene_data.shape[0] == 0:
        print("WARNING: Gene data is empty after normalization in previous step.")
        print("This appears to be miRNA data rather than gene expression data.")
        
        # Since gene_data is empty, set gene_available to False
        is_gene_available = False
        
        # Create an empty dataframe for metadata purposes
        empty_df = pd.DataFrame()
        
        # Log information about this dataset for future reference
        validate_and_save_cohort_info(
            is_final=True,
            cohort=cohort,
            info_path=json_path,
            is_gene_available=is_gene_available,
            is_trait_available=is_trait_available,
            is_biased=True,  # Consider it biased as we can't use it
            df=empty_df,
            note="Dataset appears to contain miRNA data rather than gene expression data. Gene symbols could not be normalized."
        )
        
        print("Dataset marked as unusable due to lack of valid gene expression data.")
    else:
        # If gene_data is not empty, proceed with normalization and linking
        print(f"Gene data shape before normalization: {gene_data.shape}")
        
        # Save the gene data we have, even if it's already normalized
        gene_data.to_csv(out_gene_data_file)
        print(f"Gene data saved to {out_gene_data_file}")
        
        # Attempt to link clinical and gene data
        if is_trait_available:
            # Load clinical data
            clinical_features = pd.read_csv(out_clinical_data_file, index_col=0)
            print(f"Loaded clinical data shape: {clinical_features.shape}")
            
            # Link the clinical and genetic data
            linked_data = geo_link_clinical_genetic_data(clinical_features, gene_data)
            print(f"Initial linked data shape: {linked_data.shape}")
            
            # Handle missing values
            linked_data = handle_missing_values(linked_data, trait)
            print(f"Linked data shape after handling missing values: {linked_data.shape}")
            
            if linked_data.shape[0] > 0:
                # Check for bias in trait and demographic features
                is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
                
                # Validate data quality and save cohort info
                is_usable = validate_and_save_cohort_info(
                    is_final=True,
                    cohort=cohort,
                    info_path=json_path,
                    is_gene_available=is_gene_available,
                    is_trait_available=is_trait_available,
                    is_biased=is_biased,
                    df=linked_data,
                    note="Successfully processed gene expression data for coronary artery disease."
                )
                
                # Save the linked data if it's usable
                if is_usable:
                    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
                    linked_data.to_csv(out_data_file)
                    print(f"Linked data saved to {out_data_file}")
                else:
                    print("Data not usable for trait study - not saving final linked data.")
            else:
                print("After handling missing values, no samples remain.")
                validate_and_save_cohort_info(
                    is_final=True,
                    cohort=cohort,
                    info_path=json_path,
                    is_gene_available=is_gene_available,
                    is_trait_available=is_trait_available,
                    is_biased=True,
                    df=pd.DataFrame(),
                    note="No valid samples after handling missing values."
                )
        else:
            # Cannot proceed with linking if trait data is missing
            validate_and_save_cohort_info(
                is_final=True,
                cohort=cohort,
                info_path=json_path,
                is_gene_available=is_gene_available,
                is_trait_available=is_trait_available,
                is_biased=True,
                df=pd.DataFrame(),
                note="Cannot link data because trait information is not available."
            )
except Exception as e:
    print(f"Error in data processing: {e}")
    
    # Log the error and mark the dataset as unusable
    validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=False,  # Consider gene data unavailable if we had an error
        is_trait_available=is_trait_available,
        is_biased=True,  # Consider it biased as we can't use it
        df=pd.DataFrame(),  # Empty dataframe for metadata
        note=f"Error during normalization or linking: {str(e)}"
    )

Gene data shape before normalization: (21372, 50)


Gene data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE234398.csv
Loaded clinical data shape: (3, 50)
Initial linked data shape: (50, 21375)


Linked data shape after handling missing values: (50, 21375)
Quartiles for 'Coronary_artery_disease':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1.0
Max: 1.0
The distribution of the feature 'Coronary_artery_disease' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 60.0
  50% (Median): 67.0
  75%: 73.0
Min: 39.0
Max: 78.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 22 occurrences. This represents 44.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

Data not usable for trait study - not saving final linked data.
