In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Coronary_artery_disease"
cohort = "GSE156357"

# Input paths
in_trait_dir = "../../input/GEO/Coronary_artery_disease"
in_cohort_dir = "../../input/GEO/Coronary_artery_disease/GSE156357"

# Output paths
out_data_file = "../../output/preprocess/Coronary_artery_disease/GSE156357.csv"
out_gene_data_file = "../../output/preprocess/Coronary_artery_disease/gene_data/GSE156357.csv"
out_clinical_data_file = "../../output/preprocess/Coronary_artery_disease/clinical_data/GSE156357.csv"
json_path = "../../output/preprocess/Coronary_artery_disease/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Supplementation with Probiotic Lactobacillus plantarum 299v  in Men With Stable Coronary Artery Disease Suppresses Systemic Inflammation"
!Series_summary	"Recent clinical trials demonstrate the efficacy of treatment strategies to reduce cardiovascular events in patients with coronary artery disease (CAD) that focus in reducing inflammatory signaling. Emerging data implicate the gut microbiota as a critical regulator of systemic inflammation. We recently demonstrated that supplementation with Lactobacillus plantarum 299v (Lp299v) improved vascular endothelial function in men with stable CAD. In this study we investigated whether the favorable effects of Lp299v on vascular health are due in part to coordinated suppression of systemic inflammation. We applied pre- and post-Lp299v supplementation plasma from these patients to peripheral blood mononuclear cells of a healthy donor to determine the transcriptional response to this intervention."
!Series_

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# The dataset seems to be gene expression data as the series title and design
# indicate transcriptome analysis was performed. It's not purely miRNA or methylation data.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# For trait (CAD), there's no explicit row but we know all subjects have CAD based on background
trait_row = 1  # We'll use the treatment row to extract patient identifiers and assign CAD status

# No explicit age information is provided
age_row = None

# No explicit gender information is provided, though the background indicates all subjects are men
gender_row = None

# 2.2 Data Type Conversion
def convert_trait(value):
    """Convert treatment text to CAD status (binary)"""
    if value is None:
        return None
    # All participants have CAD according to background info
    # We'll extract if this is PRE or POST treatment
    if isinstance(value, str) and ":" in value:
        value = value.split(":", 1)[1].strip()
        if "PRE" in value:
            return 1  # CAD patients before treatment
        elif "POST" in value:
            return 0  # CAD patients after treatment
    return None

def convert_age(value):
    """Convert age value to numeric format"""
    # Age data not available
    return None

def convert_gender(value):
    """Convert gender value to binary format (0 for female, 1 for male)"""
    # Gender data not available directly, but background says all are men
    return None

# 3. Save Metadata
# Initial validation and filtering
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
if trait_row is not None:
    # Extract clinical features
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the extracted clinical data
    print("Preview of selected clinical features:")
    preview = preview_df(selected_clinical_df)
    print(preview)
    
    # Save the clinical data
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of selected clinical features:
{'GSM4729476': [1.0], 'GSM4729477': [0.0], 'GSM4729478': [1.0], 'GSM4729479': [0.0], 'GSM4729480': [1.0], 'GSM4729481': [0.0], 'GSM4729482': [1.0], 'GSM4729483': [0.0], 'GSM4729484': [1.0], 'GSM4729485': [0.0], 'GSM4729486': [1.0], 'GSM4729487': [0.0], 'GSM4729488': [1.0], 'GSM4729489': [0.0], 'GSM4729490': [1.0], 'GSM4729491': [0.0], 'GSM4729492': [1.0], 'GSM4729493': [0.0], 'GSM4729494': [1.0], 'GSM4729495': [0.0], 'GSM4729496': [1.0], 'GSM4729497': [0.0], 'GSM4729498': [1.0], 'GSM4729499': [0.0], 'GSM4729500': [1.0], 'GSM4729501': [0.0], 'GSM4729502': [1.0], 'GSM4729503': [0.0], 'GSM4729504': [1.0], 'GSM4729505': [0.0], 'GSM4729506': [1.0], 'GSM4729507': [0.0], 'GSM4729508': [1.0], 'GSM4729509': [0.0], 'GSM4729510': [1.0], 'GSM4729511': [0.0], 'GSM4729512': [1.0], 'GSM4729513': [0.0]}
Clinical data saved to ../../output/preprocess/Coronary_artery_disease/clinical_data/GSE156357.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Set gene availability flag
is_gene_available = True  # Initially assume gene data is available

# First check if the matrix file contains the expected marker
found_marker = False
marker_row = None
try:
    with gzip.open(matrix_file, 'rt') as file:
        for i, line in enumerate(file):
            if "!series_matrix_table_begin" in line:
                found_marker = True
                marker_row = i
                print(f"Found the matrix table marker at line {i}")
                break
    
    if not found_marker:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False
        
    # If marker was found, try to extract gene data
    if is_gene_available:
        try:
            # Try using the library function
            gene_data = get_genetic_data(matrix_file)
            
            if gene_data.shape[0] == 0:
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
            else:
                print(f"Gene data shape: {gene_data.shape}")
                # Print the first 20 gene/probe identifiers
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
        except Exception as e:
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False
    
    # If gene data extraction failed, examine file content to diagnose
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            with gzip.open(matrix_file, 'rt') as file:
                # Print lines around the marker if found
                if marker_row is not None:
                    for i, line in enumerate(file):
                        if i >= marker_row - 2 and i <= marker_row + 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        if i > marker_row + 10:
                            break
                else:
                    # If marker not found, print first 10 lines
                    for i, line in enumerate(file):
                        if i < 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        else:
                            break
        except Exception as e2:
            print(f"Error examining file: {e2}")
        
except Exception as e:
    print(f"Error processing file: {e}")
    is_gene_available = False

# Update validation information if gene data extraction failed
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    # Update the validation record since gene data isn't available
    is_trait_available = False  # We already determined trait data isn't available in step 2
    validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path,
                                 is_gene_available=is_gene_available, is_trait_available=is_trait_available)


SOFT file: ../../input/GEO/Coronary_artery_disease/GSE156357/GSE156357_family.soft.gz
Matrix file: ../../input/GEO/Coronary_artery_disease/GSE156357/GSE156357_series_matrix.txt.gz
Found the matrix table marker at line 71
Gene data shape: (54675, 38)
First 20 gene/probe identifiers:
['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at', '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at', '1494_f_at', '1552256_a_at', '1552257_a_at', '1552258_at', '1552261_at', '1552263_at', '1552264_a_at', '1552266_at']


### Step 4: Gene Identifier Review

In [5]:
# The gene identifiers shown in the previous output appear to be Affymetrix probe IDs
# (like "1007_s_at", "1053_at", etc.), not standard human gene symbols.
# These are microarray probe identifiers that need to be mapped to actual gene symbols
# for biological interpretation.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=3))

# Examine the ID and Gene Symbol columns that appear to contain the mapping information
print("\nExamining mapping information (first 5 rows):")
if 'ID' in gene_annotation.columns and 'Gene Symbol' in gene_annotation.columns:
    for i in range(min(5, len(gene_annotation))):
        print(f"Row {i}: ID={gene_annotation['ID'].iloc[i]}, Gene Symbol={gene_annotation['Gene Symbol'].iloc[i]}")
    
    # Check the quality and completeness of the mapping
    non_null_symbols = gene_annotation['Gene Symbol'].notna().sum()
    total_rows = len(gene_annotation)
    print(f"\nGene Symbol column completeness: {non_null_symbols}/{total_rows} rows ({non_null_symbols/total_rows:.2%})")
    
    # Identify the columns needed for gene mapping
    print("\nColumns identified for gene mapping:")
    print("- 'ID': Contains probe IDs (e.g., 1007_s_at)")
    print("- 'Gene Symbol': Contains gene symbols (e.g., DDR1 /// MIR4640)")
else:
    print("Error: Required mapping columns ('ID' and/or 'Gene Symbol') not found in annotation data.")
    print("Available columns:", gene_annotation.columns.tolist())



Gene annotation preview:
Columns in gene annotation: ['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date', 'Sequence Type', 'Sequence Source', 'Target Description', 'Representative Public ID', 'Gene Title', 'Gene Symbol', 'ENTREZ_GENE_ID', 'RefSeq Transcript ID', 'Gene Ontology Biological Process', 'Gene Ontology Cellular Component', 'Gene Ontology Molecular Function']
{'ID': ['1007_s_at', '1053_at', '117_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757'], 'SPOT_ID': [nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human 

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify which columns contain probe IDs and gene symbols
probe_id_col = 'ID'
gene_symbol_col = 'Gene Symbol'

# 2. Create the gene mapping dataframe using the get_gene_mapping function
gene_mapping = get_gene_mapping(gene_annotation, probe_id_col, gene_symbol_col)

print(f"Gene mapping dataframe created with shape: {gene_mapping.shape}")
print("First 5 rows of gene mapping:")
print(gene_mapping.head())

# 3. Apply the gene mapping to convert probe-level measurements to gene expression
# First get the gene expression data from the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_expression = get_genetic_data(matrix_file)

print(f"Original gene expression data shape: {gene_expression.shape}")

# Apply the gene mapping to convert probe-level data to gene-level data
gene_data = apply_gene_mapping(gene_expression, gene_mapping)

print(f"Gene expression data after mapping: {gene_data.shape}")
print("First 10 genes and their expression values:")
print(gene_data.head(10))

# Check how many genes we have after mapping
print(f"Total number of genes after mapping: {len(gene_data)}")


Gene mapping dataframe created with shape: (45782, 2)
First 5 rows of gene mapping:
          ID              Gene
0  1007_s_at  DDR1 /// MIR4640
1    1053_at              RFC2
2     117_at             HSPA6
3     121_at              PAX8
4  1255_g_at            GUCA1A


Original gene expression data shape: (54675, 38)


Gene expression data after mapping: (21278, 38)
First 10 genes and their expression values:
          GSM4729476  GSM4729477  GSM4729478  GSM4729479  GSM4729480  \
Gene                                                                   
A1BG         4.64853     5.00640     5.12824     4.74476     4.74095   
A1BG-AS1     3.57276     3.59143     3.71808     3.17932     2.85107   
A1CF         6.65677     6.98451     7.18984     6.96510     7.10847   
A2M          7.03923     6.91868     6.84097     6.69868     8.12533   
A2M-AS1      7.49411     6.40790     6.19496     7.09980     5.83301   
A2ML1        5.09101     5.89952     6.04903     5.14808     5.42730   
A2MP1        3.50384     3.96831     4.30658     3.37655     3.83151   
A4GALT       3.61928     3.91700     4.24931     4.51080     4.28586   
A4GNT        3.59239     3.70216     3.78352     3.44427     3.99832   
AA06         2.67766     2.93053     3.37049     3.06110     3.04772   

          GSM4729481  GSM4729482  GSM472948

### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data
try:
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    
    # Check if gene_data (from previous step) has any content
    if gene_data.shape[0] == 0:
        print("WARNING: Gene data is empty after mapping in previous step.")
        
        # Since gene_data is empty, set gene_available to False
        is_gene_available = False
        
        # Create an empty dataframe for metadata purposes
        empty_df = pd.DataFrame()
        
        # Log information about this dataset for future reference
        validate_and_save_cohort_info(
            is_final=True,
            cohort=cohort,
            info_path=json_path,
            is_gene_available=is_gene_available,
            is_trait_available=True,  # We determined trait data is available in step 2
            is_biased=True,  # Consider it biased as we can't use it
            df=empty_df,
            note="Gene symbols could not be mapped properly. No valid gene expression data available."
        )
        
        print("Dataset marked as unusable due to lack of valid gene expression data.")
    else:
        # If gene_data is not empty, proceed with normalization and linking
        print(f"Gene data shape before normalization: {gene_data.shape}")
        
        # Normalize gene symbols using NCBI Gene database
        normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
        print(f"Gene data shape after normalization: {normalized_gene_data.shape}")
        
        # Save the normalized gene data
        normalized_gene_data.to_csv(out_gene_data_file)
        print(f"Normalized gene data saved to {out_gene_data_file}")
        
        # Attempt to link clinical and gene data
        # Load clinical data
        clinical_features = pd.read_csv(out_clinical_data_file)
        print(f"Loaded clinical data shape: {clinical_features.shape}")
        print(f"Clinical data columns: {clinical_features.columns.tolist()}")
        
        # Convert the index column to actual index if needed
        if 'Unnamed: 0' in clinical_features.columns:
            clinical_features.set_index('Unnamed: 0', inplace=True)
        
        # Link the clinical and genetic data
        linked_data = geo_link_clinical_genetic_data(clinical_features, normalized_gene_data)
        print(f"Initial linked data shape: {linked_data.shape}")
        print(f"Linked data columns (first 5): {linked_data.columns[:5].tolist()}")
        
        # Check which columns are available for the trait
        if trait in linked_data.columns:
            trait_column = trait
        else:
            # If trait name isn't a column, check if the first column is the trait
            # (based on how geo_select_clinical_features returns data)
            first_col = linked_data.columns[0]
            print(f"Using first column as trait: {first_col}")
            trait_column = first_col
        
        # Handle missing values with the correct trait column
        linked_data = handle_missing_values(linked_data, trait_column)
        print(f"Linked data shape after handling missing values: {linked_data.shape}")
        
        if linked_data.shape[0] > 0:
            # Check for bias in trait and demographic features
            # Implement judge_and_remove_biased_features function
            def judge_and_remove_biased_features(df, trait_col):
                """Evaluate and remove biased features from the dataset."""
                trait_type = 'binary' if len(df[trait_col].unique()) <= 2 else 'continuous'
                if trait_type == "binary":
                    trait_biased = judge_binary_variable_biased(df, trait_col)
                else:
                    trait_biased = judge_continuous_variable_biased(df, trait_col)
                
                if trait_biased:
                    print(f"The distribution of the feature \'{trait_col}\' in this dataset is severely biased.\n")
                else:
                    print(f"The distribution of the feature \'{trait_col}\' in this dataset is fine.\n")
                
                # Check age if present
                if "Age" in df.columns:
                    age_biased = judge_continuous_variable_biased(df, 'Age')
                    if age_biased:
                        print(f"The distribution of the feature \'Age\' in this dataset is severely biased.\n")
                        df = df.drop(columns='Age')
                    else:
                        print(f"The distribution of the feature \'Age\' in this dataset is fine.\n")
                
                # Check gender if present
                if "Gender" in df.columns:
                    gender_biased = judge_binary_variable_biased(df, 'Gender')
                    if gender_biased:
                        print(f"The distribution of the feature \'Gender\' in this dataset is severely biased.\n")
                        df = df.drop(columns='Gender')
                    else:
                        print(f"The distribution of the feature \'Gender\' in this dataset is fine.\n")
                
                return trait_biased, df
            
            is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait_column)
            
            # Validate data quality and save cohort info
            is_usable = validate_and_save_cohort_info(
                is_final=True,
                cohort=cohort,
                info_path=json_path,
                is_gene_available=True,
                is_trait_available=True,
                is_biased=is_biased,
                df=linked_data,
                note="Successfully processed gene expression data for coronary artery disease."
            )
            
            # Save the linked data if it's usable
            if is_usable:
                os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
                linked_data.to_csv(out_data_file)
                print(f"Linked data saved to {out_data_file}")
            else:
                print("Data not usable for trait study - not saving final linked data.")
        else:
            print("After handling missing values, no samples remain.")
            validate_and_save_cohort_info(
                is_final=True,
                cohort=cohort,
                info_path=json_path,
                is_gene_available=True,
                is_trait_available=True,
                is_biased=True,
                df=pd.DataFrame(),
                note="No valid samples after handling missing values."
            )
except Exception as e:
    import traceback
    print(f"Error in data processing: {e}")
    print(traceback.format_exc())
    
    # Log the error and mark the dataset as unusable
    validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=False,  # Consider gene data unavailable if we had an error
        is_trait_available=True,
        is_biased=True,  # Consider it biased as we can't use it
        df=pd.DataFrame(),  # Empty dataframe for metadata
        note=f"Error during normalization or linking: {str(e)}"
    )

Gene data shape before normalization: (21278, 38)


Gene data shape after normalization: (19845, 38)


Normalized gene data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE156357.csv
Loaded clinical data shape: (1, 38)
Clinical data columns: ['GSM4729476', 'GSM4729477', 'GSM4729478', 'GSM4729479', 'GSM4729480', 'GSM4729481', 'GSM4729482', 'GSM4729483', 'GSM4729484', 'GSM4729485', 'GSM4729486', 'GSM4729487', 'GSM4729488', 'GSM4729489', 'GSM4729490', 'GSM4729491', 'GSM4729492', 'GSM4729493', 'GSM4729494', 'GSM4729495', 'GSM4729496', 'GSM4729497', 'GSM4729498', 'GSM4729499', 'GSM4729500', 'GSM4729501', 'GSM4729502', 'GSM4729503', 'GSM4729504', 'GSM4729505', 'GSM4729506', 'GSM4729507', 'GSM4729508', 'GSM4729509', 'GSM4729510', 'GSM4729511', 'GSM4729512', 'GSM4729513']
Initial linked data shape: (38, 19846)
Linked data columns (first 5): [0, 'A1BG', 'A1BG-AS1', 'A1CF', 'A2M']
Using first column as trait: 0


Linked data shape after handling missing values: (38, 19846)
For the feature '0', the least common label is '1.0' with 19 occurrences. This represents 50.00% of the dataset.
The distribution of the feature '0' in this dataset is fine.



Linked data saved to ../../output/preprocess/Coronary_artery_disease/GSE156357.csv
