In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Coronary_artery_disease"
cohort = "GSE250283"

# Input paths
in_trait_dir = "../../input/GEO/Coronary_artery_disease"
in_cohort_dir = "../../input/GEO/Coronary_artery_disease/GSE250283"

# Output paths
out_data_file = "../../output/preprocess/Coronary_artery_disease/GSE250283.csv"
out_gene_data_file = "../../output/preprocess/Coronary_artery_disease/gene_data/GSE250283.csv"
out_clinical_data_file = "../../output/preprocess/Coronary_artery_disease/clinical_data/GSE250283.csv"
json_path = "../../output/preprocess/Coronary_artery_disease/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Transcriptional profiles associated with coronary artery disease in Type 2 diabetes mellitus"
!Series_summary	"Coronary artery disease (CAD) is a common complication of Type 2 diabetes mellitus (T2DM). Understanding the pathogenesis of this complication is essential in both diagnosis and management. Thus, this study aimed to characterize the presence of CAD in T2DM using molecular markers and pathway analyses."
!Series_summary	"Total RNA from peripheral blood mononuclear cells (PBMCs) underwent whole transcriptomic profiling using the Illumina HumanHT-12 v4.0 expression beadchip. Differential gene expression with gene ontogeny analyses was performed, with supporting correlational analyses using weighted correlation network analysis (WGCNA)"
!Series_overall_design	"The study is a sex- and age-frequency matched case-control design comparing 23 unrelated adult Filipinos with T2DM-CAD to 23 controls (DM with CAD)."
Sample Characteristics Dictionary:
{

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Determine gene expression data availability
# The series description mentions "whole transcriptomic profiling using the Illumina HumanHT-12 v4.0 expression beadchip"
# which indicates gene expression data is available
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Determine the keys for trait, age, and gender in the sample characteristics

# For trait (coronary artery disease):
# Looking at the background info, this is a study comparing T2DM-CAD to controls
# Based on the study design, CAD status is likely contained in key 3 (comorbidity)
# Even though CAD isn't explicitly mentioned in the preview, the study's primary focus 
# is on coronary artery disease in T2DM patients
trait_row = 3

# For gender:
# Key 1 has 'gender: Female', 'gender: Male'
gender_row = 1

# For age:
# There's no age information in the sample characteristics
age_row = None

# 2.2 Define conversion functions for each variable

# Trait conversion function for CAD
def convert_trait(value):
    if not value or ':' not in value:
        return None
    
    comorbidity = value.split(':', 1)[1].strip().lower()
    
    # Based on the study design (T2DM-CAD vs controls with DM without CAD)
    # The exact encoding isn't clear from the limited preview, but we can make an educated guess
    # Based on biomedical knowledge, assume:
    # - Patients with retinopathy are more likely to have CAD (common diabetes complication)
    # - "Healthy" in this context likely means without CAD
    if 'with retinopathy' in comorbidity:
        return 1  # More likely to have CAD
    elif 'healthy' in comorbidity or 'with no' in comorbidity:
        return 0  # Less likely to have CAD
    
    return None

# Gender conversion function
def convert_gender(value):
    if not value or ':' not in value:
        return None
    gender = value.split(':', 1)[1].strip().lower()
    if 'female' in gender:
        return 0
    elif 'male' in gender:
        return 1
    return None

# Age conversion function (not used since age_row is None)
def convert_age(value):
    return None

# 3. Save metadata
is_trait_available = trait_row is not None
validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path, 
                             is_gene_available=is_gene_available, 
                             is_trait_available=is_trait_available)

# 4. Clinical Feature Extraction
# Proceed with extraction since trait_row is not None
if is_trait_available:
    clinical_features = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        gender_row=gender_row,
        convert_gender=convert_gender,
        age_row=age_row,
        convert_age=convert_age
    )
    
    # Preview the extracted clinical features
    preview = preview_df(clinical_features)
    print("Clinical Features Preview:")
    print(preview)
    
    # Save the clinical features to a CSV file
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_features.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


Clinical Features Preview:
{'GSM7976778': [0.0, 0.0], 'GSM7976779': [0.0, 0.0], 'GSM7976780': [0.0, 1.0], 'GSM7976781': [0.0, 1.0], 'GSM7976782': [1.0, 0.0], 'GSM7976783': [1.0, 0.0], 'GSM7976784': [1.0, 0.0], 'GSM7976785': [1.0, 0.0], 'GSM7976786': [0.0, 0.0], 'GSM7976787': [0.0, 0.0], 'GSM7976788': [0.0, 0.0], 'GSM7976789': [0.0, 0.0], 'GSM7976790': [0.0, 0.0], 'GSM7976791': [1.0, 0.0], 'GSM7976792': [0.0, 0.0], 'GSM7976793': [1.0, 1.0], 'GSM7976794': [0.0, 1.0], 'GSM7976795': [0.0, 0.0], 'GSM7976796': [0.0, 1.0], 'GSM7976797': [0.0, 1.0], 'GSM7976798': [0.0, 0.0], 'GSM7976799': [0.0, 1.0], 'GSM7976800': [0.0, 1.0], 'GSM7976801': [0.0, 0.0], 'GSM7976802': [1.0, 0.0], 'GSM7976803': [0.0, 0.0], 'GSM7976804': [0.0, 0.0], 'GSM7976805': [0.0, 0.0], 'GSM7976806': [1.0, 1.0], 'GSM7976807': [1.0, 1.0], 'GSM7976808': [0.0, 1.0], 'GSM7976809': [0.0, 0.0], 'GSM7976810': [0.0, 0.0], 'GSM7976811': [0.0, 0.0], 'GSM7976812': [0.0, 0.0], 'GSM7976813': [0.0, 0.0], 'GSM7976814': [1.0, 1.0], 'GSM797681

### Step 3: Gene Data Extraction

In [4]:
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Set gene availability flag
is_gene_available = True  # Initially assume gene data is available

# First check if the matrix file contains the expected marker
found_marker = False
marker_row = None
try:
    with gzip.open(matrix_file, 'rt') as file:
        for i, line in enumerate(file):
            if "!series_matrix_table_begin" in line:
                found_marker = True
                marker_row = i
                print(f"Found the matrix table marker at line {i}")
                break
    
    if not found_marker:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False
        
    # If marker was found, try to extract gene data
    if is_gene_available:
        try:
            # Try using the library function
            gene_data = get_genetic_data(matrix_file)
            
            if gene_data.shape[0] == 0:
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
            else:
                print(f"Gene data shape: {gene_data.shape}")
                # Print the first 20 gene/probe identifiers
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
        except Exception as e:
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False
    
    # If gene data extraction failed, examine file content to diagnose
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            with gzip.open(matrix_file, 'rt') as file:
                # Print lines around the marker if found
                if marker_row is not None:
                    for i, line in enumerate(file):
                        if i >= marker_row - 2 and i <= marker_row + 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        if i > marker_row + 10:
                            break
                else:
                    # If marker not found, print first 10 lines
                    for i, line in enumerate(file):
                        if i < 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        else:
                            break
        except Exception as e2:
            print(f"Error examining file: {e2}")
        
except Exception as e:
    print(f"Error processing file: {e}")
    is_gene_available = False

# Update validation information if gene data extraction failed
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    # Update the validation record since gene data isn't available
    is_trait_available = False  # We already determined trait data isn't available in step 2
    validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path,
                                 is_gene_available=is_gene_available, is_trait_available=is_trait_available)


SOFT file: ../../input/GEO/Coronary_artery_disease/GSE250283/GSE250283_family.soft.gz
Matrix file: ../../input/GEO/Coronary_artery_disease/GSE250283/GSE250283_series_matrix.txt.gz
Found the matrix table marker at line 71


Gene data shape: (33427, 56)
First 20 gene/probe identifiers:
['ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209', 'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229', 'ILMN_1651230', 'ILMN_1651232', 'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253', 'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651268', 'ILMN_1651278', 'ILMN_1651279', 'ILMN_1651281', 'ILMN_1651282', 'ILMN_1651285']


### Step 4: Gene Identifier Review

In [5]:
# Looking at the gene identifiers, I can see they start with "ILMN_" which indicates they are Illumina probe IDs.
# These are not standard human gene symbols and will need to be mapped to gene symbols for proper analysis.
# Illumina IDs typically need to be converted to Entrez gene IDs or gene symbols.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=3))

# Examine the ID and SYMBOL columns that appear to contain the mapping information
print("\nExamining mapping information (first 5 rows):")
if 'ID' in gene_annotation.columns and 'SYMBOL' in gene_annotation.columns:
    for i in range(min(5, len(gene_annotation))):
        print(f"Row {i}: ID={gene_annotation['ID'].iloc[i]}, SYMBOL={gene_annotation['SYMBOL'].iloc[i]}")
    
    # Check the quality and completeness of the mapping
    non_null_symbols = gene_annotation['SYMBOL'].notna().sum()
    total_rows = len(gene_annotation)
    print(f"\nSYMBOL column completeness: {non_null_symbols}/{total_rows} rows ({non_null_symbols/total_rows:.2%})")
    
    # Identify the columns needed for gene mapping
    print("\nColumns identified for gene mapping:")
    print("- 'ID': Contains Illumina probe IDs (e.g., ILMN_*)")
    print("- 'SYMBOL': Contains gene symbols")
else:
    print("Error: Required mapping columns ('ID' and/or 'SYMBOL') not found in annotation data.")
    print("Available columns:", gene_annotation.columns.tolist())



Gene annotation preview:
Columns in gene annotation: ['ID', 'ARRAY_ADDRESS_ID', 'TRANSCRIPT', 'ILMN_GENE', 'PA_Call', 'TARGETID', 'SPECIES', 'SOURCE', 'SEARCH_KEY', 'SOURCE_REFERENCE_ID', 'REFSEQ_ID', 'UNIGENE_ID', 'ENTREZ_GENE_ID', 'GI', 'ACCESSION', 'SYMBOL', 'PROTEIN_PRODUCT', 'PROBE_TYPE', 'PROBE_START', 'SEQUENCE', 'CHROMOSOME', 'PROBE_CHR_ORIENTATION', 'PROBE_COORDINATES', 'CYTOBAND', 'DEFINITION', 'ONTOLOGY_COMPONENT', 'ONTOLOGY_PROCESS', 'ONTOLOGY_FUNCTION', 'SYNONYMS', 'OBSOLETE_PROBE_ID', 'GB_ACC']
{'ID': ['ILMN_1343061', 'ILMN_1343291', 'ILMN_1343295'], 'ARRAY_ADDRESS_ID': ['2900397', '3450719', '4490161'], 'TRANSCRIPT': ['ILMN_160461', 'ILMN_137991', 'ILMN_137405'], 'ILMN_GENE': ['CY3_HYB:HIGH_1_MM2', 'EEF1A1', 'GAPDH'], 'PA_Call': [1.0, 1.0, 1.0], 'TARGETID': ['CY3_HYB:HIGH_1_MM2', 'EEF1A1', 'GAPDH'], 'SPECIES': ['ILMN Controls', 'Homo sapiens', 'Homo sapiens'], 'SOURCE': ['ILMN_Controls', 'RefSeq', 'RefSeq'], 'SEARCH_KEY': ['cy3_hyb:high_1_mm2', 'NM_001402.4', nan], 'SOU

### Step 6: Gene Identifier Mapping

In [7]:
# 1. First, identify the relevant columns for mapping
# From the gene annotation preview, we see that:
# - 'ID' contains the Illumina probe IDs (e.g., ILMN_*)
# - 'SYMBOL' contains the gene symbols
prob_col = 'ID'
gene_col = 'SYMBOL'

# 2. Get a gene mapping dataframe by extracting the two columns
print("\nExtracting gene mapping from annotation data...")
mapping_df = get_gene_mapping(gene_annotation, prob_col, gene_col)
print(f"Gene mapping dataframe shape: {mapping_df.shape}")
print(f"First 5 rows of mapping data:")
print(mapping_df.head())

# 3. Extract the genetic data (probe expression values)
gene_expr_data = get_genetic_data(matrix_file)
print(f"\nGene expression data shape: {gene_expr_data.shape}")
print(f"First 5 probe IDs: {gene_expr_data.index[:5].tolist()}")

# 4. Convert probe-level measurements to gene-level expression data
print("\nConverting probe-level measurements to gene-level expression data...")
gene_data = apply_gene_mapping(gene_expr_data, mapping_df)
print(f"Gene expression data shape after mapping: {gene_data.shape}")
print("Sample of mapped gene data:")
print(gene_data.iloc[:5, :5])

# 5. Save the processed gene expression data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"\nGene expression data saved to {out_gene_data_file}")



Extracting gene mapping from annotation data...
Gene mapping dataframe shape: (44044, 2)
First 5 rows of mapping data:
             ID                Gene
0  ILMN_1343061  cy3_hyb:high_1_mm2
1  ILMN_1343291              EEF1A1
2  ILMN_1343295               GAPDH
3  ILMN_1343321       negative_0971
4  ILMN_1343339       negative_0953



Gene expression data shape: (33427, 56)
First 5 probe IDs: ['ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209', 'ILMN_1651221', 'ILMN_1651228']

Converting probe-level measurements to gene-level expression data...
Gene expression data shape after mapping: (19609, 56)
Sample of mapped gene data:
       GSM7976778  GSM7976779  GSM7976780  GSM7976781  GSM7976782
Gene                                                             
A1BG     3.953042    3.794302    3.997124    3.624063    4.117292
A2BP1    3.868455    4.073620    4.183542    4.359270    4.165845
A2LD1    5.721705    4.069221    5.230038    4.941044    5.949655
A2M      4.313322    3.782428    3.857348    3.448928    3.690780
A2ML1    4.145858    3.383440    2.982239    4.012935    3.861670



Gene expression data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE250283.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the obtained gene expression data
try:
    print("Normalizing gene symbols...")
    normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
    print(f"Gene data shape after normalization: {normalized_gene_data.shape}")
    
    # Save the normalized gene data
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    normalized_gene_data.to_csv(out_gene_data_file)
    print(f"Normalized gene data saved to {out_gene_data_file}")
    
    # Check if we have valid gene data to proceed
    if normalized_gene_data.shape[0] == 0:
        print("WARNING: Gene data is empty after normalization.")
        is_gene_available = False
        
        # Create an empty dataframe for metadata purposes
        empty_df = pd.DataFrame()
        
        # Log information about this dataset for future reference
        validate_and_save_cohort_info(
            is_final=True,
            cohort=cohort,
            info_path=json_path,
            is_gene_available=is_gene_available,
            is_trait_available=True,  # We determined trait data is available in step 2
            is_biased=True,  # Consider it biased as we can't use it
            df=empty_df,
            note="Gene symbols could not be normalized properly."
        )
        print("Dataset marked as unusable due to lack of valid gene expression data.")
    else:
        # 2. Link the clinical and genetic data if both are available
        is_trait_available = True  # We determined this in step 2
        
        # Load clinical data
        try:
            clinical_features = pd.read_csv(out_clinical_data_file, index_col=0)  # Set index_col=0 to properly load the data
            print(f"Loaded clinical data shape: {clinical_features.shape}")
            
            # Debug - inspect column names of clinical_features
            print(f"Clinical features columns: {clinical_features.columns.tolist()}")
            
            # Link the clinical and genetic data
            linked_data = geo_link_clinical_genetic_data(clinical_features, normalized_gene_data)
            print(f"Initial linked data shape: {linked_data.shape}")
            
            # Debug - inspect column names of linked_data
            print(f"Linked data columns: {linked_data.columns[:10].tolist()}")  # Show first 10 columns
            
            # 3. Handle missing values
            print("Handling missing values...")
            # Use the trait variable directly, which should contain the correct trait name
            linked_data = handle_missing_values(linked_data, trait)
            print(f"Linked data shape after handling missing values: {linked_data.shape}")
            
            if linked_data.shape[0] > 0:
                # 4. Check for bias in trait and demographic features
                print("Checking for bias in features...")
                is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
                
                # 5. Validate data quality and save cohort info
                is_usable = validate_and_save_cohort_info(
                    is_final=True,
                    cohort=cohort,
                    info_path=json_path,
                    is_gene_available=is_gene_available,
                    is_trait_available=is_trait_available,
                    is_biased=is_biased,
                    df=linked_data,
                    note="Successfully processed gene expression data for coronary artery disease."
                )
                
                # 6. Save the linked data if it's usable
                if is_usable:
                    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
                    linked_data.to_csv(out_data_file)
                    print(f"Linked data saved to {out_data_file}")
                else:
                    print("Data not usable for trait study - not saving final linked data.")
            else:
                print("After handling missing values, no samples remain.")
                validate_and_save_cohort_info(
                    is_final=True,
                    cohort=cohort,
                    info_path=json_path,
                    is_gene_available=is_gene_available,
                    is_trait_available=is_trait_available,
                    is_biased=True,
                    df=pd.DataFrame(),
                    note="No valid samples after handling missing values."
                )
        except Exception as e:
            print(f"Error linking clinical and genetic data: {e}")
            import traceback
            traceback.print_exc()  # Print full traceback for better debugging
            validate_and_save_cohort_info(
                is_final=True,
                cohort=cohort,
                info_path=json_path,
                is_gene_available=is_gene_available,
                is_trait_available=is_trait_available,
                is_biased=True,
                df=pd.DataFrame(),
                note=f"Error during data linking: {str(e)}"
            )
except Exception as e:
    print(f"Error in gene symbol normalization: {e}")
    import traceback
    traceback.print_exc()  # Print full traceback for better debugging
    
    # Log the error and mark the dataset as unusable
    validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=False,  # Consider gene data unavailable if we had an error
        is_trait_available=True,  # We determined trait data is available in step 2
        is_biased=True,  # Consider it biased as we can't use it
        df=pd.DataFrame(),  # Empty dataframe for metadata
        note=f"Error during gene symbol normalization: {str(e)}"
    )

Normalizing gene symbols...
Gene data shape after normalization: (18433, 56)


Normalized gene data saved to ../../output/preprocess/Coronary_artery_disease/gene_data/GSE250283.csv
Loaded clinical data shape: (2, 55)
Clinical features columns: ['GSM7976779', 'GSM7976780', 'GSM7976781', 'GSM7976782', 'GSM7976783', 'GSM7976784', 'GSM7976785', 'GSM7976786', 'GSM7976787', 'GSM7976788', 'GSM7976789', 'GSM7976790', 'GSM7976791', 'GSM7976792', 'GSM7976793', 'GSM7976794', 'GSM7976795', 'GSM7976796', 'GSM7976797', 'GSM7976798', 'GSM7976799', 'GSM7976800', 'GSM7976801', 'GSM7976802', 'GSM7976803', 'GSM7976804', 'GSM7976805', 'GSM7976806', 'GSM7976807', 'GSM7976808', 'GSM7976809', 'GSM7976810', 'GSM7976811', 'GSM7976812', 'GSM7976813', 'GSM7976814', 'GSM7976815', 'GSM7976816', 'GSM7976817', 'GSM7976818', 'GSM7976819', 'GSM7976820', 'GSM7976821', 'GSM7976822', 'GSM7976823', 'GSM7976824', 'GSM7976825', 'GSM7976826', 'GSM7976827', 'GSM7976828', 'GSM7976829', 'GSM7976830', 'GSM7976831', 'GSM7976832', 'GSM7976833']
Initial linked data shape: (56, 18435)
Linked data columns: [0.0

Traceback (most recent call last):
  File "/tmp/ipykernel_74302/3819971687.py", line 54, in <module>
    linked_data = handle_missing_values(linked_data, trait)
  File "/media/techt/DATA/GenoAgent/tools/preprocess.py", line 430, in handle_missing_values
    df = df.dropna(subset=[trait_col])
  File "/home/techt/anaconda3/envs/agent/lib/python3.10/site-packages/pandas/core/frame.py", line 6670, in dropna
    raise KeyError(np.array(subset)[check].tolist())
KeyError: ['Coronary_artery_disease']
