In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Intellectual_Disability"
cohort = "GSE285666"

# Input paths
in_trait_dir = "../../input/GEO/Intellectual_Disability"
in_cohort_dir = "../../input/GEO/Intellectual_Disability/GSE285666"

# Output paths
out_data_file = "../../output/preprocess/Intellectual_Disability/GSE285666.csv"
out_gene_data_file = "../../output/preprocess/Intellectual_Disability/gene_data/GSE285666.csv"
out_clinical_data_file = "../../output/preprocess/Intellectual_Disability/clinical_data/GSE285666.csv"
json_path = "../../output/preprocess/Intellectual_Disability/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Exon- and gene-Level transcriptional profiling in Lymphoblastoid Cell Lines (LCLs) from Williams syndrome patients and controls"
!Series_summary	"Williams syndrome (WS), characterized by positive sociality, provides a unique model for studying transcriptional networks underlying social dysfunction, relevant to disorders like autism spectrum disorder (ASD) and schizophrenia (SCHZ). In a cohort lymphoblastoid cell lines derived from 52 individuals (34 WS patients, 18 parental controls), genome-wide exon-level arrays identified a core set of differentially expressed genes (DEGs), with WS-deleted genes ranking among the top transcripts. Findings were validated by PCR, RNA-seq, and western blots."
!Series_summary	"Network analyses revealed perturbed actin cytoskeletal signaling in excitatory dendritic spines, alongside interactions in MAPK, IGF1-PI3K-AKT-mTOR/insulin, and synaptic actin pathways. These transcriptional networks show parallels to ASD and

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import json
from typing import Callable, Optional, Dict, Any

# 1. Gene Expression Data Availability
# Based on the background information, this is an "Exon and gene-Level transcriptional profiling" study
# using "Affymetrix Human Exon 1.0 ST arrays", so it contains gene expression data
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# Looking at the sample characteristics dictionary
# The trait is Intellectual Disability, which can be inferred from Williams Syndrome in row 0
trait_row = 0  
age_row = None  # No age information available in the sample characteristics
gender_row = None  # No gender information available in the sample characteristics

# 2.2 Data Type Conversion
def convert_trait(value):
    if pd.isna(value):
        return None
    value = value.strip().lower() if isinstance(value, str) else str(value).strip().lower()
    
    # Extract the value after the colon if present
    if ":" in value:
        value = value.split(":", 1)[1].strip()
    
    # Williams syndrome patients typically have intellectual disability
    if "williams syndrome" in value or "patient" in value:
        return 1  # Intellectual disability present
    elif "unaffected" in value or "control" in value:
        return 0  # No intellectual disability
    else:
        return None  # Unknown or not applicable

def convert_age(value):
    # This function is not used as age data is not available
    return None

def convert_gender(value):
    # This function is not used as gender data is not available
    return None

# 3. Save Metadata - Initial Filtering
# Determine if trait data is available
is_trait_available = trait_row is not None

# Save the initial filtering metadata
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Check if trait data is available before proceeding
if trait_row is not None:
    # Load or access the actual clinical data
    # Assuming clinical_data should be loaded from a previous step or created from the sample characteristics
    clinical_data = pd.DataFrame({0: ['disease state: unaffected parental control', 'disease state: Williams syndrome patient']})
    
    # Use the geo_select_clinical_features function to extract clinical features
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,  # Using the provided trait variable (Intellectual_Disability)
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the selected clinical dataframe
    print("Preview of the selected clinical dataframe:")
    preview = preview_df(selected_clinical_df)
    print(preview)
    
    # Save the clinical dataframe to CSV
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of the selected clinical dataframe:
{0: [0.0]}
Clinical data saved to ../../output/preprocess/Intellectual_Disability/clinical_data/GSE285666.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. Get the file paths for the SOFT file and matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Extract gene expression data from the matrix file
try:
    print("Extracting gene data from matrix file:")
    gene_data = get_genetic_data(matrix_file)
    if gene_data.empty:
        print("Extracted gene expression data is empty")
        is_gene_available = False
    else:
        print(f"Successfully extracted gene data with {len(gene_data.index)} rows")
        print("First 20 gene IDs:")
        print(gene_data.index[:20])
        is_gene_available = True
except Exception as e:
    print(f"Error extracting gene data: {e}")
    print("This dataset appears to have an empty or malformed gene expression matrix")
    is_gene_available = False

print(f"\nGene expression data available: {is_gene_available}")


Extracting gene data from matrix file:
Successfully extracted gene data with 22011 rows
First 20 gene IDs:
Index(['2315554', '2315633', '2315674', '2315739', '2315894', '2315918',
       '2315951', '2316218', '2316245', '2316379', '2316558', '2316605',
       '2316746', '2316905', '2316953', '2317246', '2317317', '2317434',
       '2317472', '2317512'],
      dtype='object', name='ID')

Gene expression data available: True


### Step 4: Gene Identifier Review

In [5]:
# Observe the gene identifiers in the gene expression data
# These appear to be probe IDs (numeric identifiers) rather than standard human gene symbols
# Human gene symbols typically follow naming conventions like BRCA1, TP53, etc.
# These numeric IDs (like 2315554) need to be mapped to human gene symbols

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Extract gene annotation data from the SOFT file
print("Extracting gene annotation data from SOFT file...")
try:
    # Use the library function to extract gene annotation
    gene_annotation = get_gene_annotation(soft_file)
    print(f"Successfully extracted gene annotation data with {len(gene_annotation.index)} rows")
    
    # Preview the annotation DataFrame
    print("\nGene annotation preview (first few rows):")
    print(preview_df(gene_annotation))
    
    # Show column names to help identify which columns we need for mapping
    print("\nColumn names in gene annotation data:")
    print(gene_annotation.columns.tolist())
    
    # Check for relevant mapping columns
    if 'GB_ACC' in gene_annotation.columns:
        print("\nThe dataset contains GenBank accessions (GB_ACC) that could be used for gene mapping.")
        # Count non-null values in GB_ACC column
        non_null_count = gene_annotation['GB_ACC'].count()
        print(f"Number of rows with GenBank accessions: {non_null_count} out of {len(gene_annotation)}")
    
    if 'SPOT_ID' in gene_annotation.columns:
        print("\nThe dataset contains genomic regions (SPOT_ID) that could be used for location-based gene mapping.")
        print("Example SPOT_ID format:", gene_annotation['SPOT_ID'].iloc[0])
    
except Exception as e:
    print(f"Error processing gene annotation data: {e}")
    is_gene_available = False


Extracting gene annotation data from SOFT file...


Successfully extracted gene annotation data with 1461543 rows

Gene annotation preview (first few rows):
{'ID': ['2315100', '2315106', '2315109', '2315111', '2315113'], 'GB_LIST': ['NR_024005,NR_034090,NR_024004,AK093685', 'DQ786314', nan, nan, 'DQ786265'], 'SPOT_ID': ['chr1:11884-14409', 'chr1:14760-15198', 'chr1:19408-19712', 'chr1:25142-25532', 'chr1:27563-27813'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': ['11884', '14760', '19408', '25142', '27563'], 'RANGE_STOP': ['14409', '15198', '19712', '25532', '27813'], 'total_probes': ['20', '8', '4', '4', '4'], 'gene_assignment': ['NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 9 // 15q26.3 // 100288486 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the appropriate columns for gene mapping
print("Identifying gene mapping columns...")
# The 'ID' column in gene_annotation matches the index in gene_data (which are probe IDs)
# The 'gene_assignment' column contains gene symbols and other information that needs parsing

# Function to extract gene symbols from the gene_assignment field
def extract_gene_symbols(text):
    if pd.isna(text) or text == '---':
        return []
    
    genes = []
    # Parse gene information from format: "ACCESSION // SYMBOL // DESCRIPTION"
    parts = text.split('///')
    for part in parts:
        if '//' in part:
            elements = [e.strip() for e in part.split('//')]
            if len(elements) >= 2 and elements[1] and elements[1] != '---':
                genes.append(elements[1])
    
    return genes

# 2. Create a mapping dataframe
mapping_data = gene_annotation[['ID', 'gene_assignment']].copy()
mapping_data = mapping_data.rename(columns={'gene_assignment': 'Gene'})

# Apply function to extract gene symbols from gene_assignment
print("Extracting gene symbols from annotation data...")
mapping_data['Gene'] = mapping_data['Gene'].apply(extract_gene_symbols)

# Ensure ID column is string type
mapping_data['ID'] = mapping_data['ID'].astype(str)
gene_data.index = gene_data.index.astype(str)

# Debug: Check how many IDs have at least one gene symbol
non_empty_symbols = mapping_data[mapping_data['Gene'].apply(len) > 0]
print(f"Found {len(non_empty_symbols)} probes with at least one gene symbol")

# Remove rows with empty gene lists
mapping_data = mapping_data[mapping_data['Gene'].apply(len) > 0]

print(f"Created gene mapping with {len(mapping_data)} rows")
print("Sample of gene mapping:")
print(mapping_data.head())

# Debug: Check overlap between probe IDs in mapping and expression data
probe_ids_in_mapping = set(mapping_data['ID'].values)
probe_ids_in_expression = set(gene_data.index)
overlap = probe_ids_in_mapping.intersection(probe_ids_in_expression)
print(f"Number of probe IDs in mapping: {len(probe_ids_in_mapping)}")
print(f"Number of probe IDs in expression data: {len(probe_ids_in_expression)}")
print(f"Number of overlapping probe IDs: {len(overlap)}")

# Debug: Print some sample overlapping IDs
sample_overlap = list(overlap)[:5]
print(f"Sample overlapping IDs: {sample_overlap}")

# Debug: Check if these IDs actually exist in both dataframes
for id in sample_overlap:
    in_mapping = id in mapping_data['ID'].values
    in_expression = id in gene_data.index
    print(f"ID {id}: In mapping={in_mapping}, In expression={in_expression}")

# Make a copy of gene_data to ensure it's not modified before mapping
gene_data_orig = gene_data.copy()

# 3. Apply gene mapping to convert from probe-level to gene-level expression
print("\nApplying gene mapping to convert probe-level to gene-level expression...")
gene_data = apply_gene_mapping(gene_data_orig, mapping_data)

print(f"After mapping: Created gene expression data with {gene_data.shape[0]} genes and {gene_data.shape[1]} samples")
if gene_data.shape[0] > 0:
    print("First few gene symbols:")
    print(gene_data.index[:5])
else:
    print("WARNING: No genes were mapped! Using the first approach but with simplified mapping...")
    
    # Simplify the mapping to focus on direct ID to gene relationship
    simpler_mapping = mapping_data.explode('Gene')
    simpler_mapping = simpler_mapping.dropna(subset=['Gene'])
    simpler_mapping = simpler_mapping[simpler_mapping['Gene'] != '']
    
    print(f"Created simplified mapping with {len(simpler_mapping)} entries")
    print("Sample of simplified mapping:")
    print(simpler_mapping.head())
    
    # Try mapping again with simplified approach
    gene_data = apply_gene_mapping(gene_data_orig, simpler_mapping)
    print(f"After simplified mapping: Gene expression data has {gene_data.shape[0]} genes")
    if gene_data.shape[0] > 0:
        print("First few gene symbols:")
        print(gene_data.index[:5])

# Normalize gene symbols only if we have genes
if gene_data.shape[0] > 0:
    print("\nNormalizing gene symbols...")
    gene_data = normalize_gene_symbols_in_index(gene_data)
    print(f"After normalization: Gene expression data has {gene_data.shape[0]} genes")

# Save the gene expression data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Identifying gene mapping columns...
Extracting gene symbols from annotation data...


Found 33475 probes with at least one gene symbol


Created gene mapping with 33475 rows
Sample of gene mapping:
         ID                                               Gene
0   2315100               [DDX11L2, DDX11L9, DDX11L2, DDX11L2]
10  2315125  [OR4F17, OR4F4, OR4F5, OR4F17, OR4F4, OR4F5, O...
14  2315147         [LOC100288692, LOC100289383, LOC100506283]
15  2315160                 [FLJ45445, FLJ45445, LOC100133161]
16  2315163  [LOC100132062, LOC728417, LOC100133331, NCRNA0...
Number of probe IDs in mapping: 33475
Number of probe IDs in expression data: 22011
Number of overlapping probe IDs: 17634
Sample overlapping IDs: ['2493746', '2657228', '3817222', '3347658', '2773872']
ID 2493746: In mapping=True, In expression=True
ID 2657228: In mapping=True, In expression=True
ID 3817222: In mapping=True, In expression=True
ID 3347658: In mapping=True, In expression=True
ID 2773872: In mapping=True, In expression=True

Applying gene mapping to convert probe-level to gene-level expression...
After mapping: Created gene expression data 

After simplified mapping: Gene expression data has 18609 genes
First few gene symbols:
Index(['A1BG', 'A1BG-AS', 'A1CF', 'A2LD1', 'A2M'], dtype='object', name='Gene')

Normalizing gene symbols...
After normalization: Gene expression data has 18306 genes


Gene expression data saved to ../../output/preprocess/Intellectual_Disability/gene_data/GSE285666.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data
print("Normalizing gene symbols in the expression data...")
try:
    # If previous steps have already loaded gene_data
    normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    
    # Save normalized gene data
    normalized_gene_data.to_csv(out_gene_data_file)
    print(f"Normalized gene data saved to {out_gene_data_file}")
    print(f"Normalized gene data shape: {normalized_gene_data.shape}")
except Exception as e:
    print(f"Error normalizing gene data: {e}")
    is_gene_available = False

# 2. Load clinical data from file and link with genetic data
print("\nLoading clinical data from file...")
try:
    # Load the previously saved clinical data
    selected_clinical_df = pd.read_csv(out_clinical_data_file)
    
    # Set is_trait_available based on whether the clinical data contains the trait
    is_trait_available = True
    print(f"Clinical data shape: {selected_clinical_df.shape}")
    
    # Ensure we have the proper file paths for Step 2 if needed
    soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
    
    # Get the raw clinical data again to work with proper sample IDs
    background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
    clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
    _, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
    
    # Run the clinical feature extraction again
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    print(f"Reprocessed clinical data shape: {selected_clinical_df.shape}")
    
    print("\nLinking clinical and genetic data...")
    # Use the proper helper function to link clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(selected_clinical_df, normalized_gene_data)
    print(f"Linked data shape: {linked_data.shape}")
    
    # Check if we have a valid data shape before proceeding
    if linked_data.shape[0] == 0 or linked_data.shape[1] <= 1:
        raise ValueError("Linked data has insufficient dimensions for analysis")
    
    # 3. Handle missing values systematically
    print("\nHandling missing values...")
    linked_data = handle_missing_values(linked_data, trait)
    print(f"After handling missing values, data shape: {linked_data.shape}")
    
    # 4. Determine whether features are biased
    print("\nChecking for bias in features...")
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
    
    # 5. Final validation and save metadata
    print("\nPerforming final validation...")
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=is_gene_available,
        is_trait_available=is_trait_available,
        is_biased=is_biased,
        df=linked_data,
        note="Williams Syndrome patients as intellectual disability cases, with unaffected parental controls as reference group."
    )
    
    # 6. Save the linked data if usable
    if is_usable:
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        
        # Save linked data
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")
    else:
        print(f"Dataset not usable for {trait} association studies. Data not saved.")

except Exception as e:
    print(f"Error in data linking or processing: {str(e)}")
    # Create a minimal dataframe for validation purposes
    linked_data = pd.DataFrame({trait: [0, 1]})
    
    # Perform final validation with appropriate flags
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=is_gene_available, 
        is_trait_available=is_trait_available,
        is_biased=True,  # Not relevant since data isn't usable
        df=linked_data,
        note="Failed to link gene and clinical data: " + str(e)
    )
    print(f"Dataset usability: {is_usable}")

Normalizing gene symbols in the expression data...


Normalized gene data saved to ../../output/preprocess/Intellectual_Disability/gene_data/GSE285666.csv
Normalized gene data shape: (18306, 52)

Loading clinical data from file...
Clinical data shape: (1, 1)
Reprocessed clinical data shape: (1, 52)

Linking clinical and genetic data...
Linked data shape: (52, 18307)

Handling missing values...


After handling missing values, data shape: (52, 18307)

Checking for bias in features...
For the feature 'Intellectual_Disability', the least common label is '0.0' with 18 occurrences. This represents 34.62% of the dataset.
The distribution of the feature 'Intellectual_Disability' in this dataset is fine.


Performing final validation...


Linked data saved to ../../output/preprocess/Intellectual_Disability/GSE285666.csv
