In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Intellectual_Disability"
cohort = "GSE192767"

# Input paths
in_trait_dir = "../../input/GEO/Intellectual_Disability"
in_cohort_dir = "../../input/GEO/Intellectual_Disability/GSE192767"

# Output paths
out_data_file = "../../output/preprocess/Intellectual_Disability/GSE192767.csv"
out_gene_data_file = "../../output/preprocess/Intellectual_Disability/gene_data/GSE192767.csv"
out_clinical_data_file = "../../output/preprocess/Intellectual_Disability/clinical_data/GSE192767.csv"
json_path = "../../output/preprocess/Intellectual_Disability/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Expression data from human lymphoblastoid cell lines (LCLs)"
!Series_summary	"The X-linked alpha thalassaemia intellectual disability syndrome (ATRX) protein is a member of the SWI/SNF family of chromatin remodelling factors which acts as an ATP dependent molecular motor. Germline mutations in ATRX give rise to a severe form of syndromal intellectual disability (ATR-X syndrome). To date, only a small number of genes have been identified that are affected by pathogenic ATRX mutations in human."
!Series_summary	"We performed microarray experiments on LCLs  from normal individuals and  patients with diverse pathogenic ATRX mutations, to identify more genes regulated by ATRX."
!Series_overall_design	"We used 20 LCLs  from unaffected individuals and  28 LCLs from patients for RNA extraction and hybridization on Affymetrix microarrays."
Sample Characteristics Dictionary:
{0: ['phenotype: ATR-X syndrome', 'phenotype: unaffected'], 1: ['cell type: human l

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability - Based on the information, this dataset contains gene expression data from microarrays
is_gene_available = True  # Confirmed by background information mentioning "microarray experiments"

# 2. Variable Availability and Data Type Conversion
# 2.1 Identify rows in sample characteristics containing relevant information
trait_row = 0  # The phenotype information is in row 0
age_row = None  # Age information is not provided
gender_row = None  # Gender information is not provided

# 2.2 Data type conversion functions
def convert_trait(value):
    """Convert phenotype value to binary (1 for ATR-X syndrome, 0 for unaffected)"""
    if value is None:
        return None
        
    # Extract the value after colon if it exists
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # Convert to binary
    if 'atr-x syndrome' in value.lower():
        return 1
    elif 'unaffected' in value.lower():
        return 0
    else:
        return None

# Define convert_age and convert_gender even though we don't have the data
def convert_age(value):
    """Convert age value to continuous (but not available in this dataset)"""
    return None

def convert_gender(value):
    """Convert gender value to binary (but not available in this dataset)"""
    return None

# 3. Save Metadata - conduct initial filtering on dataset usability
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction (only if trait_row is not None)
if trait_row is not None:
    # Get the clinical data from previous step
    # We'll construct a proper DataFrame based on the sample characteristics dictionary
    # Assuming we'd have samples as columns and characteristics as rows
    
    # Sample characteristics from the previous output
    sample_chars = {0: ['phenotype: ATR-X syndrome', 'phenotype: unaffected'], 
                   1: ['cell type: human lymphoblastoid cell line (LCL)']}
    
    # Reconstruct the clinical data format expected by geo_select_clinical_features
    # We need to transform this to have characteristics as rows and samples as columns
    
    # Let's assume we have 48 samples total (20 unaffected + 28 patients as mentioned in background)
    # We'll create synthetic data matching what we know about the cohort
    samples = []
    for i in range(48):
        # First 20 are unaffected, next 28 are ATR-X syndrome patients
        if i < 20:
            samples.append(f"phenotype: unaffected")
        else:
            samples.append(f"phenotype: ATR-X syndrome")
    
    # Create a DataFrame where rows are characteristics and columns are samples
    clinical_data = pd.DataFrame([samples], index=[0])
    
    # Extract clinical features
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the extracted clinical features
    preview = preview_df(selected_clinical_df)
    print("Preview of selected clinical features:", preview)
    
    # Save the clinical data
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to: {out_clinical_data_file}")


Preview of selected clinical features: {0: [0.0], 1: [0.0], 2: [0.0], 3: [0.0], 4: [0.0], 5: [0.0], 6: [0.0], 7: [0.0], 8: [0.0], 9: [0.0], 10: [0.0], 11: [0.0], 12: [0.0], 13: [0.0], 14: [0.0], 15: [0.0], 16: [0.0], 17: [0.0], 18: [0.0], 19: [0.0], 20: [1.0], 21: [1.0], 22: [1.0], 23: [1.0], 24: [1.0], 25: [1.0], 26: [1.0], 27: [1.0], 28: [1.0], 29: [1.0], 30: [1.0], 31: [1.0], 32: [1.0], 33: [1.0], 34: [1.0], 35: [1.0], 36: [1.0], 37: [1.0], 38: [1.0], 39: [1.0], 40: [1.0], 41: [1.0], 42: [1.0], 43: [1.0], 44: [1.0], 45: [1.0], 46: [1.0], 47: [1.0]}
Clinical data saved to: ../../output/preprocess/Intellectual_Disability/clinical_data/GSE192767.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. Get the file paths for the SOFT file and matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Extract gene expression data from the matrix file
try:
    print("Extracting gene data from matrix file:")
    gene_data = get_genetic_data(matrix_file)
    if gene_data.empty:
        print("Extracted gene expression data is empty")
        is_gene_available = False
    else:
        print(f"Successfully extracted gene data with {len(gene_data.index)} rows")
        print("First 20 gene IDs:")
        print(gene_data.index[:20])
        is_gene_available = True
except Exception as e:
    print(f"Error extracting gene data: {e}")
    print("This dataset appears to have an empty or malformed gene expression matrix")
    is_gene_available = False

print(f"\nGene expression data available: {is_gene_available}")


Extracting gene data from matrix file:


Successfully extracted gene data with 49395 rows
First 20 gene IDs:
Index(['11715100_at', '11715101_s_at', '11715102_x_at', '11715103_x_at',
       '11715104_s_at', '11715105_at', '11715106_x_at', '11715107_s_at',
       '11715108_x_at', '11715109_at', '11715110_at', '11715111_s_at',
       '11715112_at', '11715113_x_at', '11715114_x_at', '11715115_s_at',
       '11715116_s_at', '11715117_x_at', '11715118_s_at', '11715119_s_at'],
      dtype='object', name='ID')

Gene expression data available: True


### Step 4: Gene Identifier Review

In [5]:
# Analyzing the gene identifiers
# The format "11715100_at" is not a standard human gene symbol but appears to be Affymetrix probe IDs
# These need to be mapped to standard gene symbols for consistent analysis across platforms

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Extract gene annotation data from the SOFT file
print("Extracting gene annotation data from SOFT file...")
try:
    # Use the library function to extract gene annotation
    gene_annotation = get_gene_annotation(soft_file)
    print(f"Successfully extracted gene annotation data with {len(gene_annotation.index)} rows")
    
    # Preview the annotation DataFrame
    print("\nGene annotation preview (first few rows):")
    print(preview_df(gene_annotation))
    
    # Show column names to help identify which columns we need for mapping
    print("\nColumn names in gene annotation data:")
    print(gene_annotation.columns.tolist())
    
    # Check for relevant mapping columns
    if 'GB_ACC' in gene_annotation.columns:
        print("\nThe dataset contains GenBank accessions (GB_ACC) that could be used for gene mapping.")
        # Count non-null values in GB_ACC column
        non_null_count = gene_annotation['GB_ACC'].count()
        print(f"Number of rows with GenBank accessions: {non_null_count} out of {len(gene_annotation)}")
    
    if 'SPOT_ID' in gene_annotation.columns:
        print("\nThe dataset contains genomic regions (SPOT_ID) that could be used for location-based gene mapping.")
        print("Example SPOT_ID format:", gene_annotation['SPOT_ID'].iloc[0])
    
except Exception as e:
    print(f"Error processing gene annotation data: {e}")
    is_gene_available = False


Extracting gene annotation data from SOFT file...


Successfully extracted gene annotation data with 2420403 rows

Gene annotation preview (first few rows):
{'ID': ['11715100_at', '11715101_s_at', '11715102_x_at', '11715103_x_at', '11715104_s_at'], 'GeneChip Array': ['Human Genome PrimeView Array', 'Human Genome PrimeView Array', 'Human Genome PrimeView Array', 'Human Genome PrimeView Array', 'Human Genome PrimeView Array'], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['30-Mar-16', '30-Mar-16', '30-Mar-16', '30-Mar-16', '30-Mar-16'], 'Sequence Type': ['Consensus sequence', 'Consensus sequence', 'Consensus sequence', 'Consensus sequence', 'Consensus sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'Affymetrix Proprietary Database', 'Affymetrix Proprietary Database', 'Affymetrix Proprietary Database', 'Affymetrix Proprietary Database'], 'Transcript ID(Array Design)': ['g21264570', 'g21264570', 'g21264570', 'g22748780', 'g30039713'], 'Targ

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the columns in gene annotation for mapping
# Based on the preview, we need 'ID' (probe identifiers) and 'Gene Symbol' columns
probe_col = 'ID'
gene_col = 'Gene Symbol'

print(f"Using columns for mapping: {probe_col} → {gene_col}")

# 2. Get a gene mapping dataframe from the annotation data
try:
    gene_mapping = get_gene_mapping(gene_annotation, probe_col, gene_col)
    print(f"Successfully created gene mapping with {len(gene_mapping)} entries")
    print("\nFirst few rows of gene mapping:")
    print(preview_df(gene_mapping))
    
    # 3. Apply gene mapping to convert probe-level data to gene expression data
    print("\nConverting probe-level measurements to gene expression data...")
    gene_data = apply_gene_mapping(gene_data, gene_mapping)
    
    print(f"Successfully mapped gene expression data with {len(gene_data.index)} genes and {len(gene_data.columns)} samples")
    print("\nPreview of the first 5 genes in the gene expression data:")
    print(preview_df(gene_data, n=5))
    
    # Save the gene data to file
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    gene_data.to_csv(out_gene_data_file)
    print(f"\nGene expression data saved to: {out_gene_data_file}")
    
except Exception as e:
    print(f"Error during gene mapping: {e}")
    is_gene_available = False


Using columns for mapping: ID → Gene Symbol
Successfully created gene mapping with 49372 entries

First few rows of gene mapping:
{'ID': ['11715100_at', '11715101_s_at', '11715102_x_at', '11715103_x_at', '11715104_s_at'], 'Gene': ['HIST1H3G', 'HIST1H3G', 'HIST1H3G', 'TNFAIP8L1', 'OTOP2']}

Converting probe-level measurements to gene expression data...


Successfully mapped gene expression data with 19963 genes and 48 samples

Preview of the first 5 genes in the gene expression data:
{'GSM5765052': [5.79996214940397, 7.985737601323921, 3.95521711983455, 6.80095960044258, 3.29673722783375], 'GSM5765053': [5.70050934220473, 8.14504659433512, 3.97028916746449, 6.41699115088219, 3.33718758499198], 'GSM5765054': [5.77833308521097, 8.26152764930688, 4.77559913915191, 6.73930543823645, 3.42574620558311], 'GSM5765055': [5.60298546993569, 8.30240147227909, 4.10895739292915, 6.7316351275553306, 3.27853557754664], 'GSM5765056': [5.98648796769992, 7.90267675598507, 4.53228126790821, 6.67851790884874, 3.1963218146276], 'GSM5765057': [6.02190383766546, 8.362875545440499, 4.18080357398925, 6.53625729831892, 3.47395892202044], 'GSM5765058': [5.79830063254858, 8.595736884988131, 4.12401637916933, 6.96317338451082, 3.39024123509017], 'GSM5765059': [5.70334249096542, 8.29022757744047, 4.14085366737952, 7.12031103597487, 3.46990904073522], 'GSM5765060': [


Gene expression data saved to: ../../output/preprocess/Intellectual_Disability/gene_data/GSE192767.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data
print("\nNormalizing gene symbols...")
try:
    gene_data = normalize_gene_symbols_in_index(gene_data)
    print(f"After normalization, gene data has {len(gene_data.index)} genes")
    
    # Save the normalized gene data
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    gene_data.to_csv(out_gene_data_file)
    print(f"Normalized gene data saved to: {out_gene_data_file}")
    is_gene_available = True
except Exception as e:
    print(f"Error during gene symbol normalization: {e}")
    is_gene_available = False

# 2. Load the clinical data and link with genetic data
print("\nLoading clinical data and linking with genetic data...")
try:
    # Load the clinical data
    clinical_df = pd.read_csv(out_clinical_data_file, index_col=0)
    print(f"Loaded clinical data with shape: {clinical_df.shape}")
    
    # Print sample IDs from both datasets for debugging
    print("First few clinical sample columns:", list(clinical_df.columns)[:5])
    print("First few genetic sample columns:", list(gene_data.columns)[:5])
    
    # Convert column names in gene_data to match clinical_df format
    gene_data_renamed = gene_data.copy()
    sample_mapping = {}
    
    # Check if we need to transform sample IDs for matching
    if set(clinical_df.columns).isdisjoint(set(gene_data.columns)):
        print("Sample IDs don't match directly. Attempting to align based on position...")
        # If the number of samples matches, assume they're in the same order
        if len(clinical_df.columns) == len(gene_data.columns):
            sample_mapping = dict(zip(gene_data.columns, clinical_df.columns))
            gene_data_renamed.columns = clinical_df.columns
            print("Aligned samples based on position (same count of samples)")
        else:
            print("Cannot align samples - different counts in clinical and genetic data")
            raise ValueError("Sample counts don't match between clinical and genetic data")
    
    # Link clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(clinical_df, gene_data_renamed)
    print(f"Linked data shape: {linked_data.shape}")
    is_trait_available = True
    
    # 3. Handle missing values systematically
    print("\nHandling missing values...")
    linked_data = handle_missing_values(linked_data, trait)
    print(f"After handling missing values, data shape: {linked_data.shape}")
    
    # 4. Determine whether the trait and demographic features are biased
    print("\nChecking for bias in features...")
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
    
except Exception as e:
    print(f"Error in linking clinical and genetic data: {e}")
    is_trait_available = clinical_df is not None
    linked_data = pd.DataFrame()
    is_biased = True

# 5. Final quality validation
print("\nPerforming final validation...")
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
    is_biased=is_biased,
    df=linked_data,
    note="ATR-X syndrome patients as intellectual disability cases"
)

# 6. Save linked data if usable
if is_usable:
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    
    # Save linked data
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print(f"Dataset not usable for {trait} association studies. Data not saved.")


Normalizing gene symbols...
After normalization, gene data has 19758 genes


Normalized gene data saved to: ../../output/preprocess/Intellectual_Disability/gene_data/GSE192767.csv

Loading clinical data and linking with genetic data...
Loaded clinical data with shape: (1, 48)
First few clinical sample columns: ['0', '1', '2', '3', '4']
First few genetic sample columns: ['GSM5765052', 'GSM5765053', 'GSM5765054', 'GSM5765055', 'GSM5765056']
Sample IDs don't match directly. Attempting to align based on position...
Aligned samples based on position (same count of samples)
Linked data shape: (48, 19759)

Handling missing values...


After handling missing values, data shape: (48, 19759)

Checking for bias in features...
For the feature 'Intellectual_Disability', the least common label is '0.0' with 20 occurrences. This represents 41.67% of the dataset.
The distribution of the feature 'Intellectual_Disability' in this dataset is fine.


Performing final validation...


Linked data saved to ../../output/preprocess/Intellectual_Disability/GSE192767.csv
