In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Cardiovascular_Disease"
cohort = "GSE285666"

# Input paths
in_trait_dir = "../../input/GEO/Cardiovascular_Disease"
in_cohort_dir = "../../input/GEO/Cardiovascular_Disease/GSE285666"

# Output paths
out_data_file = "../../output/preprocess/Cardiovascular_Disease/GSE285666.csv"
out_gene_data_file = "../../output/preprocess/Cardiovascular_Disease/gene_data/GSE285666.csv"
out_clinical_data_file = "../../output/preprocess/Cardiovascular_Disease/clinical_data/GSE285666.csv"
json_path = "../../output/preprocess/Cardiovascular_Disease/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Exon- and gene-Level transcriptional profiling in Lymphoblastoid Cell Lines (LCLs) from Williams syndrome patients and controls"
!Series_summary	"Williams syndrome (WS), characterized by positive sociality, provides a unique model for studying transcriptional networks underlying social dysfunction, relevant to disorders like autism spectrum disorder (ASD) and schizophrenia (SCHZ). In a cohort lymphoblastoid cell lines derived from 52 individuals (34 WS patients, 18 parental controls), genome-wide exon-level arrays identified a core set of differentially expressed genes (DEGs), with WS-deleted genes ranking among the top transcripts. Findings were validated by PCR, RNA-seq, and western blots."
!Series_summary	"Network analyses revealed perturbed actin cytoskeletal signaling in excitatory dendritic spines, alongside interactions in MAPK, IGF1-PI3K-AKT-mTOR/insulin, and synaptic actin pathways. These transcriptional networks show parallels to ASD and

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import json
from typing import Optional, Callable, Dict, Any

# 1. Gene Expression Data Availability
# Based on background info, this dataset contains gene expression data from Affymetrix Human Exon 1.0 ST arrays
is_gene_available = True

# 2. Variable Availability and Data Type Conversion

# 2.1 Data Availability
# The sample characteristics dictionary shows only one entry at key 0:
# It contains 'disease state' with two values: 'unaffected parental control' and 'Williams syndrome patient'
# This can be used for our trait (Cardiovascular_Disease)
trait_row = 0  # 'disease state' is at key 0
age_row = None  # No age information available
gender_row = None  # No gender information available

# 2.2 Data Type Conversion Functions

def convert_trait(value: str) -> int:
    """Convert disease state to binary trait value for Cardiovascular_Disease.
    Williams syndrome is associated with cardiovascular abnormalities."""
    if value is None:
        return None
    
    # Extract value after colon if present
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # Convert to binary: Williams syndrome patients are the cases (1), controls are (0)
    if 'williams syndrome patient' in value.lower():
        return 1
    elif 'unaffected' in value.lower() or 'control' in value.lower():
        return 0
    else:
        return None

def convert_age(value: str) -> Optional[float]:
    """Convert age information to numeric value."""
    # Not used as age data is not available
    return None

def convert_gender(value: str) -> Optional[int]:
    """Convert gender information to binary (0: female, 1: male)."""
    # Not used as gender data is not available
    return None

# 3. Save Metadata
# Determine trait availability based on trait_row
is_trait_available = trait_row is not None

# Validate and save initial cohort info
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
if trait_row is not None:
    # Create a DataFrame from the sample characteristics dictionary
    clinical_data = pd.DataFrame({0: ['disease state: unaffected parental control', 
                                     'disease state: Williams syndrome patient']})
    
    # Extract clinical features
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the selected clinical data
    preview = preview_df(selected_clinical_df)
    print("Clinical Data Preview:")
    print(preview)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    
    # Save the selected clinical features to a CSV file
    selected_clinical_df.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


Clinical Data Preview:
{0: [0.0]}
Clinical data saved to ../../output/preprocess/Cardiovascular_Disease/clinical_data/GSE285666.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again 
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")


Matrix file found: ../../input/GEO/Cardiovascular_Disease/GSE285666/GSE285666_series_matrix.txt.gz
Gene data shape: (22011, 52)
First 20 gene/probe identifiers:
Index(['2315554', '2315633', '2315674', '2315739', '2315894', '2315918',
       '2315951', '2316218', '2316245', '2316379', '2316558', '2316605',
       '2316746', '2316905', '2316953', '2317246', '2317317', '2317434',
       '2317472', '2317512'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Examining the gene identifiers
# These appear to be probe IDs or numerical identifiers, not standard human gene symbols
# Human gene symbols typically follow a pattern like GAPDH, TP53, etc.
# These numerical identifiers will need to be mapped to human gene symbols

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Let's look for platform information in the SOFT file to understand the annotation better
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
        if i > 100:  # Limit search to first 100 lines
            print("Platform ID not found in first 100 lines")
            break

# Check if the SOFT file includes any reference to gene symbols
print("\nSearching for gene symbol information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    gene_symbol_lines = []
    for i, line in enumerate(f):
        if 'GENE_SYMBOL' in line or 'gene_symbol' in line.lower() or 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
        if i > 1000 and len(gene_symbol_lines) > 0:  # Limit search but ensure we found something
            break
    
    if gene_symbol_lines:
        print("Found references to gene symbols:")
        for line in gene_symbol_lines[:5]:  # Show just first 5 matches
            print(line)
    else:
        print("No explicit gene symbol references found in first 1000 lines")

# Look for alternative annotation files or references in the directory
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
print([f for f in all_files if 'annotation' in f.lower() or 'platform' in f.lower() or 'gpl' in f.lower()])



Gene annotation preview:
Columns in gene annotation: ['ID', 'GB_LIST', 'SPOT_ID', 'seqname', 'RANGE_GB', 'RANGE_STRAND', 'RANGE_START', 'RANGE_STOP', 'total_probes', 'gene_assignment', 'mrna_assignment', 'category']
{'ID': ['2315100', '2315106', '2315109', '2315111', '2315113'], 'GB_LIST': ['NR_024005,NR_034090,NR_024004,AK093685', 'DQ786314', nan, nan, 'DQ786265'], 'SPOT_ID': ['chr1:11884-14409', 'chr1:14760-15198', 'chr1:19408-19712', 'chr1:25142-25532', 'chr1:27563-27813'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': ['11884', '14760', '19408', '25142', '27563'], 'RANGE_STOP': ['14409', '15198', '19712', '25532', '27813'], 'total_probes': ['20', '8', '4', '4', '4'], 'gene_assignment': ['NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu

No explicit gene symbol references found in first 1000 lines

Checking for additional annotation files in the directory:
[]


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify which columns store the gene identifiers and gene symbols
# From the preview, we can see:
# - 'ID' column contains gene identifiers that match those in gene_data.index
# - 'gene_assignment' column contains gene symbols and annotations

# First, extract the mapping between probe IDs and gene symbols
print("Creating gene mapping...")
mapping_df = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='gene_assignment')
print(f"Initial mapping shape: {mapping_df.shape}")

# Preview the mapping data
print("\nMapping data preview:")
print(preview_df(mapping_df, n=5))

# 2. Apply the gene mapping to convert probe-level measurements to gene expression data
print("\nConverting probe-level measurements to gene expression data...")
gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Gene expression data shape after mapping: {gene_data.shape}")

# Preview the first few rows of the processed gene expression data
print("\nFirst few rows of gene expression data after mapping:")
print(gene_data.head().iloc[:, :5])  # Show only first 5 columns for brevity

# Display a sample of gene symbols after mapping
print("\nSample of gene symbols after mapping:")
print(gene_data.index[:20])

# Save the gene expression data to file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"\nGene expression data saved to {out_gene_data_file}")


Creating gene mapping...
Initial mapping shape: (316481, 2)

Mapping data preview:
{'ID': ['2315100', '2315106', '2315109', '2315111', '2315113'], 'Gene': ['NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 9 // 15q26.3 // 100288486 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// AK093685 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771', '---', '---', '---', '---']}

Converting probe-level measurements to gene expression data...


Gene expression data shape after mapping: (48895, 52)

First few rows of gene expression data after mapping:
       GSM8706502   GSM8706503   GSM8706504   GSM8706505   GSM8706506
Gene                                                                 
A-     705.687242   644.757871   530.848969   699.485877   494.625018
A-2    167.820123   141.614092    95.105952   161.367554    89.000033
A-52  3333.508407  3190.445107  3666.979423  3144.687791  3426.421530
A-E     28.445449    39.869927    30.423145    32.999802    35.006814
A-I    486.510110   392.250858   423.991434   436.110931   346.470518

Sample of gene symbols after mapping:
Index(['A-', 'A-2', 'A-52', 'A-E', 'A-I', 'A-II', 'A-IV', 'A-V', 'A0', 'A1',
       'A1-', 'A10', 'A11', 'A12', 'A13', 'A14', 'A16', 'A1BG', 'A1BG-AS',
       'A1CF'],
      dtype='object', name='Gene')



Gene expression data saved to ../../output/preprocess/Cardiovascular_Disease/gene_data/GSE285666.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols
print(f"Original gene data shape: {gene_data.shape}")

try:
    # Attempt to normalize gene symbols
    gene_data_normalized = normalize_gene_symbols_in_index(gene_data)
    print(f"Normalized gene data shape: {gene_data_normalized.shape}")
except Exception as e:
    print(f"Gene normalization failed: {e}")
    # If normalization fails, use the original gene data
    gene_data_normalized = gene_data.copy()
    print(f"Using original gene data with shape: {gene_data_normalized.shape}")

# Save the gene expression data 
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data_normalized.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")

# 2. Load the clinical data from Step 2
# Use the clinical_data from previous steps to create the clinical features
# Using the correct trait_row and conversion function from Step 2
def convert_trait(value: str) -> int:
    """Convert disease state to binary trait value for Cardiovascular_Disease.
    Williams syndrome is associated with cardiovascular abnormalities."""
    if value is None:
        return None
    
    # Extract value after colon if present
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # Convert to binary: Williams syndrome patients are the cases (1), controls are (0)
    if 'williams syndrome patient' in value.lower():
        return 1
    elif 'unaffected' in value.lower() or 'control' in value.lower():
        return 0
    else:
        return None

# Reload clinical data to ensure we have the correct structure
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
_, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# Extract clinical features using only the available trait information at row 0
clinical_features = geo_select_clinical_features(
    clinical_df=clinical_data, 
    trait=trait, 
    trait_row=0,  # Using disease state as identified in Step 2
    convert_trait=convert_trait,
    age_row=None,  # No age information available
    convert_age=None,
    gender_row=None,  # No gender information available
    convert_gender=None
)

print(f"Clinical features shape: {clinical_features.shape}")
print("Clinical features preview:")
print(preview_df(clinical_features))

# Save the clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
clinical_features.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")

# 3. Link clinical and genetic data
linked_data = geo_link_clinical_genetic_data(clinical_features, gene_data_normalized)
print(f"Linked data shape: {linked_data.shape}")
print("Linked data preview (first 5 rows, 5 columns):")
print(linked_data.iloc[:5, :5])

# 4. Handle missing values
linked_data_clean = handle_missing_values(linked_data, trait)
print(f"Linked data shape after handling missing values: {linked_data_clean.shape}")

# 5. Check for bias in the dataset
is_biased, linked_data_clean = judge_and_remove_biased_features(linked_data_clean, trait)

# 6. Conduct final quality validation
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True,
    is_biased=is_biased,
    df=linked_data_clean,
    note="Dataset contains gene expression data from Williams syndrome patients and controls. Williams syndrome is associated with cardiovascular abnormalities."
)

# 7. Save the linked data if it's usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data_clean.to_csv(out_data_file, index=True)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset deemed not usable for associative studies. Linked data not saved.")

Original gene data shape: (48895, 52)
Normalized gene data shape: (18418, 52)


Gene expression data saved to ../../output/preprocess/Cardiovascular_Disease/gene_data/GSE285666.csv
Clinical features shape: (1, 52)
Clinical features preview:
{'GSM8706502': [0.0], 'GSM8706503': [0.0], 'GSM8706504': [0.0], 'GSM8706505': [0.0], 'GSM8706506': [0.0], 'GSM8706507': [0.0], 'GSM8706508': [0.0], 'GSM8706509': [0.0], 'GSM8706510': [0.0], 'GSM8706511': [0.0], 'GSM8706512': [0.0], 'GSM8706513': [0.0], 'GSM8706514': [0.0], 'GSM8706515': [0.0], 'GSM8706516': [0.0], 'GSM8706517': [0.0], 'GSM8706518': [0.0], 'GSM8706519': [0.0], 'GSM8706520': [1.0], 'GSM8706521': [1.0], 'GSM8706522': [1.0], 'GSM8706523': [1.0], 'GSM8706524': [1.0], 'GSM8706525': [1.0], 'GSM8706526': [1.0], 'GSM8706527': [1.0], 'GSM8706528': [1.0], 'GSM8706529': [1.0], 'GSM8706530': [1.0], 'GSM8706531': [1.0], 'GSM8706532': [1.0], 'GSM8706533': [1.0], 'GSM8706534': [1.0], 'GSM8706535': [1.0], 'GSM8706536': [1.0], 'GSM8706537': [1.0], 'GSM8706538': [1.0], 'GSM8706539': [1.0], 'GSM8706540': [1.0], 'GSM8706541': [1.0]

Linked data shape after handling missing values: (52, 18419)
For the feature 'Cardiovascular_Disease', the least common label is '0.0' with 18 occurrences. This represents 34.62% of the dataset.
The distribution of the feature 'Cardiovascular_Disease' in this dataset is fine.



Linked data saved to ../../output/preprocess/Cardiovascular_Disease/GSE285666.csv
