In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Generalized_Anxiety_Disorder"
cohort = "GSE61672"

# Input paths
in_trait_dir = "../../input/GEO/Generalized_Anxiety_Disorder"
in_cohort_dir = "../../input/GEO/Generalized_Anxiety_Disorder/GSE61672"

# Output paths
out_data_file = "../../output/preprocess/Generalized_Anxiety_Disorder/GSE61672.csv"
out_gene_data_file = "../../output/preprocess/Generalized_Anxiety_Disorder/gene_data/GSE61672.csv"
out_clinical_data_file = "../../output/preprocess/Generalized_Anxiety_Disorder/clinical_data/GSE61672.csv"
json_path = "../../output/preprocess/Generalized_Anxiety_Disorder/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Blood gene expression profiles associated with symptoms of generalized anxiety disorder"
!Series_summary	"Prospective epidemiological studies found that generalized anxiety disorder (GAD) can impair immune function and increase risk for cardiovascular disease or events. Mechanisms underlying the physiological reververations of anxiety, however, are still elusive. Hence, we aimed to investigate molecular processes mediating effects of anxiety on physical health using blood gene expression profiles of 546 community participants. Of these, 179 met the status of controls and 157 cases of anxiety."
!Series_overall_design	"We examined genome-wide differential gene expression in anxiety, as well as associations between nine major modules of co-regulated transcripts in blood gene expression and anxiety. There were a total of 546 subjects."
Sample Characteristics Dictionary:
{0: ['age: 44', 'age: 59', 'age: 39', 'age: 64', 'age: 58', 'age: 45', 'age: 37', 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# Based on the background information, this is a study on blood gene expression 
# profiles related to generalized anxiety disorder. Since it mentions "genome-wide 
# differential gene expression" and "blood gene expression profiles", this dataset 
# likely contains gene expression data.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# Examining the sample characteristics dictionary:

# For trait (Generalized Anxiety Disorder):
# Row 4 has 'anxiety case/control: case', 'anxiety case/control: control'
# Row 5 also has 'anxiety case/control: control', 'anxiety case/control: case'
# This is our trait data
trait_row = 4  # or 5, but let's use 4 for consistency

# For age:
# Row 0 contains age information (e.g., 'age: 44', 'age: 59', etc.)
age_row = 0

# For gender:
# Row 1 contains sex information ('Sex: F', 'Sex: M')
gender_row = 1

# 2.2 Data Type Conversion
def convert_trait(value):
    """Convert anxiety case/control information to binary (0: control, 1: case)"""
    if value is None or pd.isna(value):
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    if value.lower() == 'case':
        return 1
    elif value.lower() == 'control':
        return 0
    return None

def convert_age(value):
    """Convert age information to numeric (continuous) values"""
    if value is None or pd.isna(value):
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    try:
        return float(value)
    except ValueError:
        return None

def convert_gender(value):
    """Convert gender information to binary (0: female, 1: male)"""
    if value is None or pd.isna(value):
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    if value.upper() == 'F':
        return 0
    elif value.upper() == 'M':
        return 1
    return None

# 3. Save Metadata
# Determine trait data availability
is_trait_available = trait_row is not None

# Conduct initial filtering
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# If trait_row is not None, extract clinical features
if trait_row is not None:
    try:
        # First check if the clinical_data.csv file exists
        if os.path.exists(f"{in_cohort_dir}/clinical_data.csv"):
            clinical_data = pd.read_csv(f"{in_cohort_dir}/clinical_data.csv")
            
            # Extract clinical features
            selected_clinical_df = geo_select_clinical_features(
                clinical_df=clinical_data,
                trait=trait,
                trait_row=trait_row,
                convert_trait=convert_trait,
                age_row=age_row,
                convert_age=convert_age,
                gender_row=gender_row,
                convert_gender=convert_gender
            )
            
            # Preview the extracted data
            preview = preview_df(selected_clinical_df)
            print("Clinical Data Preview:")
            print(preview)
            
            # Save the extracted clinical data
            os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
            selected_clinical_df.to_csv(out_clinical_data_file, index=False)
            print(f"Clinical data saved to: {out_clinical_data_file}")
        else:
            print(f"Clinical data file not found at: {in_cohort_dir}/clinical_data.csv")
            print("Cannot proceed with clinical feature extraction.")
    except Exception as e:
        print(f"Error during clinical data processing: {e}")
        print("Cannot proceed with clinical feature extraction.")


Clinical data file not found at: ../../input/GEO/Generalized_Anxiety_Disorder/GSE61672/clinical_data.csv
Cannot proceed with clinical feature extraction.


### Step 3: Gene Data Extraction

In [4]:
# 1. Re-identify the SOFT and matrix files to ensure we have the correct paths
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Extract the gene expression data from the matrix file
gene_data = get_genetic_data(matrix_file)

# 3. Print the first 20 row IDs (gene or probe identifiers)
print("\nFirst 20 gene/probe identifiers:")
print(gene_data.index[:20])

# 4. Print the dimensions of the gene expression data
print(f"\nGene data dimensions: {gene_data.shape[0]} genes × {gene_data.shape[1]} samples")

# Note: we keep is_gene_available as True since we successfully extracted gene expression data
is_gene_available = True



First 20 gene/probe identifiers:
Index(['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651228', 'ILMN_1651229',
       'ILMN_1651254', 'ILMN_1651262', 'ILMN_1651278', 'ILMN_1651296',
       'ILMN_1651315', 'ILMN_1651316', 'ILMN_1651336', 'ILMN_1651346',
       'ILMN_1651347', 'ILMN_1651354', 'ILMN_1651358', 'ILMN_1651364',
       'ILMN_1651378', 'ILMN_1651385', 'ILMN_1651403', 'ILMN_1651405'],
      dtype='object', name='ID')

Gene data dimensions: 12603 genes × 546 samples


### Step 4: Gene Identifier Review

In [5]:
# Looking at the gene identifiers, I notice these are ILMN_ prefixed identifiers
# These are Illumina BeadArray probe identifiers, not standard human gene symbols
# Illumina probe IDs need to be mapped to standard gene symbols for interpretation

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. First get the file paths using geo_get_relevant_filepaths function
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 3. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050', 'ILMN_1343052', 'ILMN_1343059'], 'Species': [nan, nan, nan, nan, nan], 'Source': [nan, nan, nan, nan, nan], 'Search_Key': [nan, nan, nan, nan, nan], 'Transcript': [nan, nan, nan, nan, nan], 'ILMN_Gene': [nan, nan, nan, nan, nan], 'Source_Reference_ID': [nan, nan, nan, nan, nan], 'RefSeq_ID': [nan, nan, nan, nan, nan], 'Unigene_ID': [nan, nan, nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan, nan, nan], 'GI': [nan, nan, nan, nan, nan], 'Accession': [nan, nan, nan, nan, nan], 'Symbol': ['phage_lambda_genome', 'phage_lambda_genome', 'phage_lambda_genome:low', 'phage_lambda_genome:low', 'thrB'], 'Protein_Product': [nan, nan, nan, nan, 'thrB'], 'Probe_Id': [nan, nan, nan, nan, nan], 'Array_Address_Id': [5090180.0, 6510136.0, 7560739.0, 1450438.0, 1240647.0], 'Probe_Type': [nan, nan, nan, nan, nan], 'Probe_Start': [nan, nan, nan, nan, nan], 'SEQUENCE': ['GAATAAAGAACAATCTGCTGATGATCCCTCCGTGGATCTGATTCGTGTAA', 'CCAT

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Observe gene annotation data to determine which columns store identifiers and gene symbols
# From the previous output, 'ID' contains the probe identifiers (ILMN_*) matching gene_data
# 'Symbol' contains the gene symbols we need to map to

# 2. Get gene mapping dataframe using the library function
gene_mapping = get_gene_mapping(gene_annotation, 'ID', 'Symbol')

print("Gene mapping preview (first 5 rows):")
print(gene_mapping.head())
print(f"Total number of mappings: {len(gene_mapping)}")

# 3. Apply gene mapping to convert probe-level data to gene-level expression data
gene_data = apply_gene_mapping(gene_data, gene_mapping)

# Print out information about the resulting gene expression data
print(f"\nAfter mapping: Gene expression data dimensions: {gene_data.shape[0]} genes × {gene_data.shape[1]} samples")
print("\nFirst 10 gene symbols after mapping:")
print(gene_data.index[:10])


Gene mapping preview (first 5 rows):
             ID                     Gene
0  ILMN_1343048      phage_lambda_genome
1  ILMN_1343049      phage_lambda_genome
2  ILMN_1343050  phage_lambda_genome:low
3  ILMN_1343052  phage_lambda_genome:low
4  ILMN_1343059                     thrB
Total number of mappings: 44837



After mapping: Gene expression data dimensions: 9250 genes × 546 samples

First 10 gene symbols after mapping:
Index(['AACS', 'AADACL1', 'AAK1', 'AAMP', 'AARS', 'AARS2', 'AARSD1', 'AASDH',
       'AASDHPPT', 'AATF'],
      dtype='object', name='Gene')


### Step 7: Data Normalization and Linking

In [8]:
# 1. Re-extract the gene expression data to ensure we have it properly defined
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_data = get_genetic_data(matrix_file)

# Extract gene annotation data for mapping
gene_annotation = get_gene_annotation(soft_file)

# Based on previous output in Step 5 and 6, we know the column is 'Symbol'
gene_mapping = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='Symbol')

# Apply gene mapping to convert probe-level data to gene expression data
gene_data = apply_gene_mapping(gene_data, gene_mapping)

# 1. Normalize gene symbols in the gene expression data
print("Normalizing gene symbols...")
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {normalized_gene_data.shape}")
print(f"First 5 normalized gene symbols: {normalized_gene_data.index[:5].tolist() if len(normalized_gene_data) > 0 else 'No genes after normalization'}")

# Save the normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# 2. Re-extract clinical data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# From Step 2, we identified anxiety case/control information at row 4
trait_row = 4
age_row = 0  # Age information is in row 0
gender_row = 1  # Gender information is in row 1

def convert_trait(value):
    """Convert anxiety case/control information to binary (0: control, 1: case)"""
    if value is None or pd.isna(value):
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    if value.lower() == 'case':
        return 1
    elif value.lower() == 'control':
        return 0
    return None

def convert_age(value):
    """Convert age information to numeric (continuous) values"""
    if value is None or pd.isna(value):
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    try:
        return float(value)
    except ValueError:
        return None

def convert_gender(value):
    """Convert gender information to binary (0: female, 1: male)"""
    if value is None or pd.isna(value):
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    if value.upper() == 'F':
        return 0
    elif value.upper() == 'M':
        return 1
    return None

# Extract clinical features with correct row indices and conversion functions
selected_clinical_df = geo_select_clinical_features(
    clinical_df=clinical_data,
    trait=trait,
    trait_row=trait_row,
    convert_trait=convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender
)

# Debug: Show preview of clinical data
print("Preview of clinical data:")
print(preview_df(selected_clinical_df))

# Save clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
selected_clinical_df.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")

# 3. Link clinical and genetic data
linked_data = geo_link_clinical_genetic_data(selected_clinical_df, normalized_gene_data)
print(f"Linked data shape: {linked_data.shape}")

# 4. Handle missing values
linked_data = handle_missing_values(linked_data, trait_col=trait)
print(f"Data shape after handling missing values: {linked_data.shape}")

# 5. Determine if trait is biased
print("\nChecking for bias in the trait variable:")
# The trait in this dataset is binary (case vs control)
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

# 6. Conduct final quality validation
is_trait_available = True  # We confirmed trait data is available in Step 2
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=is_trait_available,
    is_biased=is_biased,
    df=linked_data,
    note="Dataset studies blood gene expression profiles associated with symptoms of generalized anxiety disorder."
)

# 7. Save linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset deemed not usable for trait association studies, linked data not saved.")

Normalizing gene symbols...
Gene data shape after normalization: (9070, 546)
First 5 normalized gene symbols: ['AACS', 'AAK1', 'AAMDC', 'AAMP', 'AAR2']


Normalized gene data saved to ../../output/preprocess/Generalized_Anxiety_Disorder/gene_data/GSE61672.csv
Preview of clinical data:


{'GSM1510561': [nan, 44.0, 0.0], 'GSM1510562': [nan, 59.0, 0.0], 'GSM1510563': [nan, 44.0, 0.0], 'GSM1510564': [nan, 39.0, 0.0], 'GSM1510565': [nan, 64.0, 0.0], 'GSM1510566': [nan, 58.0, 1.0], 'GSM1510567': [nan, 45.0, 1.0], 'GSM1510568': [nan, 37.0, 1.0], 'GSM1510569': [nan, 40.0, 1.0], 'GSM1510570': [nan, 39.0, 0.0], 'GSM1510571': [nan, 57.0, 1.0], 'GSM1510572': [nan, 52.0, 0.0], 'GSM1510573': [nan, 59.0, 0.0], 'GSM1510574': [nan, 57.0, 1.0], 'GSM1510575': [nan, 62.0, 0.0], 'GSM1510576': [nan, 62.0, 1.0], 'GSM1510577': [nan, 55.0, 0.0], 'GSM1510578': [nan, 55.0, 0.0], 'GSM1510579': [nan, 53.0, 1.0], 'GSM1510580': [nan, 47.0, 1.0], 'GSM1510581': [nan, 48.0, 0.0], 'GSM1510582': [nan, 49.0, 0.0], 'GSM1510583': [nan, 35.0, 1.0], 'GSM1510584': [nan, 58.0, 1.0], 'GSM1510585': [nan, 46.0, 0.0], 'GSM1510586': [nan, 54.0, 0.0], 'GSM1510587': [nan, 67.0, 1.0], 'GSM1510588': [nan, 47.0, 0.0], 'GSM1510589': [nan, 51.0, 0.0], 'GSM1510590': [nan, 34.0, 1.0], 'GSM1510591': [nan, 58.0, 1.0], 'GSM151

Data shape after handling missing values: (141, 9072)

Checking for bias in the trait variable:
For the feature 'Generalized_Anxiety_Disorder', the least common label is '1.0' with 62 occurrences. This represents 43.97% of the dataset.
The distribution of the feature 'Generalized_Anxiety_Disorder' in this dataset is fine.

Quartiles for 'Age':
  25%: 41.0
  50% (Median): 49.0
  75%: 56.0
Min: 18.0
Max: 82.0
The distribution of the feature 'Age' in this dataset is fine.

A new JSON file was created at: ../../output/preprocess/Generalized_Anxiety_Disorder/cohort_info.json


Linked data saved to ../../output/preprocess/Generalized_Anxiety_Disorder/GSE61672.csv
