In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Prostate_Cancer"
cohort = "GSE178631"

# Input paths
in_trait_dir = "../../input/GEO/Prostate_Cancer"
in_cohort_dir = "../../input/GEO/Prostate_Cancer/GSE178631"

# Output paths
out_data_file = "../../output/preprocess/Prostate_Cancer/GSE178631.csv"
out_gene_data_file = "../../output/preprocess/Prostate_Cancer/gene_data/GSE178631.csv"
out_clinical_data_file = "../../output/preprocess/Prostate_Cancer/clinical_data/GSE178631.csv"
json_path = "../../output/preprocess/Prostate_Cancer/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"A prognostic hypoxia gene signature with low heterogeneity within the dominant tumour lesion in prostate cancer patients."
!Series_summary	"Background: Hypoxia gene signatures measured in a biopsy are promising biomarkers in prostate cancer. We determined the ability of a previously developed signature to correctly classify tumours as more or less hypoxic and investigated how intratumour heterogeneity affected its biomarker performance."
!Series_summary	"Methods: The 32-gene signature was determined from gene expression data of 141 biopsies from the dominant (index) lesion of 94 patients treated with prostatectomy. Hypoxic fraction was measured by pimonidazole immunostaining of whole-mount and biopsy sections and used as reference standard for hypoxia."
!Series_summary	"Results: The signature was correlated with hypoxic fraction in whole-mount sections, and the parameters showed almost the same association with tumour aggressiveness. Gene- and pim

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import numpy as np
from typing import Optional, Dict, Any, Callable
import json

# 1. Gene Expression Data Availability
# From the background information, we can see this is gene expression data related to hypoxia signature
is_gene_available = True

# 2.1 Data Availability
# Based on the sample characteristics dictionary:
# - For trait: We can use the tumor grade or risk classification as a proxy for prostate cancer severity
# - For age: Not available in the provided sample characteristics
# - For gender: Not explicitly mentioned, but since this is prostate cancer, all patients are male

trait_row = 3  # tumor ISUP grade group reflects tumor aggressiveness
age_row = None  # Age data is not available
gender_row = None  # Gender data is not available (all male for prostate cancer)

# 2.2 Data Type Conversion Functions
def convert_trait(value_str):
    """
    Convert ISUP grade group to binary trait.
    ISUP grade groups 1-2 are considered less aggressive (0)
    ISUP grade groups 3-5 are considered more aggressive (1)
    """
    if pd.isna(value_str):
        return None
    
    # Extract the value after the colon
    if ":" in value_str:
        value = value_str.split(":", 1)[1].strip()
        
        # ISUP grade groups 1-2 are considered less aggressive
        if value in ["1", "2"]:
            return 0
        # ISUP grade groups 3-5 are considered more aggressive
        elif value in ["3", "4", "5"]:
            return 1
    
    return None

def convert_age(value_str):
    """
    Convert age data.
    """
    # No age data available
    return None

def convert_gender(value_str):
    """
    Convert gender data.
    """
    # This is a prostate cancer study, so all patients are male
    return 1

# 3. Save Metadata
# Determine trait data availability
is_trait_available = trait_row is not None

# Save the cohort information
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# If trait_row is not None, extract clinical features
if trait_row is not None:
    try:
        # First, try to find the clinical data file
        # Look for matrix file which should contain the clinical information
        matrix_files = [f for f in os.listdir(in_cohort_dir) if f.endswith('.txt') or f.endswith('.csv')]
        
        if matrix_files:
            # Use the first matrix file found
            matrix_file = os.path.join(in_cohort_dir, matrix_files[0])
            print(f"Using matrix file: {matrix_file}")
            
            # Read the matrix file - assuming it contains sample characteristics 
            clinical_data = pd.read_csv(matrix_file, sep='\t', comment='!', index_col=0)
            
            # Extract clinical features
            selected_clinical_df = geo_select_clinical_features(
                clinical_df=clinical_data,
                trait="Prostate_Cancer_Severity",
                trait_row=trait_row,
                convert_trait=convert_trait,
                age_row=age_row,
                convert_age=convert_age,
                gender_row=gender_row,
                convert_gender=convert_gender
            )
            
            # Preview the dataframe
            preview = preview_df(selected_clinical_df)
            print("Selected clinical features preview:", preview)
            
            # Save to CSV
            os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
            selected_clinical_df.to_csv(out_clinical_data_file, index=False)
            print(f"Clinical data saved to {out_clinical_data_file}")
        else:
            print("No suitable matrix files found for clinical data extraction.")
            print("Will proceed without clinical data processing.")
    except Exception as e:
        print(f"Error processing clinical data: {e}")
        print("Will proceed without clinical data processing.")


No suitable matrix files found for clinical data extraction.
Will proceed without clinical data processing.


### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again 
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE178631/GSE178631_series_matrix.txt.gz


Gene data shape: (47323, 141)
First 20 gene/probe identifiers:
Index(['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209',
       'ILMN_1651210', 'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229',
       'ILMN_1651230', 'ILMN_1651232', 'ILMN_1651235', 'ILMN_1651236',
       'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253',
       'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651262'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# These identifiers appear to be Illumina probe IDs (ILMN_*) rather than human gene symbols
# Illumina probe IDs need to be mapped to human gene symbols for proper analysis

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Let's look for platform information in the SOFT file to understand the annotation better
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
        if i > 100:  # Limit search to first 100 lines
            print("Platform ID not found in first 100 lines")
            break

# Check if the SOFT file includes any reference to gene symbols
print("\nSearching for gene symbol information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    gene_symbol_lines = []
    for i, line in enumerate(f):
        if 'GENE_SYMBOL' in line or 'gene_symbol' in line.lower() or 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
        if i > 1000 and len(gene_symbol_lines) > 0:  # Limit search but ensure we found something
            break
    
    if gene_symbol_lines:
        print("Found references to gene symbols:")
        for line in gene_symbol_lines[:5]:  # Show just first 5 matches
            print(line)
    else:
        print("No explicit gene symbol references found in first 1000 lines")

# Look for alternative annotation files or references in the directory
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
print([f for f in all_files if 'annotation' in f.lower() or 'platform' in f.lower() or 'gpl' in f.lower()])



Gene annotation preview:
Columns in gene annotation: ['ID', 'Species', 'Source', 'Search_Key', 'Transcript', 'ILMN_Gene', 'Source_Reference_ID', 'RefSeq_ID', 'Unigene_ID', 'Entrez_Gene_ID', 'GI', 'Accession', 'Symbol', 'Protein_Product', 'Probe_Id', 'Array_Address_Id', 'Probe_Type', 'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation', 'Probe_Coordinates', 'Cytoband', 'Definition', 'Ontology_Component', 'Ontology_Process', 'Ontology_Function', 'Synonyms', 'Obsolete_Probe_Id', 'GB_ACC']
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050', 'ILMN_1343052', 'ILMN_1343059'], 'Species': [nan, nan, nan, nan, nan], 'Source': [nan, nan, nan, nan, nan], 'Search_Key': [nan, nan, nan, nan, nan], 'Transcript': [nan, nan, nan, nan, nan], 'ILMN_Gene': [nan, nan, nan, nan, nan], 'Source_Reference_ID': [nan, nan, nan, nan, nan], 'RefSeq_ID': [nan, nan, nan, nan, nan], 'Unigene_ID': [nan, nan, nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan, nan, nan], 'GI': [nan, nan, nan, nan, nan], '

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Determine which keys in the gene annotation dataframe correspond to the gene identifiers and gene symbols
# From the gene annotation preview, we can see:
# - 'ID' column contains the Illumina probe IDs (e.g., ILMN_1343048) which match the format in gene_data
# - 'Symbol' column contains the gene symbols

# 2. Get a gene mapping dataframe
gene_mapping = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='Symbol')
print("\nGene mapping preview:")
print(preview_df(gene_mapping, n=5))
print(f"Gene mapping shape: {gene_mapping.shape}")

# 3. Convert probe-level measurements to gene expression data by applying the gene mapping
# Use the apply_gene_mapping function from the library
gene_data = apply_gene_mapping(gene_data, gene_mapping)
print(f"\nConverted gene expression data shape: {gene_data.shape}")
print("\nFirst 10 gene symbols in the converted gene expression data:")
print(gene_data.index[:10])

# Save the processed gene data to a file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"\nGene expression data saved to {out_gene_data_file}")



Gene mapping preview:
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050', 'ILMN_1343052', 'ILMN_1343059'], 'Gene': ['phage_lambda_genome', 'phage_lambda_genome', 'phage_lambda_genome:low', 'phage_lambda_genome:low', 'thrB']}
Gene mapping shape: (44837, 2)

Converted gene expression data shape: (21464, 141)

First 10 gene symbols in the converted gene expression data:
Index(['A1BG', 'A1CF', 'A26C3', 'A2BP1', 'A2LD1', 'A2M', 'A2ML1', 'A3GALT2',
       'A4GALT', 'A4GNT'],
      dtype='object', name='Gene')



Gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE178631.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the obtained gene expression data
# Let's load the gene data we saved in the previous step
gene_data = pd.read_csv(out_gene_data_file, index_col=0)
print(f"Loaded gene data shape: {gene_data.shape}")

# Normalize gene symbols
gene_data_normalized = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {gene_data_normalized.shape}")

# Save the normalized gene expression data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data_normalized.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Link the clinical and genetic data
# From Step 2, we determined that trait_row is None (no trait data available)
if trait_row is not None:
    # Extract the clinical features using the function from the library
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Save the clinical data
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")
    
    # Link the clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(selected_clinical_df, gene_data_normalized)
    print(f"Linked data shape: {linked_data.shape}")
    
    # 3. Handle missing values systematically
    linked_data = handle_missing_values(linked_data, trait)
    print(f"Linked data shape after handling missing values: {linked_data.shape}")
    
    # 4. Determine whether the trait and demographic features are biased
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
    
    # 5. Conduct final quality validation
    note = "Dataset contains gene expression data from prostate cancer tumor and normal prostate samples. Successfully mapped probe IDs to gene symbols."
    
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_biased,
        df=linked_data,
        note=note
    )
    
    # 6. Save the linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")
    else:
        print("Dataset deemed not usable - linked data not saved")
else:
    print("No trait data available. Can't proceed with linking clinical and genetic data.")
    
    # Create a minimal dataframe for validation purposes 
    # (since we need a valid DataFrame when is_final=True)
    empty_df = pd.DataFrame({"dummy": [0]})
    
    # Since trait data is not available, the dataset is not usable for our purposes
    # We pass is_biased=True to indicate unusable data
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=False,
        is_biased=True,  # Setting is_biased to True since missing trait data makes dataset unusable
        df=empty_df,
        note="No trait data available in this dataset. Contains gene expression from cell lines with different radiation treatments."
    )
    print("Dataset deemed not usable due to missing trait data.")

Loaded gene data shape: (21464, 141)


Gene data shape after normalization: (20259, 141)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE178631.csv
Clinical data saved to ../../output/preprocess/Prostate_Cancer/clinical_data/GSE178631.csv
Linked data shape: (141, 20260)


Linked data shape after handling missing values: (139, 20260)
For the feature 'Prostate_Cancer', the least common label is '0.0' with 56 occurrences. This represents 40.29% of the dataset.
The distribution of the feature 'Prostate_Cancer' in this dataset is fine.



Linked data saved to ../../output/preprocess/Prostate_Cancer/GSE178631.csv
