In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Prostate_Cancer"
cohort = "GSE248619"

# Input paths
in_trait_dir = "../../input/GEO/Prostate_Cancer"
in_cohort_dir = "../../input/GEO/Prostate_Cancer/GSE248619"

# Output paths
out_data_file = "../../output/preprocess/Prostate_Cancer/GSE248619.csv"
out_gene_data_file = "../../output/preprocess/Prostate_Cancer/gene_data/GSE248619.csv"
out_clinical_data_file = "../../output/preprocess/Prostate_Cancer/clinical_data/GSE248619.csv"
json_path = "../../output/preprocess/Prostate_Cancer/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Prognostic implication of blood immune cell composition in Metastatic Castration-Resistant Prostate Cancer (mCRPC)"
!Series_summary	"The prognosis of patients with metastatic castration-resistant prostate cancer (mCRPC) is variable. Several blood-related prognostic factors have been reported, including transcriptional profiling of whole blood and neutrophil to lymphocyte ratio. We aimed to address the contribution of distinct whole blood immune cell components to the prognosis of these patients.  Subjects/Patients and Methods:  Pre-treatment whole blood samples from chemotherapy-naïve mCRPC patients were prospectively collected before treatment with enzalutamide. The study consisted of a training cohort including 98 patients treated with enzalutamide in a phase 2 biomarker clinical trial (NCT02288936)"
!Series_overall_design	"The study involved serial whole-blood sample collection from participants using PAX-Gene RNA, with samples stored at -80ºC.

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import re
import json
from typing import Optional, Callable, Dict, Any

# 1. Gene Expression Data Availability
# Based on the background information, this dataset contains gene expression data from Whole Blood samples
# analyzed with GeneChip Human Transcriptome Array HTA 2.0, which is suitable for gene expression analysis.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# From the Sample Characteristics Dictionary, we can observe:
# - In row 0, we have 'Stage: Pre-treatment' and 'Stage: Control' which indicates disease state
# - In row 1, we have 'tissue: Whole blood' which is a constant feature
# Based on the background information, this is a study on mCRPC (metastatic Castration-Resistant Prostate Cancer)
# so we can infer that 'Stage: Pre-treatment' indicates patients with CRPC and 'Stage: Control' indicates non-CRPC

trait_row = 0  # The trait (Prostate_Cancer) can be inferred from row 0
age_row = None  # Age information is not provided
gender_row = None  # Gender information is not provided (likely all male since it's prostate cancer)

# 2.2 Data Type Conversion
def convert_trait(value):
    """Convert trait values to binary format."""
    if value is None:
        return None
    
    # Extract the actual value part after colon
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # Based on the background, this is a study on mCRPC patients vs controls
    if 'Pre-treatment' in value:  # mCRPC patient
        return 1
    elif 'Control' in value:  # Control
        return 0
    else:
        return None

def convert_age(value):
    """Convert age values to continuous format."""
    # Not used as age data is not available
    return None

def convert_gender(value):
    """Convert gender values to binary format."""
    # Not used as gender data is not available
    return None

# 3. Save Metadata - Initial Validation
# Determine trait data availability
is_trait_available = trait_row is not None

# Validate and save initial cohort info
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Since trait_row is not None, we need to extract clinical features
if trait_row is not None:
    try:
        # Create a properly structured DataFrame from the sample characteristics
        sample_chars = {0: ['Stage: Pre-treatment', 'Stage: Control'], 1: ['tissue: Whole blood']}
        
        # We need to create a DataFrame where columns are the indices (0, 1) and 
        # each row represents a sample with their characteristics
        # First, determine how many samples we have based on the length of the first list
        num_samples = max(len(values) for values in sample_chars.values())
        
        # Create the data for the DataFrame
        data = []
        for i in range(num_samples):
            row = {}
            for col, values in sample_chars.items():
                row[col] = values[i] if i < len(values) else None
            data.append(row)
        
        clinical_data = pd.DataFrame(data)
        
        # Use the function to extract clinical features
        selected_clinical_df = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            age_row=age_row,
            convert_age=convert_age,
            gender_row=gender_row,
            convert_gender=convert_gender
        )
        
        # Preview the output dataframe
        preview = preview_df(selected_clinical_df)
        print("Selected Clinical Features Preview:")
        print(preview)
        
        # Save the clinical data to a CSV file
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        selected_clinical_df.to_csv(out_clinical_data_file, index=False)
        print(f"Clinical data saved to {out_clinical_data_file}")
    except Exception as e:
        print(f"Error in clinical feature extraction: {e}")
        # Even with an error, we should still create the output directory
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)


Selected Clinical Features Preview:
{0: [1.0], 1: [nan]}
Clinical data saved to ../../output/preprocess/Prostate_Cancer/clinical_data/GSE248619.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again 
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE248619/GSE248619_series_matrix.txt.gz


Gene data shape: (67528, 100)
First 20 gene/probe identifiers:
Index(['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1',
       'TC01000004.hg.1', 'TC01000005.hg.1', 'TC01000006.hg.1',
       'TC01000007.hg.1', 'TC01000008.hg.1', 'TC01000009.hg.1',
       'TC01000010.hg.1', 'TC01000011.hg.1', 'TC01000012.hg.1',
       'TC01000013.hg.1', 'TC01000014.hg.1', 'TC01000015.hg.1',
       'TC01000016.hg.1', 'TC01000017.hg.1', 'TC01000018.hg.1',
       'TC01000019.hg.1', 'TC01000020.hg.1'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Looking at the gene identifiers in the gene expression data
# The identifiers 'TC01000001.hg.1', 'TC01000002.hg.1', etc. appear to be probe IDs from a 
# microarray platform rather than standard human gene symbols.
# These are likely Affymetrix Clariom S array probe IDs which need to be mapped to gene symbols.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Let's look for platform information in the SOFT file to understand the annotation better
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
        if i > 100:  # Limit search to first 100 lines
            print("Platform ID not found in first 100 lines")
            break

# Check if the SOFT file includes any reference to gene symbols
print("\nSearching for gene symbol information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    gene_symbol_lines = []
    for i, line in enumerate(f):
        if 'GENE_SYMBOL' in line or 'gene_symbol' in line.lower() or 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
        if i > 1000 and len(gene_symbol_lines) > 0:  # Limit search but ensure we found something
            break
    
    if gene_symbol_lines:
        print("Found references to gene symbols:")
        for line in gene_symbol_lines[:5]:  # Show just first 5 matches
            print(line)
    else:
        print("No explicit gene symbol references found in first 1000 lines")

# Look for alternative annotation files or references in the directory
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
print([f for f in all_files if 'annotation' in f.lower() or 'platform' in f.lower() or 'gpl' in f.lower()])



Gene annotation preview:
Columns in gene annotation: ['ID', 'probeset_id', 'seqname', 'strand', 'start', 'stop', 'total_probes', 'gene_assignment', 'mrna_assignment', 'swissprot', 'unigene', 'category', 'locus type', 'notes', 'SPOT_ID']
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'probeset_id': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['11869', '29554', '69091', '160446', '317811'], 'stop': ['14409', '31109', '70008', '161525', '328581'], 'total_probes': [49.0, 60.0, 30.0, 30.0, 191.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His)

No explicit gene symbol references found in first 1000 lines

Checking for additional annotation files in the directory:
[]


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the appropriate columns from gene_annotation
# Looking at the gene_annotation preview, we can see:
# - The 'ID' column contains probe identifiers like 'TC01000001.hg.1' which match the gene expression data
# - The 'gene_assignment' column contains gene symbols and information

# For gene symbols mapping, we'll use the 'gene_assignment' column
# The format appears to be: "REF_ID // GENE_SYMBOL // description // location // gene_id"

# 2. Get a gene mapping dataframe by extracting gene identifiers and symbols
mapping_df = get_gene_mapping(gene_annotation, 'ID', 'gene_assignment')

# Let's examine a few rows of the mapping dataframe
print("\nSample of gene mapping data:")
print(preview_df(mapping_df, n=3))

# 3. Apply gene mapping to convert probe-level measurements to gene expression data
gene_data = apply_gene_mapping(gene_data, mapping_df)

# Print the shape of the resulting gene expression dataframe
print(f"\nGene expression data shape after mapping: {gene_data.shape}")
print(f"First 5 gene symbols: {gene_data.index[:5].tolist()}")

# Create the output directory if it doesn't exist
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)

# Save the gene expression data to a CSV file
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")



Sample of gene mapping data:
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1'], 'Gene': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102', 'ENST00000408384 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000408384 // MIR1302-10 // microRNA 1302-10 // --- // 100422834 /// ENST00000408384 // MIR1302-9 // microRNA 1302-9 // --- // 100422831 /// ENST00000408384 // MIR1302-2 // microRNA 1302-2 // --- // 100302278 /// ENST00000469289 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000469289 // MIR1302-10 // microRNA 1302-10 // --- // 100422834 /// ENST00000469289 // MIR1302-9 // microRNA 1302-9 // --- // 100422831 /// ENST00000469289 // MIR1302-2 // microRNA 1302-2 // --- // 10


Gene expression data shape after mapping: (71527, 100)
First 5 gene symbols: ['A-', 'A-2', 'A-52', 'A-575C2', 'A-E']


Gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE248619.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data using the normalize_gene_symbols_in_index function
print(f"Gene data shape before normalization: {gene_data.shape}")
gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {gene_data.shape}")

# Save the normalized gene expression data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Try to load the previously saved clinical data and link it with the gene data
try:
    clinical_data = pd.read_csv(out_clinical_data_file)
    print(f"Loaded clinical data with shape: {clinical_data.shape}")
    
    # Check if the clinical data contains trait information
    trait_available = trait in clinical_data.columns or 'Prostate_Cancer' in clinical_data.columns
    
    if trait_available:
        print(f"Trait information available in clinical data")
        # Convert the clinical data to the expected format for linking
        clinical_df = clinical_data.set_index(clinical_data.columns[0]) if clinical_data.shape[1] > 1 else clinical_data
        
        # Link clinical and genetic data
        linked_data = geo_link_clinical_genetic_data(clinical_df.T, gene_data)
        print(f"Linked data shape: {linked_data.shape}")
        
        # 3. Handle missing values
        linked_data_clean = handle_missing_values(linked_data, trait)
        print(f"Cleaned linked data shape: {linked_data_clean.shape}")
        
        # 4. Determine if the trait and demographic features are biased
        is_biased, linked_data_clean = judge_and_remove_biased_features(linked_data_clean, trait)
    else:
        print("No trait information found in clinical data")
        linked_data = gene_data.T
        linked_data_clean = linked_data
        is_biased = True
except Exception as e:
    print(f"Error loading or processing clinical data: {e}")
    trait_available = False
    linked_data = gene_data.T
    linked_data_clean = linked_data
    is_biased = True

# 5. Conduct final quality validation
note = "Dataset contains gene expression data from whole blood of mCRPC patients before treatment with enzalutamide. The dataset was part of a phase 2 clinical trial (NCT02288936)."

is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=trait_available,
    is_biased=is_biased,
    df=linked_data_clean,
    note=note
)

# 6. Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data_clean.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset deemed not usable due to quality issues - linked data not saved")

Gene data shape before normalization: (71527, 100)
Gene data shape after normalization: (24018, 100)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE248619.csv
Loaded clinical data with shape: (1, 2)
No trait information found in clinical data
Dataset deemed not usable due to quality issues - linked data not saved
