In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait and cohort names are the single source of truth
# from which every input/output path below is derived.
trait = "Prostate_Cancer"
cohort = "GSE125341"

# Input paths (trait-level directory and the cohort directory beneath it)
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths (built from trait/cohort so they cannot drift apart from the names above)
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Locate the cohort's SOFT file and series-matrix file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes selecting series-level background text and per-sample
# characteristic rows from the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the values found in each row of the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information so the next step can decide which
# rows (if any) carry trait / age / gender data.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Transcriptomic profile of YB-1 regulated downstream targets in LNCaP prostate cancer cells"
!Series_summary	"The transcription factor and RNA-interacting Y-box binding protein-1 (YB-1 protein, YBX1 gene) has gained interest as a prognostic biomarker and therapeutic target in various malignancies including prostate cancer. Using a custom prostate-cancer-focussed microarray platform, we have established a transcriptome-wide profile of YB-1 target transcripts in the androgen sensitive prostate cancer cell line LNCaP, including RNAs regulated by YB-1 at the transcriptional and post-transcriptional level; under standard culture conditions (FBS), in androgen deprived culture conditions (CSS) and following stimulation with dihydrotestosterone (DHT)."
!Series_summary	""
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"The project consists of 3 data sets / SubSeries that were processed and analysed separ

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import numpy as np
from typing import Callable, Optional, Dict, Any, Union

# 1. Gene expression availability
# The series title and summary describe a transcriptome-wide microarray profile of
# LNCaP prostate cancer cells, i.e. gene expression rather than miRNA-only or
# methylation-only data.
is_gene_available = True

# 2. Clinical variable availability
# This is a cell line study (every sample derives from the male LNCaP line), so:
#   - there is no per-sample prostate cancer status to use as the trait,
#   - no age field was identified,
#   - no gender field was identified.
# 2.1 None of the three variables has a usable sample-characteristics row.
trait_row = age_row = gender_row = None

# 2.2 Data Type Conversion Functions
def convert_trait(value: str) -> Optional[int]:
    """Placeholder trait converter for a cohort without trait data.

    Args:
        value: Raw sample-characteristics entry, e.g. ``"field: content"``.

    Returns:
        Always ``None``; no binary encoding is defined because ``trait_row`` is None
        and this function is therefore never applied.
    """
    if not pd.isna(value) and ':' in value:
        # Keep only the text after the first colon; the result is intentionally unused.
        value = value.split(':', 1)[1].strip()
    return None

def convert_age(value: str) -> Optional[float]:
    """Placeholder age converter for a cohort without age data.

    Args:
        value: Raw sample-characteristics entry, e.g. ``"field: content"``.

    Returns:
        Always ``None``; no numeric parsing is defined because ``age_row`` is None
        and this function is therefore never applied.
    """
    if pd.isna(value):
        return None

    # Drop the "field:" prefix when one is present; the stripped text is not used further.
    value = value.split(':', 1)[1].strip() if ':' in value else value

    return None

def convert_gender(value: str) -> Optional[int]:
    """Placeholder gender converter (intended encoding: 0=female, 1=male).

    Args:
        value: Raw sample-characteristics entry, e.g. ``"field: content"``.

    Returns:
        Always ``None``; no encoding is applied because ``gender_row`` is None
        and this function is therefore never used.
    """
    is_missing = pd.isna(value)
    if is_missing:
        return None

    has_prefix = ':' in value
    if has_prefix:
        # Separate the field label from its content and tidy the content.
        _, value = value.split(':', 1)
        value = value.strip()

    return None

# 3. Save Metadata
# Trait data counts as available only when a trait row was identified above.
is_trait_available = not (trait_row is None)

# Record the preliminary (non-final) availability flags for this cohort.
# This call is the cell's last expression, so its return value is displayed.
validate_and_save_cohort_info(
    is_final=False, cohort=cohort, info_path=json_path,
    is_gene_available=is_gene_available, is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction
# Nothing to extract here: trait_row is None, so this substep is skipped.


A new JSON file was created at: ../../output/preprocess/Prostate_Cancer/cohort_info.json


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Resolve the SOFT and matrix file paths again so this cell can run on its own.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Load the expression matrix with the library helper.
# Later cells use `gene_data` unconditionally, so a failure here is reported and then
# re-raised rather than swallowed; otherwise the notebook would continue with a missing
# (or stale, from a previous run) `gene_data`.
try:
    gene_data = get_genetic_data(matrix_file)
except Exception as e:
    print(f"Error extracting gene data: {e}")
    raise

print(f"Gene data shape: {gene_data.shape}")

# 3. Print the first 20 row IDs (gene or probe identifiers) for the identifier review step.
print("First 20 gene/probe identifiers:")
print(gene_data.index[:20])


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE125341/GSE125341_series_matrix.txt.gz


Gene data shape: (175281, 37)
First 20 gene/probe identifiers:
Index(['A_14_P100100', 'A_14_P100208', 'A_14_P100325', 'A_14_P100335',
       'A_14_P100337', 'A_14_P100385', 'A_14_P100389', 'A_14_P100419',
       'A_14_P100529', 'A_14_P100533', 'A_14_P100535', 'A_14_P100560',
       'A_14_P100627', 'A_14_P100804', 'A_14_P100819', 'A_14_P100933',
       'A_14_P100966', 'A_14_P100985', 'A_14_P101047', 'A_14_P101187'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g., A_14_P100100) are microarray probe identifiers,
# not human gene symbols. NOTE(review): the "A_14_P" prefix looks like an Agilent-style
# probe ID; that is inferred from the ID format only — confirm against the platform record.
# Because the index holds probe IDs, a probe-to-symbol mapping step is required.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# `gzip` is used below; import it explicitly instead of relying on the star import to expose it.
import gzip

# 1. Extract the probe annotation table from the SOFT file with the library helper.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation columns to decide which hold probe IDs and which hold gene symbols.
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Look for the platform accession near the top of the SOFT file.
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
        if i > 100:  # Give up once the leading lines of the file have been scanned
            print("Platform ID not found in first 100 lines")
            break
    else:
        # The file ended before the limit was reached and nothing matched.
        print("Platform ID not found in first 100 lines")

# Check whether the SOFT file mentions gene symbols anywhere.
print("\nSearching for gene symbol information in SOFT file:")
gene_symbol_lines = []
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        # A case-insensitive 'symbol' match also covers 'GENE_SYMBOL' and 'gene_symbol'.
        if 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
        # Stop after roughly 1000 lines, but only once at least one match exists;
        # with no match yet, the scan continues to the end of the file.
        if i > 1000 and gene_symbol_lines:
            break

if gene_symbol_lines:
    print("Found references to gene symbols:")
    for line in gene_symbol_lines[:5]:  # Show just the first 5 matches
        print(line)
else:
    # Reaching this branch means the whole file was scanned without a match,
    # so the message must not claim the search stopped at 1000 lines.
    print("No explicit gene symbol references found in the SOFT file")

# Look for alternative annotation files or references in the cohort directory.
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
print([name for name in all_files if 'annotation' in name.lower() or 'platform' in name.lower() or 'gpl' in name.lower()])



Gene annotation preview:
Columns in gene annotation: ['ID', 'chr', 'start', 'end', 'strand', 'symbol', 'geneDescription', 'EnsemblGeneID', 'geneBiotype', 'FeatureNum', 'SPOT_ID']
{'ID': ['A_14_P100100', 'A_14_P100208', 'A_14_P100325', 'A_14_P100335', 'A_14_P100337'], 'chr': ['chr14', 'chr11', 'chrX', 'chr1', 'chr12'], 'start': [23543729.0, 75801449.0, 103610199.0, 220662085.0, 6820541.0], 'end': [23543789.0, 75801509.0, 103610259.0, 220662145.0, 6820593.0], 'strand': ['+', '+', '+', '+', '+'], 'symbol': ['THTPA', 'DGAT2', 'TCEAL3', 'MARK1', 'CD4'], 'geneDescription': ['thiamine triphosphatase [Source:EntrezGene;Acc:79178]', 'diacylglycerol O-acyltransferase 2 [Source:HGNC Symbol;Acc:HGNC:16940]', 'transcription elongation factor A (SII)-like 3 [Source:HGNC Symbol;Acc:HGNC:28247]', 'MAP/microtubule affinity-regulating kinase 1 [Source:HGNC Symbol;Acc:HGNC:6896]', 'CD4 molecule [Source:HGNC Symbol;Acc:HGNC:1678]'], 'EnsemblGeneID': ['ENSG00000157306.12', 'ENSG00000062282.12', 'ENSG00000

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Column choice, based on the annotation preview from the previous step:
#    - 'ID' holds probe identifiers (e.g. A_14_P100100) that match the expression index
#    - 'symbol' holds the human gene symbols (e.g. THTPA, DGAT2)

# 2. Build the probe -> gene symbol mapping table from the annotation dataframe.
gene_mapping = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='symbol')
print(f"Generated gene mapping with shape: {gene_mapping.shape}")
print("Mapping preview:", gene_mapping.head(), sep="\n")

# 3. Collapse probe-level measurements into gene-level expression values.
# NOTE(review): `gene_data` is rebound to the mapped result, so re-running this cell
# would feed already-mapped data back into the mapping — run it once per fresh kernel.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
print(f"Converted gene expression data shape: {gene_data.shape}")
print("First few gene symbols:", gene_data.index[:10], sep="\n")

# Persist the gene-level data, creating the output directory if needed.
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Generated gene mapping with shape: (157178, 2)
Mapping preview:
             ID    Gene
0  A_14_P100100   THTPA
1  A_14_P100208   DGAT2
2  A_14_P100325  TCEAL3
3  A_14_P100335   MARK1
4  A_14_P100337     CD4


Converted gene expression data shape: (23209, 37)
First few gene symbols:
Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A3GALT2',
       'A4GALT', 'A4GNT', 'AAAS'],
      dtype='object', name='Gene')


Gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE125341.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the obtained gene expression data
# Reload the gene-level data saved in the previous step so this cell works from the file on disk.
gene_data = pd.read_csv(out_gene_data_file, index_col=0)
print(f"Loaded gene data shape: {gene_data.shape}")

# Normalize gene symbols
gene_data_normalized = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {gene_data_normalized.shape}")

# Save the normalized gene expression data (overwrites the un-normalized file from Step 6)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data_normalized.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Link the clinical and genetic data
# From Step 2, we determined that trait_row is None (no trait data available),
# so for this cohort the `else` branch below is the one that runs.
if trait_row is not None:
    # Extract the clinical features using the function from the library
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Save the clinical data
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")

    # Link the clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(selected_clinical_df, gene_data_normalized)
    print(f"Linked data shape: {linked_data.shape}")

    # 3. Handle missing values systematically
    linked_data = handle_missing_values(linked_data, trait)
    print(f"Linked data shape after handling missing values: {linked_data.shape}")

    # 4. Determine whether the trait and demographic features are biased
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

    # 5. Conduct final quality validation
    # The note is persisted with the cohort metadata, so it must describe THIS cohort:
    # per the series summary, the samples are LNCaP cell line cultures, not patient tissue.
    note = "Dataset contains gene expression data from the LNCaP prostate cancer cell line (YB-1 target profiling). Successfully mapped probe IDs to gene symbols."

    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_biased,
        df=linked_data,
        note=note
    )

    # 6. Save the linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")
    else:
        print("Dataset deemed not usable - linked data not saved")
else:
    print("No trait data available. Can't proceed with linking clinical and genetic data.")

    # Placeholder one-row dataframe passed to the final validation call.
    # NOTE(review): the original author states a DataFrame is required when is_final=True —
    # confirm against validate_and_save_cohort_info.
    empty_df = pd.DataFrame({"dummy": [0]})

    # Since trait data is not available, the dataset is not usable for our purposes.
    # is_biased=True is passed to mark the cohort as unusable.
    # The note describes this cohort as given in the series summary (LNCaP cells profiled
    # under FBS, CSS and DHT conditions).
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=False,
        is_biased=True,  # Setting is_biased to True since missing trait data makes dataset unusable
        df=empty_df,
        note="No trait data available in this dataset. Contains gene expression from the LNCaP prostate cancer cell line profiled for YB-1 targets under FBS, CSS and DHT culture conditions."
    )
    print("Dataset deemed not usable due to missing trait data.")

Loaded gene data shape: (23209, 37)
Gene data shape after normalization: (20709, 37)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE125341.csv
No trait data available. Can't proceed with linking clinical and genetic data.
Abnormality detected in the cohort: GSE125341. Preprocessing failed.
Dataset deemed not usable due to missing trait data.
