In [1]:
import gzip
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Dataset being processed: the trait of interest and the GEO series accession
trait = "Prostate_Cancer"
cohort = "GSE259218"

# Input locations: the trait-level folder and this cohort's subfolder inside it
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output locations: every artifact is written under one trait-level folder
_out_root = f"../../output/preprocess/{trait}"
out_data_file = f"{_out_root}/{cohort}.csv"
out_gene_data_file = f"{_out_root}/gene_data/{cohort}.csv"
out_clinical_data_file = f"{_out_root}/clinical_data/{cohort}.csv"
json_path = f"{_out_root}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Locate the two input files for this cohort: the SOFT file (read later for probe
# annotation) and the series matrix file (read here and later for expression data).
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Matrix-file line prefixes that carry series-level background text ...
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
# ... and those that carry per-sample accessions and characteristics.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarize the clinical dataframe row by row via the project helper
# (presumably one entry of unique values per characteristics row - confirm in tools.preprocess).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of metadata so the next step can decide which variables are usable.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Spatial analysis of microRNA regulation at defined tumor hypoxia levels reveals biological traits of aggressive prostate cancer"
!Series_summary	"Mechanisms regulating the gene expression program at different hypoxia severity levels in patient tumors are not understood. We aimed to determine microRNA (miRNA) regulation of this program at defined hypoxia levels from moderate to severe in prostate cancer. Biopsies from 95 patients were used, where 83 patients received the hypoxia marker pimonidazole before prostatectomy. Forty hypoxia levels were extracted from pimonidazole-stained histological sections and correlated with miRNA and gene expression profiles determined by RNA-sequencing and Illumina bead arrays. This identified miRNAs associated with moderate (n=7) and severe (n=28) hypoxia and predicted their target genes. Scores of miRNAs or target genes showed prognostic significance, as validated in external cohort of 417 patients. The target gen

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Decide whether this cohort offers gene expression data and usable clinical variables,
# then record that initial assessment in the cohort metadata file.

# 1. Gene Expression Data Availability
# The series summary describes miRNA AND gene expression profiles, the latter measured
# on Illumina bead arrays, so gene expression data is treated as available. This also
# keeps the flag consistent with the final validation step, which reports
# is_gene_available=True for this cohort.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# Looking at the sample characteristics dictionary, we have:
# - No direct trait information (prostate cancer status)
# - No age information
# - No gender information
# The study is about cell lines (PC-3 and 22Rv1) under different oxygen conditions, not human patients

# Trait: no characteristics row encodes it, so there is nothing to convert.
trait_row = None
def convert_trait(value):
    """Placeholder converter: the trait is not recorded in this cohort, so always return None."""
    return None

# Age: not recorded.
age_row = None
def convert_age(value):
    """Placeholder converter: age is not recorded in this cohort, so always return None."""
    return None

# Gender: not recorded.
gender_row = None
def convert_gender(value):
    """Placeholder converter: gender is not recorded in this cohort, so always return None."""
    return None

# 3. Save Metadata
# Perform initial (non-final) filtering on the usability of the dataset.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Skipped: trait_row is None, so there are no clinical features to extract
# (geo_select_clinical_features is intentionally not called).


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Resolve the SOFT and matrix file paths again so this cell does not depend on
#    the variables from Step 1 still being in the kernel.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Load the expression matrix with the project helper.
#    Every later step needs `gene_data`, so a failure is logged and then re-raised
#    instead of being swallowed: continuing would only surface later as a NameError
#    or, worse, silently reuse a stale `gene_data` from an earlier run.
try:
    gene_data = get_genetic_data(matrix_file)
except Exception as e:
    print(f"Error extracting gene data: {e}")
    raise

print(f"Gene data shape: {gene_data.shape}")

# 3. Print the first 20 row IDs (gene or probe identifiers) for the identifier review.
print("First 20 gene/probe identifiers:")
print(gene_data.index[:20])


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE259218/GSE259218_series_matrix.txt.gz


Gene data shape: (47323, 48)
First 20 gene/probe identifiers:
Index(['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209',
       'ILMN_1651210', 'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229',
       'ILMN_1651230', 'ILMN_1651232', 'ILMN_1651235', 'ILMN_1651236',
       'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253',
       'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651262'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 all carry the "ILMN_" prefix: they are Illumina probe
# identifiers rather than human gene symbols, so a probe-to-symbol mapping step is
# required before the data can be analyzed at gene level.
requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Load the probe annotation table from the SOFT file with the project helper.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation so the probe-ID and gene-symbol columns can be identified.
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Hard limits for the header scans below. SOFT files can be very large, so the scans
# must stop after a fixed number of lines whether or not anything was found.
PLATFORM_SCAN_LINES = 100
SYMBOL_SCAN_LINES = 1000
MAX_SYMBOL_MATCHES = 5  # only this many matching lines are displayed

# Look for the platform accession line near the top of the SOFT file.
print("\nSearching for platform information in SOFT file:")
platform_line = None
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if i >= PLATFORM_SCAN_LINES:
            break
        if '!Series_platform_id' in line:
            platform_line = line.strip()
            break
if platform_line is not None:
    print(platform_line)
else:
    # Also reached when the file has fewer lines than the limit.
    print(f"Platform ID not found in first {PLATFORM_SCAN_LINES} lines")

# Check whether the SOFT header mentions gene symbols at all.
# A case-insensitive search for "symbol" also covers "GENE_SYMBOL" / "gene_symbol".
print("\nSearching for gene symbol information in SOFT file:")
gene_symbol_lines = []
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if i >= SYMBOL_SCAN_LINES or len(gene_symbol_lines) >= MAX_SYMBOL_MATCHES:
            break
        if 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())

if gene_symbol_lines:
    print("Found references to gene symbols:")
    for line in gene_symbol_lines:
        print(line)
else:
    print(f"No explicit gene symbol references found in first {SYMBOL_SCAN_LINES} lines")

# Look for alternative annotation files shipped alongside the cohort data.
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
annotation_keywords = ('annotation', 'platform', 'gpl')
print([f for f in all_files if any(key in f.lower() for key in annotation_keywords)])



Gene annotation preview:
Columns in gene annotation: ['ID', 'Species', 'Source', 'Search_Key', 'Transcript', 'ILMN_Gene', 'Source_Reference_ID', 'RefSeq_ID', 'Unigene_ID', 'Entrez_Gene_ID', 'GI', 'Accession', 'Symbol', 'Protein_Product', 'Probe_Id', 'Array_Address_Id', 'Probe_Type', 'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation', 'Probe_Coordinates', 'Cytoband', 'Definition', 'Ontology_Component', 'Ontology_Process', 'Ontology_Function', 'Synonyms', 'Obsolete_Probe_Id', 'GB_ACC']
{'ID': ['ILMN_1343048', 'ILMN_1343049', 'ILMN_1343050', 'ILMN_1343052', 'ILMN_1343059'], 'Species': [nan, nan, nan, nan, nan], 'Source': [nan, nan, nan, nan, nan], 'Search_Key': [nan, nan, nan, nan, nan], 'Transcript': [nan, nan, nan, nan, nan], 'ILMN_Gene': [nan, nan, nan, nan, nan], 'Source_Reference_ID': [nan, nan, nan, nan, nan], 'RefSeq_ID': [nan, nan, nan, nan, nan], 'Unigene_ID': [nan, nan, nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan, nan, nan], 'GI': [nan, nan, nan, nan, nan], '

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Pick the annotation columns used to build the probe -> gene mapping:
#    'ID' holds the ILMN_* probe identifiers that match the gene_data index, and
#    'Symbol' holds the gene symbols (both visible in the Step 5 annotation preview).
prob_col, gene_col = "ID", "Symbol"
print(f"Using {prob_col} as the probe identifier column and {gene_col} as the gene symbol column")

# 2. Build the mapping dataframe from the annotation table.
gene_mapping = get_gene_mapping(gene_annotation, prob_col, gene_col)
print(f"Gene mapping dataframe shape: {gene_mapping.shape}")
print("First few rows of gene mapping:", gene_mapping.head(), sep="\n")

# 3. Convert probe-level measurements into gene-level expression.
#    NOTE: this overwrites `gene_data`, so the cell is not idempotent - re-run Step 3
#    first if this cell has to be executed again.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
print(f"Gene expression dataframe shape after mapping: {gene_data.shape}")
print("First few gene symbols after mapping:", gene_data.index[:10], sep="\n")

# 4. Bring the gene symbols in the index to their standard form.
gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Gene expression dataframe shape after normalization: {gene_data.shape}")
print("First few normalized gene symbols:", gene_data.index[:10], sep="\n")


Using ID as the probe identifier column and Symbol as the gene symbol column
Gene mapping dataframe shape: (44837, 2)
First few rows of gene mapping:
             ID                     Gene
0  ILMN_1343048      phage_lambda_genome
1  ILMN_1343049      phage_lambda_genome
2  ILMN_1343050  phage_lambda_genome:low
3  ILMN_1343052  phage_lambda_genome:low
4  ILMN_1343059                     thrB


Gene expression dataframe shape after mapping: (21464, 48)
First few gene symbols after mapping:
Index(['A1BG', 'A1CF', 'A26C3', 'A2BP1', 'A2LD1', 'A2M', 'A2ML1', 'A3GALT2',
       'A4GALT', 'A4GNT'],
      dtype='object', name='Gene')
Gene expression dataframe shape after normalization: (20259, 48)
First few normalized gene symbols:
Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2', 'A4GALT',
       'A4GNT', 'AAA1', 'AAAS'],
      dtype='object', name='Gene')


### Step 7: Data Normalization and Linking

In [8]:
# 1. Persist the gene-level expression matrix.
#    Gene symbols were already normalized in the previous step, so no further
#    normalization is applied here.
print(f"Gene data shape: {gene_data.shape}")

os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Derive the availability flags from the actual state instead of hard-coding them:
#    - the trait is available only if Step 2 identified a characteristics row for it;
#    - gene data is available only if the mapped expression matrix is non-empty.
# NOTE(review): if Step 2 ever sets trait_row, clinical features must be extracted and
# linked to the expression data here before validation - this cell does not do that.
trait_available = trait_row is not None
gene_available = not gene_data.empty

# 3 & 4. There is no clinical data to link, so the "linked" data is simply the
# expression matrix transposed (samples become rows after the transpose).
linked_data = gene_data.T
linked_data_clean = linked_data

# Without trait information the dataset cannot support trait association analysis,
# so it is flagged as biased for the purposes of the final validation.
is_biased = True

# 5. Conduct final quality validation and record the outcome in the cohort metadata.
note = "Dataset contains gene expression data from prostate cancer cell lines (PC-3 and 22Rv1) under different oxygen conditions. No human patient trait information available for associative analysis."

is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=gene_available,
    is_trait_available=trait_available,
    is_biased=is_biased,
    df=linked_data_clean,
    note=note
)

# 6. Save the linked data only if the cohort passed validation; otherwise report the
#    actual reason rather than a generic quality message.
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data_clean.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
elif not trait_available:
    print("Dataset deemed not usable because no trait information is available - linked data not saved")
else:
    print("Dataset deemed not usable due to quality issues - linked data not saved")

Gene data shape: (20259, 48)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE259218.csv
Dataset deemed not usable due to quality issues - linked data not saved
