In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Prostate_Cancer"
cohort = "GSE201805"

# Input paths
in_trait_dir = "../../input/GEO/Prostate_Cancer"
in_cohort_dir = "../../input/GEO/Prostate_Cancer/GSE201805"

# Output paths
out_data_file = "../../output/preprocess/Prostate_Cancer/GSE201805.csv"
out_gene_data_file = "../../output/preprocess/Prostate_Cancer/gene_data/GSE201805.csv"
out_clinical_data_file = "../../output/preprocess/Prostate_Cancer/clinical_data/GSE201805.csv"
json_path = "../../output/preprocess/Prostate_Cancer/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Transcriptional profiling of primary prostate tumor in metastatic hormonesensitive prostate cancer and association with clinical outcomes: correlative analysis of the E3805 CHAARTED study"
!Series_summary	"Gene expression study of the ECOG 3805 randomized controlled trial"
!Series_overall_design	"Retrospective analysis of 160 tumor transcriptomes from ECOG 3805."
Sample Characteristics Dictionary:
{0: ['primary gleason: 5', 'primary gleason: 4', 'primary gleason: 3', 'pre-treatment psa (ng/ml): 150', 'pre-treatment psa (ng/ml): 182.1', 'pre-treatment psa (ng/ml): 336.2', 'primary gleason: 2', 'pre-treatment psa (ng/ml): 19.5'], 1: ['secondary gleason: 5', 'secondary gleason: 4', 'secondary gleason: 3', 'age: 69', 'age: 59', 'pre-treatment psa (ng/ml): 6.7', 'age: 49', 'pre-treatment psa (ng/ml): 100', 'age: 70', 'pre-treatment psa (ng/ml): 108'], 2: ['pre-treatment psa (ng/ml): 644', 'pre-treatment psa (ng/ml): 177.9', 'pre-treatment psa (ng/ml): 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import numpy as np
import re

# 1. Gene Expression Data Availability
# Based on the background information, this dataset contains tumor transcriptomes
# which implies gene expression data is available
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# This dataset is from a clinical trial (ECOG 3805) where all subjects have prostate cancer
# There's no control group for trait comparison, so trait data is not available for case-control analysis
trait_row = None  # All samples are prostate cancer cases, no controls for comparison

# Age: Available at indices 1 and 3
age_row = 3  # Row 3 has more age entries

# Gender: Not explicitly mentioned, but since this is prostate cancer, all subjects are male
gender_row = None  # All subjects are male (implied by prostate cancer)

# 2.2 Data Type Conversion
def convert_trait(value):
    # Not needed as trait_row is None
    return None

def convert_age(value):
    if pd.isna(value):
        return None
    # Extract the numeric age value after the colon
    match = re.search(r'age:\s*(\d+)', value)
    if match:
        return int(match.group(1))
    return None

def convert_gender(value):
    # Not needed as gender_row is None
    return None

# 3. Save Metadata
# is_trait_available is False since trait_row is None (all samples are cases)
is_trait_available = False if trait_row is None else True
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Since trait_row is None, we cannot perform clinical feature extraction
# We need to skip this step as per the instructions


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again 
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE201805/GSE201805_series_matrix.txt.gz


Gene data shape: (22011, 160)
First 20 gene/probe identifiers:
Index(['2315554', '2315633', '2315674', '2315739', '2315894', '2315918',
       '2315951', '2316218', '2316245', '2316379', '2316558', '2316605',
       '2316746', '2316905', '2316953', '2317246', '2317317', '2317434',
       '2317472', '2317512'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Looking at the gene identifiers in the gene expression data
# These appear to be probe IDs (numeric identifiers), not standard human gene symbols
# Human gene symbols are typically alphanumeric like "TP53", "BRCA1", etc.
# These numeric identifiers need to be mapped to standard gene symbols

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Let's look for platform information in the SOFT file to understand the annotation better
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
        if i > 100:  # Limit search to first 100 lines
            print("Platform ID not found in first 100 lines")
            break

# Check if the SOFT file includes any reference to gene symbols
print("\nSearching for gene symbol information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    gene_symbol_lines = []
    for i, line in enumerate(f):
        if 'GENE_SYMBOL' in line or 'gene_symbol' in line.lower() or 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
        if i > 1000 and len(gene_symbol_lines) > 0:  # Limit search but ensure we found something
            break
    
    if gene_symbol_lines:
        print("Found references to gene symbols:")
        for line in gene_symbol_lines[:5]:  # Show just first 5 matches
            print(line)
    else:
        print("No explicit gene symbol references found in first 1000 lines")

# Look for alternative annotation files or references in the directory
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
print([f for f in all_files if 'annotation' in f.lower() or 'platform' in f.lower() or 'gpl' in f.lower()])



Gene annotation preview:
Columns in gene annotation: ['ID', 'GB_LIST', 'SPOT_ID', 'seqname', 'RANGE_GB', 'RANGE_STRAND', 'RANGE_START', 'RANGE_STOP', 'total_probes', 'gene_assignment', 'mrna_assignment', 'category']
{'ID': ['2315100', '2315106', '2315109', '2315111', '2315113'], 'GB_LIST': ['NR_024005,NR_034090,NR_024004,AK093685', 'DQ786314', nan, nan, 'DQ786265'], 'SPOT_ID': ['chr1:11884-14409', 'chr1:14760-15198', 'chr1:19408-19712', 'chr1:25142-25532', 'chr1:27563-27813'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': ['11884', '14760', '19408', '25142', '27563'], 'RANGE_STOP': ['14409', '15198', '19712', '25532', '27813'], 'total_probes': ['20', '8', '4', '4', '4'], 'gene_assignment': ['NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu

No explicit gene symbol references found in first 1000 lines

Checking for additional annotation files in the directory:
[]


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify which columns contain gene identifiers and gene symbols
# From the gene annotation preview, we can see:
# - 'ID' column contains numeric identifiers that match the gene expression data's index
# - 'gene_assignment' column contains gene symbols embedded in detailed annotations

# Define a better function to extract gene symbols from gene_assignment field
def extract_gene_symbols(assignment):
    if pd.isna(assignment) or assignment == '---':
        return []
    
    gene_symbols = []
    # Split by /// which separates different gene entries
    entries = assignment.split('///')
    
    for entry in entries:
        # Split by // which separates fields within each entry
        parts = entry.strip().split('//')
        if len(parts) >= 2:
            # The gene symbol should be in the second position, usually after RefSeq ID
            symbol = parts[1].strip()
            # Exclude common artifacts and non-gene entries
            if symbol and symbol != '---' and len(symbol) < 20 and not symbol.startswith(('NR_', 'XR_', 'AK', 'BC')):
                gene_symbols.append(symbol)
    
    return gene_symbols

# Check how the function works on a sample
sample_assignment = gene_annotation['gene_assignment'].dropna().iloc[0]
print("Example gene_assignment value:")
print(sample_assignment)
print("\nExtracted gene symbols:")
print(extract_gene_symbols(sample_assignment))

# Create a mapping DataFrame
mapping_df = pd.DataFrame()
mapping_df['ID'] = gene_annotation['ID'].astype(str)  # Ensure IDs are strings
mapping_df['Gene'] = gene_annotation['gene_assignment'].apply(extract_gene_symbols)

# Filter out rows without gene symbols
mapping_df = mapping_df[mapping_df['Gene'].apply(len) > 0]

print(f"\nMapping dataframe shape: {mapping_df.shape}")
print("Sample of mapping data:")
print(mapping_df.head())

# Check a sample of IDs from gene_data
print("\nSample IDs from gene_data:")
print(gene_data.index[:5])

# Check if any of the IDs in the mapping dataframe exist in the gene_data index
overlap_count = sum(mapping_df['ID'].isin(gene_data.index.astype(str)))
print(f"Number of probe IDs that overlap with gene expression data: {overlap_count}")

# Debug: Check if IDs are correctly matched by examining a few specific examples
sample_ids = gene_data.index[:5].tolist()
for sample_id in sample_ids:
    print(f"\nChecking ID {sample_id}:")
    matching_rows = mapping_df[mapping_df['ID'] == str(sample_id)]
    print(f"Found {len(matching_rows)} matching rows in mapping_df")
    if not matching_rows.empty:
        print(matching_rows)

# 3. Convert probe-level measurements to gene expression data
# Use the library function to apply gene mapping
gene_data = apply_gene_mapping(gene_data, mapping_df)

print(f"\nMapped gene expression data shape: {gene_data.shape}")
print("First few genes in expression data:")
if gene_data.shape[0] > 0:
    print(gene_data.index[:10])
else:
    print("No genes found in expression data")

# Normalize gene symbols (if mapping produced results)
if gene_data.shape[0] > 0:
    gene_data = normalize_gene_symbols_in_index(gene_data)
    print(f"After normalization, gene expression data shape: {gene_data.shape}")

# Save the gene data to the output file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Example gene_assignment value:
NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 9 // 15q26.3 // 100288486 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// AK093685 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771

Extracted gene symbols:
['DDX11L2', 'DDX11L9', 'DDX11L2', 'DDX11L2']



Mapping dataframe shape: (33300, 2)
Sample of mapping data:
         ID                                               Gene
0   2315100               [DDX11L2, DDX11L9, DDX11L2, DDX11L2]
10  2315125  [OR4F17, OR4F4, OR4F5, OR4F17, OR4F4, OR4F5, O...
14  2315147         [LOC100288692, LOC100289383, LOC100506283]
15  2315160                 [FLJ45445, FLJ45445, LOC100133161]
16  2315163  [LOC100132062, LOC728417, LOC100133331, NCRNA0...

Sample IDs from gene_data:
Index(['2315554', '2315633', '2315674', '2315739', '2315894'], dtype='object', name='ID')
Number of probe IDs that overlap with gene expression data: 17558

Checking ID 2315554:
Found 1 matching rows in mapping_df
          ID                                               Gene
122  2315554  [TTLL10, TTLL10, MIR200A, MIR200B, MIR429, TTL...

Checking ID 2315633:
Found 1 matching rows in mapping_df
          ID                         Gene
126  2315633  [B3GALT6, B3GALT6, B3GALT6]

Checking ID 2315674:
Found 1 matching rows in ma

### Step 7: Data Normalization and Linking

In [8]:
# 1. We need to re-run the mapping process since it didn't work in the previous attempt

# First, let's redefine gene_data from the matrix file since we may have overwritten it with empty data
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_data = get_genetic_data(matrix_file)
print(f"Original gene data shape: {gene_data.shape}")

# Better function to extract human gene symbols from gene_assignment
def extract_symbol_from_assignment(assignment):
    """Extract human gene symbols from gene_assignment string in GEO annotation."""
    if pd.isna(assignment) or assignment == '---':
        return []
    
    # Use regex to find gene symbols in format "// SYMBOL //"
    symbols = re.findall(r'//\s+([A-Z0-9-]{1,15})\s+//', assignment)
    
    # Filter out non-gene entries
    valid_symbols = [s for s in symbols if s not in ['RefSeq', 'GenBank', 'ENSEMBL']]
    
    return valid_symbols

# Create a mapping DataFrame
gene_annotation = get_gene_annotation(soft_file)
mapping_df = pd.DataFrame()
mapping_df['ID'] = gene_annotation['ID'].astype(str)
mapping_df['Gene'] = gene_annotation['gene_assignment'].apply(extract_symbol_from_assignment)

# Filter out rows without gene symbols
mapping_df = mapping_df[mapping_df['Gene'].apply(len) > 0]
print(f"Mapping dataframe shape: {mapping_df.shape}")
print("Sample of mapping data:")
print(mapping_df.head())

# Apply the mapping
gene_data_mapped = apply_gene_mapping(gene_data, mapping_df)
print(f"Mapped gene expression data shape: {gene_data_mapped.shape}")

# 1. Normalize gene symbols 
if gene_data_mapped.shape[0] > 0:
    gene_data_normalized = normalize_gene_symbols_in_index(gene_data_mapped)
    print(f"Gene data shape after normalization: {gene_data_normalized.shape}")
    
    # Save the normalized gene expression data
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    gene_data_normalized.to_csv(out_gene_data_file)
    print(f"Normalized gene expression data saved to {out_gene_data_file}")
else:
    # If mapping failed, try using the original gene data
    print("Mapping process failed to identify genes. Using original identifiers.")
    gene_data_normalized = gene_data
    
    # Save the original gene expression data
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    gene_data_normalized.to_csv(out_gene_data_file)
    print(f"Original gene expression data saved to {out_gene_data_file}")

# 2. We know from Step 2 that trait_row is None, so clinical data is not available
trait_available = False

# Create a simple dataset with only gene expression data
linked_data = gene_data_normalized.T
print(f"Linked data shape (gene expression only): {linked_data.shape}")

# 3. We don't have trait data, so we can't filter based on trait
# Just handle missing values in the gene expression data
linked_data = linked_data.fillna(linked_data.mean())
print(f"Linked data shape after handling missing values: {linked_data.shape}")

# 4. Since we don't have trait data, all samples are prostate cancer cases
# This makes the dataset biased for case-control studies
is_biased = True

# 5. Conduct final quality validation
note = "Dataset contains gene expression data from prostate cancer tumor samples from the ECOG 3805 CHAARTED study. All samples are cancer cases without controls, making it unsuitable for case-control studies. However, the gene expression data is available and could be used for other types of analyses that don't require a control group."

is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=trait_available,
    is_biased=is_biased,
    df=linked_data,
    note=note
)

# 6. Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset deemed not usable for case-control studies - linked data not saved")

Original gene data shape: (22011, 160)


Mapping dataframe shape: (33240, 2)
Sample of mapping data:
         ID                                               Gene
0   2315100  [DDX11L2, 84771, DDX11L9, 100288486, DDX11L2, ...
10  2315125  [OR4F17, 81099, OR4F4, 26682, OR4F5, 79501, OR...
14  2315147  [LOC100288692, 100288692, LOC100289383, 100289...
15  2315160  [FLJ45445, 399844, FLJ45445, 399844, LOC100133...
16  2315163  [LOC100132062, 100132062, LOC728417, 728417, L...
Mapped gene expression data shape: (0, 160)
Mapping process failed to identify genes. Using original identifiers.


Original gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE201805.csv
Linked data shape (gene expression only): (160, 22011)


Linked data shape after handling missing values: (160, 22011)
Dataset deemed not usable for case-control studies - linked data not saved
