In [1]:
import gzip
import math
import os
import sys
from itertools import islice
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being preprocessed.
# Every path below is derived from these two names, so they only need changing here.
trait = "Prostate_Cancer"
cohort = "GSE192817"

# Input paths
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_trait_dir = f"../../output/preprocess/{trait}"
out_data_file = f"{out_trait_dir}/{cohort}.csv"
out_gene_data_file = f"{out_trait_dir}/gene_data/{cohort}.csv"
out_clinical_data_file = f"{out_trait_dir}/clinical_data/{cohort}.csv"
json_path = f"{out_trait_dir}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Resolve the SOFT file and the series matrix file for this cohort
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Pull series-level background lines and per-sample lines out of the matrix file,
#    selected by the line prefixes listed here
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both pieces of information, each under its own heading
for heading, content in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(content)


Background Information:
!Series_title	"Cellular plasticity upon proton irradiation determines tumor cell radiosensitivity"
!Series_summary	"Novel particle therapy was implemented into standard-of-care for cancer patients during the last years. However, experimental studies investigating cellular and molecular mechanisms are lacking although prognostic biomarker are urgently needed. The cancer stem cell (CSC)-related marker aldehyde dehydrogenase (ALDH) is known to impact on prostate cancer radiosensitivity through affecting defense against reactive oxygen species (ROS), reducing DNA damage repair and increasing cell survival. Surprisingly, we could show in a previous study that ionizing radiation itself enriches for ALDH-positive CSCs in a time- and dose-dependent manner through alteration in histone methylation.  Within the present study, we investigated CSC marker dynamics upon proton beam irradiation and hypothesized that this novel particle therapy may have increased CSC targeting 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import json
from typing import Callable, Optional, Dict, Any, Union

# 1. Gene Expression Data Availability
# The background information describes a study comparing treatment effects, so the
# dataset is treated as containing gene expression data.
is_gene_available = True

# 2.1 Data Availability
# This appears to be a cell line study with different radiation treatments: the sample
# characteristics carry no per-patient prostate cancer status, age, or gender, so none
# of the three rows can be identified.
trait_row = age_row = gender_row = None

# 2.2 Data Type Conversion functions (defining them even though they won't be used in this case)
def convert_trait(value: str) -> Optional[int]:
    """Placeholder trait converter for a cohort without trait information.

    Args:
        value: Raw sample-characteristics text, or None.

    Returns:
        Always None; no trait encoding is defined for this dataset.
    """
    if value is not None and ':' in value:
        # Drop the "key:" prefix, mirroring the other converters; the result is unused.
        value = value.split(':', 1)[1].strip()
    # Not used in this dataset, so there is nothing to encode.
    return None

def convert_age(value: str) -> Optional[float]:
    """Convert an age entry to a continuous value (if available).

    Args:
        value: Raw text such as "age: 65" or "65"; may be None.

    Returns:
        The age as a float, or None when the entry is missing, is not numeric,
        or parses to a non-finite number.
    """
    if value is None:
        return None
    if ':' in value:
        # Keep only the text after the first colon ("age: 65" -> "65").
        value = value.split(':', 1)[1].strip()
    try:
        age = float(value)
    except (TypeError, ValueError):
        # Only parsing failures mean "no age"; anything else should surface.
        return None
    # float() accepts "nan" and "inf"; neither is a usable age, so treat them as missing.
    return age if math.isfinite(age) else None

def convert_gender(value: str) -> Optional[int]:
    """Convert a gender entry to a binary value (if available).

    Args:
        value: Raw text such as "gender: Female", "male", or "F"; may be None.

    Returns:
        0 for female, 1 for male, None when the entry is missing or unrecognized.
    """
    if value is None:
        return None
    if ':' in value:
        # Keep only the text after the first colon ("gender: Female" -> " Female").
        value = value.split(':', 1)[1]
    # Normalize case and whitespace for every input, not only "key: value" ones;
    # otherwise "Female" fails the 'female' test and then matches 'male'.
    label = value.strip().lower()
    # 'female' must be tested first because it contains 'male'.
    if 'female' in label or label == 'f':
        return 0
    if 'male' in label or label == 'm':
        return 1
    return None

# 3. Save Metadata
# Trait data counts as available only when a trait row was identified.
is_trait_available = trait_row is not None

# Initial filtering pass (is_final=False): validate and save the cohort information
# using the two availability flags.
validate_and_save_cohort_info(
    cohort=cohort,
    info_path=json_path,
    is_final=False,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction
# Nothing to extract here: trait_row is None, so this step is skipped.


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Look up the SOFT and matrix file paths once more
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Read the gene data from the matrix file with the library helper, then
# 3. report its shape and the first 20 row IDs (gene or probe identifiers).
#    Any failure is reported as a message instead of stopping the notebook.
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")

    leading_ids = gene_data.index[:20]
    print("First 20 gene/probe identifiers:")
    print(leading_ids)
except Exception as err:
    print(f"Error extracting gene data: {err}")


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE192817/GSE192817_series_matrix.txt.gz


Gene data shape: (62976, 52)
First 20 gene/probe identifiers:
Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The expression matrix is indexed by plain row numbers ('1', '2', '3', ...) rather than
# by standard human gene symbols such as BRCA1 or TP53.
# These identifiers therefore have to be mapped to gene symbols before the data can be
# interpreted biologically.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation dataframe to identify which columns contain the probe
#    identifiers and which contain the gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Look for the platform ID near the top of the SOFT file.
# The for/else reports "not found" both when the 100-line window is exhausted and when
# the file itself is shorter than that.
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for line in islice(f, 100):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
    else:
        print("Platform ID not found in first 100 lines")

# Check if the SOFT file includes any reference to gene symbols.
# A case-insensitive test for 'symbol' also covers 'GENE_SYMBOL' and 'gene_symbol'.
print("\nSearching for gene symbol information in SOFT file:")
max_matches_shown = 5
gene_symbol_lines = []
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
        # Stop as soon as there is enough to display, or once past line 1000 with at
        # least one match; with no match yet, keep reading to the end of the file.
        if len(gene_symbol_lines) >= max_matches_shown or (i > 1000 and gene_symbol_lines):
            break

if gene_symbol_lines:
    print("Found references to gene symbols:")
    for line in gene_symbol_lines[:max_matches_shown]:
        print(line)
else:
    # Reaching this branch means the entire file was read without a match.
    print("No explicit gene symbol references found in SOFT file")

# Look for alternative annotation files or references in the directory
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
print([name for name in all_files if 'annotation' in name.lower() or 'platform' in name.lower() or 'gpl' in name.lower()])



Gene annotation preview:
Columns in gene annotation: ['ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE', 'REFSEQ', 'GB_ACC', 'LOCUSLINK_ID', 'GENE_SYMBOL', 'GENE_NAME', 'UNIGENE_ID', 'ENSEMBL_ID', 'TIGR_ID', 'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION', 'GO_ID', 'SEQUENCE', 'SPOT_ID.1']
{'ID': ['1', '2', '3', '4', '5'], 'COL': ['192', '192', '192', '192', '192'], 'ROW': [328.0, 326.0, 324.0, 322.0, 320.0], 'NAME': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872'], 'SPOT_ID': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872'], 'CONTROL_TYPE': ['pos', 'pos', 'pos', 'FALSE', 'FALSE'], 'REFSEQ': [nan, nan, nan, nan, 'NM_001105533'], 'GB_ACC': [nan, nan, nan, nan, 'NM_001105533'], 'LOCUSLINK_ID': [nan, nan, nan, nan, 79974.0], 'GENE_SYMBOL': [nan, nan, nan, nan, 'CPED1'], 'GENE_NAME': [nan, nan, nan, nan, 'cadherin-like and PC-esterase domain containing 1'], 'UNIGENE_ID': [nan, nan, nan, nan, 'Hs.18

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Columns used for mapping
# The annotation's 'ID' column matches the index of gene_data, and 'GENE_SYMBOL'
# holds the gene symbols; print the first few of each side as a check.
gene_data_ids_head = gene_data.index[:5].tolist()
annotation_ids_head = gene_annotation['ID'].iloc[:5].tolist()
print("\nExamining key columns for mapping:")
print(f"Gene data index (first 5): {gene_data_ids_head}")
print(f"Gene annotation ID column (first 5): {annotation_ids_head}")

# 2. Build the gene mapping dataframe from those two columns
gene_mapping = get_gene_mapping(
    gene_annotation,
    prob_col='ID',
    gene_col='GENE_SYMBOL',
)
print(f"\nGene mapping shape: {gene_mapping.shape}")
print("Gene mapping preview:")
print(preview_df(gene_mapping, n=5))

# 3. Convert probe-level data to gene expression data with the mapping
gene_data = apply_gene_mapping(gene_data, gene_mapping)
print(f"\nMapped gene expression data shape: {gene_data.shape}")
print("First 10 gene symbols after mapping:")
print(gene_data.index[:10].tolist())

# 4. Write the gene expression data, creating the output directory when needed
gene_output_dir = os.path.dirname(out_gene_data_file)
os.makedirs(gene_output_dir, exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")



Examining key columns for mapping:
Gene data index (first 5): ['1', '2', '3', '4', '5']
Gene annotation ID column (first 5): ['1', '2', '3', '4', '5']

Gene mapping shape: (51544, 2)
Gene mapping preview:
{'ID': ['5', '6', '7', '8', '12'], 'Gene': ['CPED1', 'BCOR', 'CHAC2', 'IFI30', 'GPR146']}



Mapped gene expression data shape: (29222, 52)
First 10 gene symbols after mapping:
['A1BG', 'A1BG-AS1', 'A1CF', 'A1CF-2', 'A1CF-3', 'A2M', 'A2M-1', 'A2M-AS1', 'A2ML1', 'A2MP1']


Gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE192817.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the obtained gene expression data
# Reload the gene data saved in the previous step
gene_data = pd.read_csv(out_gene_data_file, index_col=0)
print(f"Loaded gene data shape: {gene_data.shape}")

# Normalize gene symbols
gene_data_normalized = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {gene_data_normalized.shape}")

# Save the normalized gene expression data (overwrites the file loaded above)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data_normalized.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Link the clinical and genetic data
# From Step 2, we determined that trait_row is None (no trait data available),
# so only the else-branch runs for this cohort.
if trait_row is not None:
    # Extract the clinical features using the function from the library
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Save the clinical data
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")

    # Link the clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(selected_clinical_df, gene_data_normalized)
    print(f"Linked data shape: {linked_data.shape}")

    # 3. Handle missing values systematically
    linked_data = handle_missing_values(linked_data, trait)
    print(f"Linked data shape after handling missing values: {linked_data.shape}")

    # 4. Determine whether the trait and demographic features are biased
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

    # 5. Conduct final quality validation
    # The note describes this cohort as it actually is (cell lines under different
    # radiation treatments), matching the note used in the else-branch below.
    note = "Dataset contains gene expression data from cell lines with different radiation treatments. Successfully mapped probe IDs to gene symbols."

    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_biased,
        df=linked_data,
        note=note
    )

    # 6. Save the linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")
    else:
        print("Dataset deemed not usable - linked data not saved")
else:
    print("No trait data available. Can't proceed with linking clinical and genetic data.")

    # Create a minimal one-row placeholder dataframe for validation purposes
    # (since we need a valid DataFrame when is_final=True)
    empty_df = pd.DataFrame({"dummy": [0]})

    # Since trait data is not available, the dataset is not usable for our purposes
    # We pass is_biased=True to indicate unusable data
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=False,
        is_biased=True,  # Setting is_biased to True since missing trait data makes dataset unusable
        df=empty_df,
        note="No trait data available in this dataset. Contains gene expression from cell lines with different radiation treatments."
    )
    print("Dataset deemed not usable due to missing trait data.")

Loaded gene data shape: (29222, 52)


Gene data shape after normalization: (20778, 52)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE192817.csv
No trait data available. Can't proceed with linking clinical and genetic data.
Abnormality detected in the cohort: GSE192817. Preprocessing failed.
Dataset deemed not usable due to missing trait data.
