In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait/cohort pair identifies the dataset. Every path
# below is derived from these two names so they cannot drift out of sync when
# either one is changed.
trait = "Prostate_Cancer"
cohort = "GSE235003"

# Input paths
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Locate the cohort's SOFT file and series-matrix file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes passed to the reader: the first list selects series-level
# background text, the second selects per-sample accession/characteristics rows.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces so the trait/age/gender rows can be picked by inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Prostate cancer cell lines modulated with OC2 Protein or 3' UTR"
!Series_summary	"The transcription factor ONECUT2 (OC2) is a master transcriptional regulator operating in metastatic castrate-resistant prostate cancer (mCRPC) that suppresses AR activity and promotes neural differentiation and tumor cell survival. OC2 mRNA possesses an unusually long (14,575 nt), evolutionarily conserved 3’-untranslated region (3’-UTR) with many microRNA binding sites, including up to 26 miR-9 sites. This is notable because miR-9 targets many of the same genes regulated by the OC2 protein. Paradoxically, OC2 expression is high in tissues with high miR-9 expression. The length and complex secondary structure of the OC2 mRNA suggests it is a potent master competing endogenous RNA (ceRNA) capable of sequestering miRNAs. Here we describe a novel role for the OC2 3’-UTR in lethal prostate cancer consistent with a function as a ceRNA."
!Series_summary	"A plausible ceRNA 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import numpy as np
import os
import re

# Step 1: Gene Expression Data Availability
# The series title and summary describe prostate cancer cell lines profiled in a
# study of the ONECUT2 (OC2) transcription factor and its mRNA 3' UTR, so the
# cohort is treated as carrying gene expression data.
is_gene_available = True

# Step 2: Variable Availability and Data Type Conversion

# Trait (Prostate Cancer)
# Every sample is a prostate cancer cell line, so disease status itself does not vary.
# The "treated" field of the sample characteristics (Parent vs. Treated) is used as
# the trait instead.
# NOTE(review): this makes the "Prostate_Cancer" column encode treatment status rather
# than disease status - confirm this proxy is what downstream analysis expects.
trait_row = 2  # Row index of the "treated" field in the sample characteristics

def convert_trait(value):
    """Map a raw "treated" characteristic to a binary trait label.

    The text after the first colon (if any) is taken as the label and compared
    case-insensitively: 'parent' -> 0, 'treated' -> 1.

    Returns None for non-string input and for any unrecognised label.
    """
    if not isinstance(value, str):
        return None

    # Only the part after the first colon is stripped; a colon-free value is
    # compared as-is (apart from case folding).
    label = value.split(':', 1)[1].strip() if ':' in value else value

    return {'parent': 0, 'treated': 1}.get(label.lower())

# Age
# The sample characteristics carry no age field (the samples are cell lines),
# so no row is selected for it.
age_row = None

def convert_age(value):
    """Placeholder age converter: age is not available, so always returns None."""
    return None

# Gender
# The sample characteristics carry no gender field for these cell lines,
# so no row is selected for it.
gender_row = None

def convert_gender(value):
    """Placeholder gender converter: gender is not available, so always returns None."""
    return None

# Step 3: Save Metadata
# Trait data counts as available exactly when a trait row was identified above.
is_trait_available = trait_row is not None

# Record the preliminary (non-final) availability flags for this cohort.
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# Step 4: Clinical Feature Extraction
if is_trait_available:
    try:
        # Guard first: bail out with a message when the loading cell has not
        # populated `clinical_data` in this kernel.
        if 'clinical_data' not in locals() and 'clinical_data' not in globals():
            print("Clinical data not available in the current environment. Skipping clinical feature extraction.")
        else:
            # Pull the trait row (and age/gender rows, when present) out of the
            # raw clinical frame using the converters defined above.
            selected_clinical_df = geo_select_clinical_features(
                clinical_df=clinical_data,
                trait=trait,
                trait_row=trait_row,
                convert_trait=convert_trait,
                age_row=age_row,
                convert_age=convert_age,
                gender_row=gender_row,
                convert_gender=convert_gender,
            )

            # Show a compact preview of what was extracted.
            preview_data = preview_df(selected_clinical_df)
            print("Preview of selected clinical features:", preview_data, sep="\n")

            # Persist the clinical features, creating the target folder on demand.
            clinical_out_dir = os.path.dirname(out_clinical_data_file)
            os.makedirs(clinical_out_dir, exist_ok=True)
            selected_clinical_df.to_csv(out_clinical_data_file)
            print(f"Clinical data saved to {out_clinical_data_file}")
    except Exception as e:
        # Best-effort: report the failure and carry on without clinical features.
        print(f"Error in clinical feature extraction: {e}")
        print("Skipping clinical feature extraction.")


Preview of selected clinical features:
{'GSM7488455': [0.0], 'GSM7488456': [0.0], 'GSM7488457': [0.0], 'GSM7488458': [1.0], 'GSM7488459': [1.0], 'GSM7488460': [1.0], 'GSM7488461': [1.0], 'GSM7488462': [1.0], 'GSM7488463': [1.0], 'GSM7488464': [1.0], 'GSM7488465': [1.0], 'GSM7488466': [1.0], 'GSM7488467': [1.0], 'GSM7488468': [1.0], 'GSM7488469': [1.0], 'GSM7488470': [1.0], 'GSM7488471': [1.0], 'GSM7488472': [0.0], 'GSM7488473': [0.0], 'GSM7488474': [0.0], 'GSM7488475': [1.0], 'GSM7488476': [1.0], 'GSM7488477': [1.0], 'GSM7488478': [1.0], 'GSM7488479': [1.0], 'GSM7488480': [1.0], 'GSM7488481': [1.0], 'GSM7488482': [1.0], 'GSM7488483': [1.0], 'GSM7488484': [1.0], 'GSM7488485': [1.0], 'GSM7488486': [1.0], 'GSM7488487': [1.0], 'GSM7488488': [1.0], 'GSM7488489': [1.0]}
Clinical data saved to ../../output/preprocess/Prostate_Cancer/clinical_data/GSE235003.csv


### Step 3: Gene Data Extraction

In [4]:
# Resolve the SOFT and matrix file paths again for this cell.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# Read the expression matrix and show its size plus the first 20 row
# identifiers, which the next step inspects to decide whether mapping is needed.
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    print("First 20 gene/probe identifiers:", gene_data.index[:20], sep="\n")
except Exception as e:
    # Best-effort: report the failure instead of stopping the notebook here.
    print(f"Error extracting gene data: {e}")


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE235003/GSE235003_series_matrix.txt.gz


Gene data shape: (60901, 35)
First 20 gene/probe identifiers:
Index(['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16',
       '17', '18', '19', '20', '21', '22', '23'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The expression matrix is indexed by bare integers ('4', '5', '6', ...), not by
# standard human gene symbols (like BRCA1, TP53, etc.).
# They appear to be probe IDs or some other platform identifiers, so they need
# to be mapped to gene symbols before gene-level analysis.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# `gzip` is used below, but no visible cell imports it explicitly; import it
# here so this cell does not depend on what the star import happens to expose.
import gzip
from itertools import islice

# How many leading lines of the SOFT file each header scan may read. Both scans
# are strictly bounded so the "first N lines" messages below are accurate.
PLATFORM_SEARCH_LINES = 100
SYMBOL_SEARCH_LINES = 1000

# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation columns to identify which hold the probe
#    identifiers and which hold the gene symbols.
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Look for the platform ID near the top of the SOFT file. The for/else reports
# "not found" both when the limit is reached and when the file is shorter than
# the limit (the latter case was previously silent).
print("\nSearching for platform information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    for line in islice(f, PLATFORM_SEARCH_LINES):
        if '!Series_platform_id' in line:
            print(line.strip())
            break
    else:
        print(f"Platform ID not found in first {PLATFORM_SEARCH_LINES} lines")

# Check whether the SOFT header mentions gene symbols at all. A single
# case-insensitive 'symbol' test also covers 'GENE_SYMBOL' and 'gene_symbol'.
# The scan stops after SYMBOL_SEARCH_LINES lines whether or not anything was
# found; previously it kept reading the whole file when nothing matched.
print("\nSearching for gene symbol information in SOFT file:")
with gzip.open(soft_file, 'rt') as f:
    gene_symbol_lines = [
        line.strip()
        for line in islice(f, SYMBOL_SEARCH_LINES)
        if 'symbol' in line.lower()
    ]

if gene_symbol_lines:
    print("Found references to gene symbols:")
    for line in gene_symbol_lines[:5]:  # Show just first 5 matches
        print(line)
else:
    print(f"No explicit gene symbol references found in first {SYMBOL_SEARCH_LINES} lines")

# Look for alternative annotation files or references in the cohort directory.
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
annotation_keywords = ('annotation', 'platform', 'gpl')
print([name for name in all_files if any(key in name.lower() for key in annotation_keywords)])



Gene annotation preview:
Columns in gene annotation: ['ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE', 'REFSEQ', 'GB_ACC', 'LOCUSLINK_ID', 'GENE_SYMBOL', 'GENE_NAME', 'UNIGENE_ID', 'ENSEMBL_ID', 'TIGR_ID', 'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION', 'GO_ID', 'SEQUENCE', 'SPOT_ID.1']
{'ID': ['1', '2', '3', '4', '5'], 'COL': ['192', '192', '192', '192', '192'], 'ROW': [328.0, 326.0, 324.0, 322.0, 320.0], 'NAME': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872'], 'SPOT_ID': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872'], 'CONTROL_TYPE': ['pos', 'pos', 'pos', 'FALSE', 'FALSE'], 'REFSEQ': [nan, nan, nan, nan, 'NM_001105533'], 'GB_ACC': [nan, nan, nan, nan, 'NM_001105533'], 'LOCUSLINK_ID': [nan, nan, nan, nan, 79974.0], 'GENE_SYMBOL': [nan, nan, nan, nan, 'CPED1'], 'GENE_NAME': [nan, nan, nan, nan, 'cadherin-like and PC-esterase domain containing 1'], 'UNIGENE_ID': [nan, nan, nan, nan, 'Hs.18

### Step 6: Gene Identifier Mapping

In [7]:
# Column choice, from the annotation preview: 'ID' holds the identifiers that
# match the expression-matrix index, and 'GENE_SYMBOL' holds the symbols to map to.
probe_column, symbol_column = 'ID', 'GENE_SYMBOL'

# Build the probe -> gene-symbol mapping table.
gene_mapping = get_gene_mapping(gene_annotation, prob_col=probe_column, gene_col=symbol_column)
print(f"Gene mapping dataframe shape: {gene_mapping.shape}")
print("Preview of gene mapping dataframe:")
print(preview_df(gene_mapping))

# Convert probe-level measurements to gene-level expression data.
# NOTE(review): `gene_data` is rebound from the probe-level frame to the
# gene-level frame here, so re-running this cell without re-running the
# extraction step would feed already-mapped data back into the mapping.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
print(f"Gene expression data after mapping: {gene_data.shape}")
print("Preview of first few genes and samples:", gene_data.iloc[:5, :5], sep="\n")

# Persist the mapped gene data, creating the target folder on demand.
gene_out_dir = os.path.dirname(out_gene_data_file)
os.makedirs(gene_out_dir, exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene data saved to {out_gene_data_file}")


Gene mapping dataframe shape: (51544, 2)
Preview of gene mapping dataframe:
{'ID': ['5', '6', '7', '8', '12'], 'Gene': ['CPED1', 'BCOR', 'CHAC2', 'IFI30', 'GPR146']}


Gene expression data after mapping: (29222, 35)
Preview of first few genes and samples:
          GSM7488455  GSM7488456  GSM7488457  GSM7488458  GSM7488459
Gene                                                                
A1BG       10.291088    9.870861   10.300388   10.118222   10.098425
A1BG-AS1    9.028480    9.984966    9.106843    8.812607    8.901764
A1CF       10.694007   11.124004   10.736997   10.833313   11.050916
A1CF-2      5.352157    5.489374    5.252186    5.304524    5.029353
A1CF-3      5.197353    6.534139    5.898741    5.567670    6.118168


Gene data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE235003.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data using the normalize_gene_symbols_in_index function.
#    `gene_data` is rebound to the normalized frame; the shapes printed before and
#    after show how many rows the normalization leaves.
print(f"Gene data shape before normalization: {gene_data.shape}")
gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {gene_data.shape}")

# Save the normalized gene expression data. This writes to the same path the
# mapping step wrote to, so the earlier (un-normalized) file is overwritten.
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Load the previously saved clinical data, link it to the gene data, and clean it.
#    Any failure in this block falls through to the gene-only fallback in `except`.
try:
    # NOTE(review): this rebinds `clinical_data` from the raw sample-characteristics
    # frame loaded in Step 1 to the selected-features CSV; re-running Step 2 after
    # this cell would then operate on the wrong frame.
    clinical_data = pd.read_csv(out_clinical_data_file, index_col=0)
    print(f"Loaded clinical data with shape: {clinical_data.shape}")
    
    # The saved clinical frame is indexed by feature name (the trait row) with one
    # column per sample. No transpose is done here; the frame is handed to the
    # linking helper as-is.
    clinical_df = clinical_data.copy()
    print(f"Clinical data index: {clinical_df.index}")
    
    # The trait is assumed present because Step 2 selected and saved it; the
    # loaded file's contents are not re-checked here.
    trait_available = True
    print(f"Trait information available in clinical data")
        
    # Link clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(clinical_df, gene_data)
    print(f"Linked data shape: {linked_data.shape}")
    
    # 3. Handle missing values, keyed on the trait column name
    #    (`trait` is "Prostate_Cancer", as defined in the context variables).
    linked_data_clean = handle_missing_values(linked_data, trait)
    print(f"Cleaned linked data shape: {linked_data_clean.shape}")
    
    # 4. Determine if the trait and demographic features are biased
    is_biased, linked_data_clean = judge_and_remove_biased_features(linked_data_clean, trait)
except Exception as e:
    # Fallback: report the error, mark the trait as unavailable and the cohort as
    # biased, and pass the transposed gene data alone to the final validation.
    print(f"Error loading or processing clinical data: {e}")
    trait_available = False
    linked_data = gene_data.T
    linked_data_clean = linked_data
    is_biased = True

# 5. Conduct final quality validation
# The note is stored with the cohort record; it documents what the binary trait means.
note = "Dataset contains gene expression data from prostate cancer cell lines with different treatments. The trait of interest is whether the cell was treated or not (Parent=0, Treated=1)."

# NOTE(review): gene availability is passed as a literal True here rather than the
# `is_gene_available` flag set in Step 2 - confirm that is intended.
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=trait_available,
    is_biased=is_biased,
    df=linked_data_clean,
    note=note
)

# 6. Save the linked data only if the validation deemed the cohort usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data_clean.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset deemed not usable due to quality issues - linked data not saved")

Gene data shape before normalization: (29222, 35)
Gene data shape after normalization: (20778, 35)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE235003.csv
Loaded clinical data with shape: (1, 35)
Clinical data index: Index(['Prostate_Cancer'], dtype='object')
Trait information available in clinical data
Linked data shape: (35, 20779)


Cleaned linked data shape: (35, 20779)
For the feature 'Prostate_Cancer', the least common label is '0.0' with 6 occurrences. This represents 17.14% of the dataset.
The distribution of the feature 'Prostate_Cancer' in this dataset is fine.



Linked data saved to ../../output/preprocess/Prostate_Cancer/GSE235003.csv
