In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed
trait = "Prostate_Cancer"
cohort = "GSE200879"

# Input locations, built from the trait/cohort identifiers above
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output locations: linked data, gene-level data, clinical data, and the
# JSON file holding per-cohort validation records
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Resolve the SOFT file and the series matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file, passing the line prefixes of interest:
#    series-level lines for background, per-sample lines for clinical data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each section under its heading so later steps can be decided from the output
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Transcriptomics biomarkers in prostate cancer"
!Series_summary	"Prostate cancer (PCa) is the number one cancer in men. It represents a challenge for its management due to its very high incidence but low risk of lethal cancer. Over-diagnosis and over-treatment are therefore two pitfalls. The PSA (Prostate Specific Antigen) assay used for early diagnosis and clinical or molecular prognostic factors are not sufficiently reliable to predict the evolution of the cancer and its lethal or non-lethal character. Although PCa is most often detected at a localised stage, there are almost 30% of metastatic or locally advanced forms for which treatments can slow down the evolution but cannot be curative."
!Series_summary	"With the use of high-throughput technological tools such as transcriptomics , it is becoming possible to define molecular signatures and identify predictive biomarkers of tumour aggressiveness . Here, we have analyzed 137 samples."
!Series_ov

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import numpy as np
import os
import json
from typing import Optional, Dict, Any, Callable
import re

# 1. Gene Expression Data Availability
# The series background describes a transcriptomics study, so the dataset is
# taken to contain gene expression data.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability

# Trait: row 0 of the sample characteristics distinguishes
# 'tissue: tumor' from 'tissue: normal prostate'.
trait_row = 0

# Age and gender: neither is present in the sample characteristics,
# so both row indices stay unset.
age_row = gender_row = None

# 2.2 Data Type Conversion Functions

def convert_trait(value):
    """Convert a tissue label to a binary trait value.

    Args:
        value: Raw sample-characteristics cell such as 'tissue: tumor' or
            'tissue: normal prostate'. Anything before the first colon is
            discarded; matching is case-insensitive.

    Returns:
        1 for tumor tissue, 0 for normal tissue, and None when the input is
        not a string or the label matches neither category.
    """
    if not isinstance(value, str):
        return None

    # Keep only the text after the first colon ('tissue: tumor' -> 'tumor');
    # a value without a colon is used as-is.
    label = value.split(':', 1)[-1].strip().lower()

    # Check 'normal' before 'tumor' so that labels like
    # 'normal adjacent to tumor' are not misread as tumor samples.
    if 'normal' in label:
        return 0
    # Accept both the American and the British spelling.
    if 'tumor' in label or 'tumour' in label:
        return 1
    return None

def convert_age(value):
    """Placeholder age converter: always returns None.

    No age row was identified for this cohort (age_row is None), so the
    input is ignored and no conversion is attempted.
    """
    return None

def convert_gender(value):
    """Placeholder gender converter: always returns None.

    No gender row was identified for this cohort (gender_row is None), so the
    input is ignored and no 0/1 encoding is produced.
    """
    return None

# 3. Save Metadata
# Trait data counts as available whenever a trait row was identified above.
is_trait_available = trait_row is not None

# Run the initial (is_final=False) cohort validation with the availability
# flags. This call is the last expression in the cell, so its return value is
# shown as the cell output.
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Deferred to Step 7: the clinical features are selected there from the
# in-memory `clinical_data` frame loaded in Step 1 (using trait_row and the
# convert_* functions defined above) and saved just before linking with the
# gene data. No separate clinical_data.csv input file is involved.


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
except Exception as e:
    # Report the failure, then re-raise. Every later step needs `gene_data`,
    # so continuing would only resurface as a NameError further down, or
    # silently reuse a stale `gene_data` left in the kernel.
    print(f"Error extracting gene data: {e}")
    raise

print(f"Gene data shape: {gene_data.shape}")

# 3. Print the first 20 row IDs (gene or probe identifiers)
print("First 20 gene/probe identifiers:")
print(gene_data.index[:20])


Matrix file found: ../../input/GEO/Prostate_Cancer/GSE200879/GSE200879_series_matrix.txt.gz
Gene data shape: (16202, 137)
First 20 gene/probe identifiers:
Index(['GSHG0000008', 'GSHG0000017', 'GSHG0000018', 'GSHG0000026',
       'GSHG0000027', 'GSHG0000029', 'GSHG0000033', 'GSHG0000035',
       'GSHG0000036', 'GSHG0000038', 'GSHG0000046', 'GSHG0000049',
       'GSHG0000052', 'GSHG0000053', 'GSHG0000055', 'GSHG0000056',
       'GSHG0000061', 'GSHG0000064', 'GSHG0000065', 'GSHG0000074'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Review of the row identifiers printed in Step 3:
# values such as GSHG0000008 are not standard human gene symbols (which look
# like "TP53", "BRCA1", "EGFR"); they appear to be identifiers specific to the
# platform used in this study. They therefore have to be mapped to standard
# gene symbols before gene-level analysis.
requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation dataframe to identify which columns hold the
#    identifiers and which hold the gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Limits for the raw SOFT-file inspection below
platform_scan_limit = 100   # number of leading lines searched for the platform ID
symbol_scan_floor = 1000    # keep scanning at least this far for symbol references
symbol_preview_count = 5    # number of matching lines shown

# NOTE(review): `gzip` is not imported explicitly in this notebook; it is
# presumably brought into scope by `from tools.preprocess import *` - confirm.

# 3. Look for the platform ID near the top of the SOFT file
print("\nSearching for platform information in SOFT file:")
platform_line = None
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        if i >= platform_scan_limit:
            break
        if '!Series_platform_id' in line:
            platform_line = line.strip()
            break
if platform_line is not None:
    print(platform_line)
else:
    # Reached both when the limit is hit and when the file is shorter than
    # the limit, so a negative result is always reported.
    print(f"Platform ID not found in first {platform_scan_limit} lines")

# 4. Check whether the SOFT file includes any reference to gene symbols
print("\nSearching for gene symbol information in SOFT file:")
gene_symbol_lines = []
with gzip.open(soft_file, 'rt') as f:
    for i, line in enumerate(f):
        # Case-insensitive test; it also covers 'GENE_SYMBOL' and 'gene_symbol'.
        if 'symbol' in line.lower():
            gene_symbol_lines.append(line.strip())
            # Only the first few matches are displayed, so stop once we have them.
            if len(gene_symbol_lines) >= symbol_preview_count:
                break
        # Past the floor, stop as soon as at least one match has been seen;
        # with no match yet, keep scanning to the end of the file.
        if i > symbol_scan_floor and gene_symbol_lines:
            break

if gene_symbol_lines:
    print("Found references to gene symbols:")
    for line in gene_symbol_lines[:symbol_preview_count]:
        print(line)
else:
    # The loop only stops early after a match, so reaching this branch means
    # the whole file was scanned without finding one.
    print("No explicit gene symbol references found in SOFT file")

# 5. Look for alternative annotation files or references in the cohort directory
print("\nChecking for additional annotation files in the directory:")
all_files = os.listdir(in_cohort_dir)
annotation_keywords = ('annotation', 'platform', 'gpl')
print([name for name in all_files if any(keyword in name.lower() for keyword in annotation_keywords)])



Gene annotation preview:
Columns in gene annotation: ['ID', 'SPOT_ID', 'Gene Symbol']
{'ID': ['GSHG0046248', 'GSHG0000008', 'GSHG0033762', 'GSHG0000017', 'GSHG0000018'], 'SPOT_ID': ['GSHG0046248', 'GSHG0000008', 'GSHG0033762', 'GSHG0000017', 'GSHG0000018'], 'Gene Symbol': ['---', '---', '---', 'ISG15', 'AGRN']}

Searching for platform information in SOFT file:
Platform ID not found in first 100 lines

Searching for gene symbol information in SOFT file:
Found references to gene symbols:
#Gene Symbol =
ID	SPOT_ID	Gene Symbol

Checking for additional annotation files in the directory:
[]


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Column roles in gene_annotation, as seen in the Step 5 preview:
#    'ID'          - identifiers of the same form as gene_data.index (e.g. GSHG0000008)
#    'Gene Symbol' - the corresponding gene symbol, or '---' where none is given

# 2. Build the mapping dataframe from those two columns
mapping_df = get_gene_mapping(gene_annotation, 'ID', 'Gene Symbol')
print(f"Mapping dataframe shape: {mapping_df.shape}")
print("Preview of mapping dataframe:")
print(preview_df(mapping_df, n=5))

# 3. Convert probe-level measurements to gene-level expression data.
# NOTE: this rebinds `gene_data`, so re-running this cell without first
# re-running Step 3 would apply the mapping to already-mapped data.
gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Gene-level expression data shape after mapping: {gene_data.shape}")
print("Preview of gene symbols after mapping:")
print([symbol for symbol in gene_data.index[:20]])  # first 20 gene symbols

# Persist the gene-level data, creating the output directory if needed
gene_out_dir = os.path.dirname(out_gene_data_file)
os.makedirs(gene_out_dir, exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Mapping dataframe shape: (16202, 2)
Preview of mapping dataframe:
{'ID': ['GSHG0046248', 'GSHG0000008', 'GSHG0033762', 'GSHG0000017', 'GSHG0000018'], 'Gene': ['---', '---', '---', 'ISG15', 'AGRN']}
Gene-level expression data shape after mapping: (14233, 137)
Preview of gene symbols after mapping:
['A2M', 'A2M-AS1', 'A4GALT', 'AAAS', 'AACS', 'AADAT', 'AAED1', 'AAGAB', 'AAK1', 'AAMDC', 'AAMP', 'AAR2', 'AARS', 'AARS2', 'AARSD1', 'AASDH', 'AASDHPPT', 'AASS', 'AATF', 'ABAT']


Gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE200879.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols
# Start from the gene-level matrix written to disk by the mapping step.
gene_data = pd.read_csv(out_gene_data_file, index_col=0)
print(f"Loaded gene data shape: {gene_data.shape}")

gene_data_normalized = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {gene_data_normalized.shape}")

# The normalized matrix overwrites the file saved by the mapping step.
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data_normalized.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Link the clinical and genetic data (only possible when a trait row exists)
if trait_row is None:
    print("No trait data available. Can't proceed with linking clinical and genetic data.")
    # Record the cohort as lacking trait data.
    # NOTE(review): is_biased=None is passed together with is_final=True -
    # confirm the helper accepts that combination.
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=False,
        is_biased=None,
        df=pd.DataFrame(),
        note="No trait data available in this dataset."
    )
    print("Dataset deemed not usable due to missing trait data.")
else:
    # Select the clinical features from the Step 1 clinical dataframe, using
    # the row indices and converters chosen in Step 2.
    clinical_kwargs = dict(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender,
    )
    selected_clinical_df = geo_select_clinical_features(**clinical_kwargs)

    # Persist the selected clinical features.
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")

    # Combine clinical features with the normalized expression matrix.
    linked_data = geo_link_clinical_genetic_data(selected_clinical_df, gene_data_normalized)
    print(f"Linked data shape: {linked_data.shape}")

    # 3. Handle missing values
    linked_data = handle_missing_values(linked_data, trait)
    print(f"Linked data shape after handling missing values: {linked_data.shape}")

    # 4. Judge whether the trait (and any demographic features) are biased;
    #    the helper also returns the possibly reduced dataframe.
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

    # 5. Final quality validation, recorded together with a free-text note
    note = "Dataset contains gene expression data from prostate cancer tumor and normal prostate samples. Successfully mapped probe IDs to gene symbols."
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_biased,
        df=linked_data,
        note=note
    )

    # 6. Save the linked data only when the cohort passed validation
    if not is_usable:
        print("Dataset deemed not usable - linked data not saved")
    else:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")

Loaded gene data shape: (14233, 137)
Gene data shape after normalization: (14119, 137)


Normalized gene expression data saved to ../../output/preprocess/Prostate_Cancer/gene_data/GSE200879.csv
Clinical data saved to ../../output/preprocess/Prostate_Cancer/clinical_data/GSE200879.csv
Linked data shape: (137, 14120)


Linked data shape after handling missing values: (137, 14120)
For the feature 'Prostate_Cancer', the least common label is '0.0' with 9 occurrences. This represents 6.57% of the dataset.
The distribution of the feature 'Prostate_Cancer' in this dataset is fine.



Linked data saved to ../../output/preprocess/Prostate_Cancer/GSE200879.csv
