In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Ovarian_Cancer"
cohort = "GSE132342"

# Input paths
in_trait_dir = "../../input/GEO/Ovarian_Cancer"
in_cohort_dir = "../../input/GEO/Ovarian_Cancer/GSE132342"

# Output paths
out_data_file = "../../output/preprocess/Ovarian_Cancer/GSE132342.csv"
out_gene_data_file = "../../output/preprocess/Ovarian_Cancer/gene_data/GSE132342.csv"
out_clinical_data_file = "../../output/preprocess/Ovarian_Cancer/clinical_data/GSE132342.csv"
json_path = "../../output/preprocess/Ovarian_Cancer/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
# 1. Check what files are actually in the directory
import os
print("Files in the directory:")
files = os.listdir(in_cohort_dir)
print(files)

# 2. Find appropriate files with more flexible pattern matching
soft_file = None
matrix_file = None

for file in files:
    file_path = os.path.join(in_cohort_dir, file)
    # Look for files that might contain SOFT or matrix data with various possible extensions
    if 'soft' in file.lower() or 'family' in file.lower() or file.endswith('.soft.gz'):
        soft_file = file_path
    if 'matrix' in file.lower() or file.endswith('.txt.gz') or file.endswith('.tsv.gz'):
        matrix_file = file_path

if not soft_file:
    print("Warning: Could not find a SOFT file. Using the first .gz file as fallback.")
    gz_files = [f for f in files if f.endswith('.gz')]
    if gz_files:
        soft_file = os.path.join(in_cohort_dir, gz_files[0])

if not matrix_file:
    print("Warning: Could not find a matrix file. Using the second .gz file as fallback if available.")
    gz_files = [f for f in files if f.endswith('.gz')]
    if len(gz_files) > 1 and soft_file != os.path.join(in_cohort_dir, gz_files[1]):
        matrix_file = os.path.join(in_cohort_dir, gz_files[1])
    elif len(gz_files) == 1 and not soft_file:
        matrix_file = os.path.join(in_cohort_dir, gz_files[0])

print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# 3. Read files if found
if soft_file and matrix_file:
    # Read the matrix file to obtain background information and sample characteristics data
    background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
    clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
    
    try:
        background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
        
        # Obtain the sample characteristics dictionary from the clinical dataframe
        sample_characteristics_dict = get_unique_values_by_row(clinical_data)
        
        # Explicitly print out all the background information and the sample characteristics dictionary
        print("Background Information:")
        print(background_info)
        print("Sample Characteristics Dictionary:")
        print(sample_characteristics_dict)
    except Exception as e:
        print(f"Error processing files: {e}")
        # Try swapping files if first attempt fails
        print("Trying to swap SOFT and matrix files...")
        temp = soft_file
        soft_file = matrix_file
        matrix_file = temp
        try:
            background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
            sample_characteristics_dict = get_unique_values_by_row(clinical_data)
            print("Background Information:")
            print(background_info)
            print("Sample Characteristics Dictionary:")
            print(sample_characteristics_dict)
        except Exception as e:
            print(f"Still error after swapping: {e}")
else:
    print("Could not find necessary files for processing.")


Files in the directory:
['GSE132342_family.soft.gz', 'GSE132342_series_matrix.txt.gz']
SOFT file: ../../input/GEO/Ovarian_Cancer/GSE132342/GSE132342_family.soft.gz
Matrix file: ../../input/GEO/Ovarian_Cancer/GSE132342/GSE132342_series_matrix.txt.gz
Background Information:
!Series_title	"A gene expression prognostic signature for overall survival in patients with high-grade serous ovarian cancer"
!Series_summary	"Expression of 276 genes was associated with OS at a false discovery rate (FDR) of < 0.05 in covariate-adjusted single gene analyses. The top five genes were TAP1, ZFHX4, CXCL9, FBN1, and PTGER3 (p ≪ 0.001). The best performing signature included 101 genes and for each SD difference in the gene expression score conferred a greater than two-fold increase in risk of death (HR = 2.35 [2.02, 2.71]; p ≪ 0.001). Median survival by quintile group was 9.5, 5.4, 3.8, 3.2 and 2.3 years."
!Series_overall_design	"Expression of 513 genes was measured in formalin-fixed paraffin-embedded (FFPE

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import json
import gzip
from typing import Optional, Callable, Dict, Any

# 1. Gene Expression Data Availability
# Looking at the series title and summary, this dataset contains gene expression data for high-grade serous ovarian cancer
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Trait (Ovarian Cancer) - The dataset is specifically for HGSOC patients
# From the sample characteristics, key 4 (status) indicates survival status which is related to our trait
trait_row = 4

# Age - Available in key 8, but in quartiles format
age_row = 8

# Gender - All patients are female as indicated in key 1
# Since everyone has the same value (constant), consider it as not available
gender_row = None

# 2.2 Data Type Conversion Functions
def convert_trait(value):
    """Convert survival status to binary format (0=alive, 1=dead)"""
    if value is None:
        return None
    if isinstance(value, str) and ":" in value:
        value = value.split(":", 1)[1].strip()
    
    if value == "0":
        return 0  # Alive
    elif value == "1":
        return 1  # Dead
    else:
        return None

def convert_age(value):
    """Convert age quartile information to ordinal values
    Note: These are not actual ages but age groups (quartiles)"""
    if value is None:
        return None
    if isinstance(value, str) and ":" in value:
        value = value.split(":", 1)[1].strip()
    
    quartile_mapping = {
        "q1": 1,  # Youngest quartile
        "q2": 2,
        "q3": 3,
        "q4": 4   # Oldest quartile
    }
    
    return quartile_mapping.get(value.lower(), None)

def convert_gender(value):
    """Convert gender to binary format (not used as all patients are female)"""
    # This function is not used as gender_row is None, but included for completeness
    if value is None:
        return None
    if isinstance(value, str) and ":" in value:
        value = value.split(":", 1)[1].strip()
    
    if value.lower() in ["female", "f"]:
        return 0
    elif value.lower() in ["male", "m"]:
        return 1
    else:
        return None

# 3. Save Metadata (Initial Filtering)
# trait_row is not None, indicating trait data is available
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction (if trait_row is not None)
if trait_row is not None:
    # Load the matrix file
    matrix_file = f"{in_cohort_dir}/GSE132342_series_matrix.txt.gz"
    
    # Create a DataFrame to store the clinical data
    sample_data = {}
    sample_ids = []
    
    # Parse the matrix file to extract sample characteristics
    with gzip.open(matrix_file, 'rt') as f:
        for line in f:
            if line.startswith('!Sample_geo_accession'):
                sample_ids = [s.strip() for s in line.strip().split('\t')[1:]]
                for sample_id in sample_ids:
                    sample_data[sample_id] = {}
            
            elif line.startswith('!Sample_characteristics_ch1'):
                values = [v.strip() for v in line.strip().split('\t')[1:]]
                # Identify which characteristic this is
                char_type = None
                for i, value in enumerate(values):
                    if i < len(sample_ids):
                        # Parse the characteristic type and value
                        if ":" in value:
                            char_type, char_value = value.split(":", 1)
                            char_type = char_type.strip()
                            char_value = char_value.strip()
                            
                            # Store in correct row based on our identified indices
                            if char_type.lower() == "status":
                                sample_data[sample_ids[i]][trait_row] = value
                            elif char_type.lower() == "age":
                                sample_data[sample_ids[i]][age_row] = value
                            # We don't extract gender as it's a constant
            
            # Stop parsing once we've reached the data section
            if line.startswith('!series_matrix_table_begin'):
                break
    
    # Convert the dictionary to a DataFrame
    clinical_data = pd.DataFrame.from_dict(sample_data, orient='index')
    
    # Extract clinical features using the function from the library
    clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=None  # Not used as gender_row is None
    )
    
    # Preview the extracted clinical data
    preview = preview_df(clinical_df)
    print("Clinical data preview:")
    print(preview)
    
    # Check if the clinical data is empty before saving
    if clinical_df.notna().any().any():
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        
        # Save the clinical data to CSV
        clinical_df.to_csv(out_clinical_data_file)
        print(f"Clinical data saved to {out_clinical_data_file}")
    else:
        print("Warning: Extracted clinical data contains only NaN values. File not saved.")


Clinical data preview:
{}


### Step 3: Gene Data Extraction

In [4]:
# 1. First get the path to the soft and matrix files
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Looking more carefully at the background information
# This is a SuperSeries which doesn't contain direct gene expression data
# Need to investigate the soft file to find the subseries
print("This appears to be a SuperSeries. Looking at the SOFT file to find potential subseries:")

# Open the SOFT file to try to identify subseries
with gzip.open(soft_file, 'rt') as f:
    subseries_lines = []
    for i, line in enumerate(f):
        if 'Series_relation' in line and 'SuperSeries of' in line:
            subseries_lines.append(line.strip())
        if i > 1000:  # Limit search to first 1000 lines
            break

# Display the subseries found
if subseries_lines:
    print("Found potential subseries references:")
    for line in subseries_lines:
        print(line)
else:
    print("No subseries references found in the first 1000 lines of the SOFT file.")

# Despite trying to extract gene data, we expect it might fail because this is a SuperSeries
try:
    gene_data = get_genetic_data(matrix_file)
    print("\nGene data extraction result:")
    print("Number of rows:", len(gene_data))
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")
    print("This confirms the dataset is a SuperSeries without direct gene expression data.")


This appears to be a SuperSeries. Looking at the SOFT file to find potential subseries:
No subseries references found in the first 1000 lines of the SOFT file.



Gene data extraction result:
Number of rows: 513
First 20 gene/probe identifiers:
Index(['AJ294735.1:15', 'ENST00000390559.1:246', 'NM_000038.3:6850',
       'NM_000051.3:1561', 'NM_000055.2:1445', 'NM_000059.3:115',
       'NM_000075.2:1055', 'NM_000077.4:673', 'NM_000089.3:2635',
       'NM_000090.3:180', 'NM_000093.3:6345', 'NM_000125.2:1595',
       'NM_000138.3:6420', 'NM_000149.3:340', 'NM_000166.5:165',
       'NM_000181.3:1899', 'NM_000194.1:240', 'NM_000222.1:5',
       'NM_000245.2:405', 'NM_000248.2:624'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Review gene identifiers to determine if they need mapping
# The identifiers appear to be in the format of: [transcript_id]:[position]
# For example: NM_000038.3:6850, NM_000051.3:1561, etc.
# These are RefSeq transcript IDs with positions, not standard gene symbols
# We would need to map these to official gene symbols

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['NM_138761.3:342', 'NM_015201.3:203', 'NM_138401.2:368', 'NM_001854.3:674', 'NM_012144.2:1692'], 'ORF': ['BAX', 'BOP1', 'MVB12A', 'COL11A1', 'DNAI1'], 'GB_ACC': ['NM_138761.3', 'NM_015201.3', 'NM_138401.2', 'NM_001854.3', 'NM_012144.2'], 'Target.Region': ['343-442', '204-303', '369-468', '675-774', '1693-1792'], 'Target.Sequence': ['TTTTTCCGAGTGGCAGCTGACATGTTTTCTGACGGCAACTTCAACTGGGGCCGGGTTGTCGCCCTTTTCTACTTTGCCAGCAAACTGGTGCTCAAGGCCC', 'ACCGGCAGCGATTCTGGCGTCTCCGACAGCGAGGAGAGTGTGTTCTCAGGCCTGGAAGATTCCGGCAGTGACAGCAGTGAGGATGATGACGAAGGCGACG', 'GGGAGCCACGGACACGGCTGTGTTTGATGTCCGGCTGAGTGGGAAGACCAAGACAGTGCCTGGATACCTTCGAATAGGGGACATGGGCGGCTTTGCCATC', 'TAATGAGCATGGTATTCAGCAAATTGGTGTTGAGGTTGGGAGATCACCTGTTTTTCTGTTTGAAGACCACACTGGAAAACCTGCCCCAGAAGACTATCCC', 'TGCCTTTGACTTCCACAAAGAGATTGACTACATGTTCCTAGTGGGCACAGAGGAGGGAAAAATCTACAAGTGCTCTAAATCCTACTCCAGCCAATTCCTC'], 'Customer.Identifier': ['BAX', 'BOP1', 'MVB12A', 'COL11A1', 'DNAI1']}


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify columns containing gene identifiers and gene symbols
# Based on the gene annotation preview:
# - 'ID' column contains the same kind of identifiers as in the gene expression data (e.g., NM_000038.3:6850)
# - 'ORF' column appears to contain the gene symbols (e.g., BAX, BOP1)

# 2. Extract these columns to create a mapping dataframe
prob_col = 'ID'
gene_col = 'ORF'

# Get the mapping dataframe using the library function
mapping_df = get_gene_mapping(gene_annotation, prob_col, gene_col)

print("Gene mapping preview:")
print(preview_df(mapping_df))

# 3. Apply the gene mapping to convert probe-level data to gene expression data
gene_data = apply_gene_mapping(gene_data, mapping_df)

print("Gene expression data after mapping:")
print(f"Number of genes: {len(gene_data)}")
print("Preview of first 10 genes and 5 samples:")
print(gene_data.iloc[:10, :5])

# Normalize gene symbols to ensure consistency
gene_data = normalize_gene_symbols_in_index(gene_data)
print("\nGene expression data after normalization:")
print(f"Number of genes after normalization: {len(gene_data)}")
print("Preview of first 10 genes and 5 samples:")
print(gene_data.iloc[:10, :5])


Gene mapping preview:
{'ID': ['NM_138761.3:342', 'NM_015201.3:203', 'NM_138401.2:368', 'NM_001854.3:674', 'NM_012144.2:1692'], 'Gene': ['BAX', 'BOP1', 'MVB12A', 'COL11A1', 'DNAI1']}


Gene expression data after mapping:
Number of genes: 509
Preview of first 10 genes and 5 samples:
          GSM3856606  GSM3856607  GSM3856608  GSM3856609  GSM3856610
Gene                                                                
AADAC      -4.566557   -6.818265   -5.356820   -5.167440   -5.867506
ABCA1      -3.445252   -2.185089   -2.501905   -1.023824   -1.845666
ABCB1      -2.973184   -5.553152   -5.270549   -2.882433   -5.607215
ABCC2      -5.644557   -7.057258   -7.259253   -4.602997   -7.105907
ABCC3      -1.249703   -2.327780   -1.969443   -1.484696   -1.577824
ABCC4      -2.486560   -4.686916   -4.053969   -4.489810   -4.160097
ABCC5      -2.249702   -3.948018   -2.493995   -3.097131   -2.964725
ABCE1      -2.611547   -3.330770   -2.784114   -2.785744   -2.882676
ADAM12     -7.077507   -3.977098   -6.147617   -4.188366   -4.170891
ADAMDEC1   -4.755591   -3.740383   -3.833181   -3.874987   -5.094491

Gene expression data after normalization:
Number of genes after normaliza

### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols using the NCBI Gene database synonym information
print("Normalizing gene symbols using NCBI synonym information...")
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Number of genes before normalization: {len(gene_data)}")
print(f"Number of genes after normalization: {len(normalized_gene_data)}")

# Save the normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 2. Since we determined in step 2 that there's no usable trait variation 
# (all samples are cancer cases with no controls), we'll create a clinical dataframe
# but note this limitation

# Create a clinical dataframe with the trait (Ovarian_Cancer)
sample_ids = normalized_gene_data.columns.tolist()
print(f"Sample IDs from gene data: {len(sample_ids)} samples")

# Create clinical dataframe, but note that all samples have the same trait value
clinical_df = pd.DataFrame(index=[trait], columns=sample_ids)
clinical_df.loc[trait] = 1  # All samples are ovarian cancer tumors

print(f"Clinical data shape: {clinical_df.shape}")

# Save the clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
clinical_df.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")

# 3. Link clinical and genetic data
linked_data = geo_link_clinical_genetic_data(clinical_df, normalized_gene_data)
print(f"Shape of linked data: {linked_data.shape}")

# 4. Handle missing values in the linked data
print("Handling missing values...")
linked_data_cleaned = handle_missing_values(linked_data, trait)
print(f"Shape of linked data after handling missing values: {linked_data_cleaned.shape}")

# 5. Check if the trait and demographic features are biased
print("Checking for bias in features...")
is_trait_biased, unbiased_linked_data = judge_and_remove_biased_features(linked_data_cleaned, trait)

# 6. Validate the dataset and save cohort information
note = "Dataset contains expression data for ovarian cancer tumors. All samples are tumor samples with no controls, so trait bias is expected and the dataset is not suitable for case-control analysis."
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True, 
    is_biased=is_trait_biased,
    df=unbiased_linked_data,
    note=note
)

# 7. Save the linked data if it's usable (though we expect it won't be due to trait bias)
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    unbiased_linked_data.to_csv(out_data_file)
    print(f"Saved processed linked data to {out_data_file}")
else:
    print("Dataset validation failed due to trait bias. Final linked data not saved.")

Normalizing gene symbols using NCBI synonym information...
Number of genes before normalization: 509
Number of genes after normalization: 509


Normalized gene expression data saved to ../../output/preprocess/Ovarian_Cancer/gene_data/GSE132342.csv
Sample IDs from gene data: 3769 samples
Clinical data shape: (1, 3769)
Clinical data saved to ../../output/preprocess/Ovarian_Cancer/clinical_data/GSE132342.csv
Shape of linked data: (3769, 510)
Handling missing values...


Shape of linked data after handling missing values: (3769, 510)
Checking for bias in features...
Quartiles for 'Ovarian_Cancer':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1
Max: 1
The distribution of the feature 'Ovarian_Cancer' in this dataset is severely biased.

Dataset validation failed due to trait bias. Final linked data not saved.


  df[gene_cols] = df[gene_cols].fillna(df[gene_cols].mean())
