In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Multiple_Endocrine_Neoplasia_Type_2"
cohort = "GSE19987"

# Input paths
in_trait_dir = "../../input/GEO/Multiple_Endocrine_Neoplasia_Type_2"
in_cohort_dir = "../../input/GEO/Multiple_Endocrine_Neoplasia_Type_2/GSE19987"

# Output paths
out_data_file = "../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/GSE19987.csv"
out_gene_data_file = "../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/gene_data/GSE19987.csv"
out_clinical_data_file = "../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/clinical_data/GSE19987.csv"
json_path = "../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
# 1. Check what files are actually in the directory
import os
print("Files in the directory:")
files = os.listdir(in_cohort_dir)
print(files)

# 2. Find appropriate files with more flexible pattern matching
soft_file = None
matrix_file = None

for file in files:
    file_path = os.path.join(in_cohort_dir, file)
    # Look for files that might contain SOFT or matrix data with various possible extensions
    if 'soft' in file.lower() or 'family' in file.lower() or file.endswith('.soft.gz'):
        soft_file = file_path
    if 'matrix' in file.lower() or file.endswith('.txt.gz') or file.endswith('.tsv.gz'):
        matrix_file = file_path

if not soft_file:
    print("Warning: Could not find a SOFT file. Using the first .gz file as fallback.")
    gz_files = [f for f in files if f.endswith('.gz')]
    if gz_files:
        soft_file = os.path.join(in_cohort_dir, gz_files[0])

if not matrix_file:
    print("Warning: Could not find a matrix file. Using the second .gz file as fallback if available.")
    gz_files = [f for f in files if f.endswith('.gz')]
    if len(gz_files) > 1 and soft_file != os.path.join(in_cohort_dir, gz_files[1]):
        matrix_file = os.path.join(in_cohort_dir, gz_files[1])
    elif len(gz_files) == 1 and not soft_file:
        matrix_file = os.path.join(in_cohort_dir, gz_files[0])

print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# 3. Read files if found
if soft_file and matrix_file:
    # Read the matrix file to obtain background information and sample characteristics data
    background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
    clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
    
    try:
        background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
        
        # Obtain the sample characteristics dictionary from the clinical dataframe
        sample_characteristics_dict = get_unique_values_by_row(clinical_data)
        
        # Explicitly print out all the background information and the sample characteristics dictionary
        print("Background Information:")
        print(background_info)
        print("Sample Characteristics Dictionary:")
        print(sample_characteristics_dict)
    except Exception as e:
        print(f"Error processing files: {e}")
        # Try swapping files if first attempt fails
        print("Trying to swap SOFT and matrix files...")
        temp = soft_file
        soft_file = matrix_file
        matrix_file = temp
        try:
            background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
            sample_characteristics_dict = get_unique_values_by_row(clinical_data)
            print("Background Information:")
            print(background_info)
            print("Sample Characteristics Dictionary:")
            print(sample_characteristics_dict)
        except Exception as e:
            print(f"Still error after swapping: {e}")
else:
    print("Could not find necessary files for processing.")


Files in the directory:
['GSE19987-GPL571_series_matrix.txt.gz', 'GSE19987-GPL96_series_matrix.txt.gz', 'GSE19987_family.soft.gz']
SOFT file: ../../input/GEO/Multiple_Endocrine_Neoplasia_Type_2/GSE19987/GSE19987_family.soft.gz
Matrix file: ../../input/GEO/Multiple_Endocrine_Neoplasia_Type_2/GSE19987/GSE19987-GPL96_series_matrix.txt.gz
Background Information:
!Series_title	"Germline Mutations in TMEM127 Confer Susceptibility to Pheochromocytoma"
!Series_summary	"Pheochromocytomas, catecholamine-secreting tumors of neural crest origin, are frequently hereditary. However, the molecular basis of the majority of these tumors is unknown. We identified the transmembrane-encoding gene TMEM127 on chromosome 2q11 as a new pheochromocytoma susceptibility gene. In a cohort of 103 samples, we detected truncating germline TMEM127 mutations in approximately 30% of familial tumors and about 3% of sporadic-appearing pheochromocytomas without a known genetic cause. The wild-type allele was consistently 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability Check
# This dataset mentions "expression profiling" in its overall design and it's associated with pheochromocytoma,
# which suggests it contains gene expression data
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# Analyzing sample characteristics dictionary for trait identification
trait_row = 1  # The genetic class row contains MEN2A and MEN2B which are related to Multiple Endocrine Neoplasia Type 2
age_row = None  # Age data is not explicitly available
gender_row = None  # Gender data is not explicitly available

# 2.2 Data Type Conversion Functions
def convert_trait(value):
    """Convert trait value to binary (0 for no trait, 1 for trait present)."""
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # MEN2A and MEN2B are both subtypes of Multiple Endocrine Neoplasia Type 2
    if value in ['MEN2A', 'MEN2B']:
        return 1  # Has the trait
    else:
        return 0  # Does not have the trait

# Age and gender conversion functions are defined even though we don't have these data
def convert_age(value):
    """Convert age value to continuous."""
    if value is None:
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    try:
        return float(value)
    except (ValueError, TypeError):
        return None

def convert_gender(value):
    """Convert gender value to binary (0 for female, 1 for male)."""
    if value is None:
        return None
    if ':' in value:
        value = value.split(':', 1)[1].strip().lower()
    if value in ['female', 'f']:
        return 0
    elif value in ['male', 'm']:
        return 1
    else:
        return None

# 3. Save Metadata - Conduct initial filtering
# Trait data is available because trait_row is not None
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Since trait_row is not None, we need to extract clinical features
if trait_row is not None:
    # Create a DataFrame from the sample characteristics dictionary provided in the output
    sample_characteristics = {
        0: ['tumor type: pheochromocytoma'], 
        1: ['genetic class: NF1', 'genetic class: VHL', 'genetic class: SPOR', 
            'genetic class: B_SDHB', 'genetic class: MEN2A', 'genetic class: MEN2B', 
            'genetic class: FP_TM'], 
        2: ['tumor location: unknown', 'tumor location: adrenal', 'tumor location: extraadrenal']
    }
    
    # Convert the dictionary to a proper DataFrame structure for geo_select_clinical_features
    clinical_data = pd.DataFrame.from_dict(sample_characteristics, orient='index')
    
    # Extract clinical features using the library function
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age if age_row is not None else None,
        gender_row=gender_row,
        convert_gender=convert_gender if gender_row is not None else None
    )
    
    # Preview the extracted clinical features
    print("Preview of extracted clinical features:")
    print(preview_df(selected_clinical_df))
    
    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    
    # Save the extracted clinical features to CSV
    selected_clinical_df.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of extracted clinical features:
{0: [0.0], 1: [0.0], 2: [0.0], 3: [0.0], 4: [1.0], 5: [1.0], 6: [0.0]}
Clinical data saved to ../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/clinical_data/GSE19987.csv


### Step 3: Gene Data Extraction

In [4]:
# 1. First get the path to the soft and matrix files
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Looking more carefully at the background information
# This is a SuperSeries which doesn't contain direct gene expression data
# Need to investigate the soft file to find the subseries
print("This appears to be a SuperSeries. Looking at the SOFT file to find potential subseries:")

# Open the SOFT file to try to identify subseries
with gzip.open(soft_file, 'rt') as f:
    subseries_lines = []
    for i, line in enumerate(f):
        if 'Series_relation' in line and 'SuperSeries of' in line:
            subseries_lines.append(line.strip())
        if i > 1000:  # Limit search to first 1000 lines
            break

# Display the subseries found
if subseries_lines:
    print("Found potential subseries references:")
    for line in subseries_lines:
        print(line)
else:
    print("No subseries references found in the first 1000 lines of the SOFT file.")

# Despite trying to extract gene data, we expect it might fail because this is a SuperSeries
try:
    gene_data = get_genetic_data(matrix_file)
    print("\nGene data extraction result:")
    print("Number of rows:", len(gene_data))
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")
    print("This confirms the dataset is a SuperSeries without direct gene expression data.")


This appears to be a SuperSeries. Looking at the SOFT file to find potential subseries:
No subseries references found in the first 1000 lines of the SOFT file.



Gene data extraction result:
Number of rows: 22277
First 20 gene/probe identifiers:
Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at',
       '1494_f_at', '1598_g_at', '160020_at', '1729_at', '1773_at', '177_at',
       '179_at', '1861_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Looking at the gene identifiers (e.g., '1007_s_at', '1053_at', etc.)
# These are Affymetrix probe IDs from a microarray chip, not human gene symbols.
# They will need to be mapped to standard gene symbols for analysis.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X69699 /FEATURE= /DEF

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify which columns in gene_annotation contain the desired identifiers
# Based on the gene expression data and annotation data:
# - 'ID' in gene_annotation matches the probe IDs in the gene expression data (e.g., '1007_s_at')
# - 'Gene Symbol' contains the corresponding gene symbols (e.g., 'DDR1 /// MIR4640')

# 2. Get the gene mapping dataframe using the library function
gene_mapping = get_gene_mapping(
    annotation=gene_annotation,
    prob_col='ID',
    gene_col='Gene Symbol'
)

# Inspect the mapping data
print("Gene mapping dataframe preview:")
print(preview_df(gene_mapping))

# 3. Apply the gene mapping to convert probe-level data to gene-level data
gene_data = apply_gene_mapping(expression_df=gene_data, mapping_df=gene_mapping)

# Preview the gene expression data after mapping
print("\nMapped gene expression data preview:")
print(f"Number of genes: {len(gene_data)}")
print(f"Number of samples: {gene_data.shape[1]}")
if len(gene_data) > 0:
    print("First 5 gene symbols:")
    print(gene_data.index[:5])


Gene mapping dataframe preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'Gene': ['DDR1 /// MIR4640', 'RFC2', 'HSPA6', 'PAX8', 'GUCA1A']}



Mapped gene expression data preview:
Number of genes: 13830
Number of samples: 50
First 5 gene symbols:
Index(['A1CF', 'A2M', 'A4GALT', 'A4GNT', 'AAAS'], dtype='object', name='Gene')


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Shape of gene data after normalization: {normalized_gene_data.shape}")

# Save the normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Saved normalized gene data to {out_gene_data_file}")

# 2. Load the clinical data that was already processed in step 2
try:
    # Load clinical data and examine structure
    clinical_df = pd.read_csv(out_clinical_data_file)
    print("Clinical data loaded from previous step")
    print(f"Clinical data shape: {clinical_df.shape}")
    print(f"Clinical data columns: {clinical_df.columns}")
    
    # Convert clinical_df to appropriate format if needed (it might be in wide format)
    if trait not in clinical_df.columns:
        # Reshape clinical_df to have trait as a column
        clinical_df = clinical_df.T  # Convert rows to columns if needed
        clinical_df.columns = [trait]  # Name the only column as the trait
        print(f"Transposed clinical data shape: {clinical_df.shape}")
except FileNotFoundError:
    print("Warning: Clinical data file not found. Will attempt to recreate it.")
    # Re-extract clinical features using the functions from step 2
    soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
    background_info, clinical_data = get_background_and_clinical_data(matrix_file)
    
    # Redefine trait row and conversion function to handle MEN2 data
    def convert_trait(value):
        """Convert trait value to binary (0 for no trait, 1 for trait present)."""
        if ':' in value:
            value = value.split(':', 1)[1].strip()
        
        # MEN2A and MEN2B are both subtypes of Multiple Endocrine Neoplasia Type 2
        if value in ['MEN2A', 'MEN2B']:
            return 1  # Has the trait
        else:
            return 0  # Does not have the trait
    
    clinical_df = geo_select_clinical_features(
        clinical_data,
        trait=trait,
        trait_row=1,  # Row for genetic class which includes MEN2A and MEN2B
        convert_trait=convert_trait
    )
    # Transpose to get samples as rows
    clinical_df = clinical_df.T
    clinical_df.columns = [trait]

# Debug sample IDs
print("Clinical data index:", clinical_df.index[:5], "...")
print("Gene data index (transposed):", normalized_gene_data.T.index[:5], "...")

# 3. Link clinical and genetic data using the provided function
try:
    linked_data = geo_link_clinical_genetic_data(clinical_df, normalized_gene_data)
    print(f"Shape of linked data: {linked_data.shape}")
    
    # 4. Handle missing values in the linked data
    linked_data_cleaned = handle_missing_values(linked_data, trait)
    print(f"Shape of linked data after handling missing values: {linked_data_cleaned.shape}")
    
    # 5. Check if the trait and demographic features are biased
    is_trait_biased, unbiased_linked_data = judge_and_remove_biased_features(linked_data_cleaned, trait)
    
    # 6. Validate the dataset and save cohort information
    note = "Dataset contains gene expression data from human pheochromocytoma samples with Multiple Endocrine Neoplasia Type 2."
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_trait_biased,
        df=unbiased_linked_data,
        note=note
    )
    
    # 7. Save the linked data if it's usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        unbiased_linked_data.to_csv(out_data_file)
        print(f"Saved processed linked data to {out_data_file}")
    else:
        print("Dataset validation failed. Final linked data not saved.")
        
except Exception as e:
    print(f"Error in processing data: {str(e)}")
    import traceback
    traceback.print_exc()
    
    # Create a minimal DataFrame for validation
    empty_df = pd.DataFrame(columns=[trait])
    
    # Update the cohort info with failure status
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=True,  # Consider it biased since we can't properly analyze
        df=empty_df,
        note="Failed to link clinical and genetic data. Gene expression data is available but integration failed."
    )

Shape of gene data after normalization: (13542, 50)


Saved normalized gene data to ../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/gene_data/GSE19987.csv
Clinical data loaded from previous step
Clinical data shape: (1, 7)
Clinical data columns: Index(['0', '1', '2', '3', '4', '5', '6'], dtype='object')
Transposed clinical data shape: (7, 1)
Clinical data index: Index(['0', '1', '2', '3', '4'], dtype='object') ...
Gene data index (transposed): Index(['GSM499539', 'GSM499540', 'GSM499541', 'GSM499542', 'GSM499543'], dtype='object') ...
Shape of linked data: (51, 13549)
Error in processing data: ['Multiple_Endocrine_Neoplasia_Type_2']
Abnormality detected in the cohort: GSE19987. Preprocessing failed.
A new JSON file was created at: ../../output/preprocess/Multiple_Endocrine_Neoplasia_Type_2/cohort_info.json


Traceback (most recent call last):
  File "/tmp/ipykernel_42331/3234195504.py", line 62, in <module>
    linked_data_cleaned = handle_missing_values(linked_data, trait)
  File "/media/techt/DATA/GenoAgent/tools/preprocess.py", line 430, in handle_missing_values
    df = df.dropna(subset=[trait_col])
  File "/home/techt/anaconda3/envs/agent/lib/python3.10/site-packages/pandas/core/frame.py", line 6670, in dropna
    raise KeyError(np.array(subset)[check].tolist())
KeyError: ['Multiple_Endocrine_Neoplasia_Type_2']
