In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Type_2_Diabetes"
cohort = "GSE281144"

# Input paths
in_trait_dir = "../../input/GEO/Type_2_Diabetes"
in_cohort_dir = "../../input/GEO/Type_2_Diabetes/GSE281144"

# Output paths
out_data_file = "../../output/preprocess/Type_2_Diabetes/GSE281144.csv"
out_gene_data_file = "../../output/preprocess/Type_2_Diabetes/gene_data/GSE281144.csv"
out_clinical_data_file = "../../output/preprocess/Type_2_Diabetes/clinical_data/GSE281144.csv"
json_path = "../../output/preprocess/Type_2_Diabetes/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Gut Adaptation After Gastric Bypass in Humans Reveals Metabolically Significant Shift in Fuel Metabolism"
!Series_summary	"Objective: Roux-en-Y gastric bypass surgery (RYGB) is among the most effective therapies for obesity and type 2 diabetes (T2D), and intestinal adaptation is a proposed mechanism for these effects. We hypothesized that intestinal adaptation precedes and relates to metabolic improvement in humans after RYGB."
!Series_summary	"Methods: This was a prospective, longitudinal first-in-human study of gene expression (GE) in Roux limb (RL) collected surgically/endoscopically from 19 patients with and without diabetes. GE was determined by microarray across 6 postoperative months, including at an early postoperative (1 month  15 days) timepoint."
!Series_summary	"Results: RL GE demonstrated tissue remodeling and metabolic reprogramming, including increased glucose and amino acid utilization. RL GE signatures were established early, bef

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# Based on the background information, this dataset contains gene expression data
# "GE was determined by microarray across 6 postoperative months"
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability

# Trait (Type 2 Diabetes) is available in row 1
trait_row = 1

# Age is not available in the sample characteristics dictionary
age_row = None

# Gender is available in row 0
gender_row = 0

# 2.2 Data Type Conversion

def convert_trait(value):
    """Convert diabetes status to binary: 1 for Diabetic, 0 for Control (non-diabetic)"""
    if pd.isna(value) or value is None:
        return None
    
    # Extract the value after colon if exists
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    if 'diabetic' in value.lower():
        return 1
    elif 'control' in value.lower():
        return 0
    else:
        return None

def convert_age(value):
    """Convert age to continuous variable, but since age is not available, this function won't be used"""
    return None

def convert_gender(value):
    """Convert gender to binary: 0 for Female, 1 for Male"""
    if pd.isna(value) or value is None:
        return None
    
    # Extract the value after colon if exists
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    if 'female' in value.lower():
        return 0
    elif 'male' in value.lower():
        return 1
    else:
        return None

# 3. Save Metadata
# Check if trait data is available (trait_row is not None)
is_trait_available = trait_row is not None

# Conduct initial filtering and save info
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# Skip the clinical feature extraction for now due to missing data structure
print("Completed initial dataset validation.")
print(f"Gene expression available: {is_gene_available}")
print(f"Trait data available: {is_trait_available}")


Completed initial dataset validation.
Gene expression available: True
Trait data available: True


### Step 3: Gene Data Extraction

In [4]:
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Set gene availability flag
is_gene_available = True  # Initially assume gene data is available

# First check if the matrix file contains the expected marker
found_marker = False
marker_row = None
try:
    with gzip.open(matrix_file, 'rt') as file:
        for i, line in enumerate(file):
            if "!series_matrix_table_begin" in line:
                found_marker = True
                marker_row = i
                print(f"Found the matrix table marker at line {i}")
                break
    
    if not found_marker:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False
        
    # If marker was found, try to extract gene data
    if is_gene_available:
        try:
            # Try using the library function
            gene_data = get_genetic_data(matrix_file)
            
            if gene_data.shape[0] == 0:
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
            else:
                print(f"Gene data shape: {gene_data.shape}")
                # Print the first 20 gene/probe identifiers
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
        except Exception as e:
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False
    
    # If gene data extraction failed, examine file content to diagnose
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            with gzip.open(matrix_file, 'rt') as file:
                # Print lines around the marker if found
                if marker_row is not None:
                    for i, line in enumerate(file):
                        if i >= marker_row - 2 and i <= marker_row + 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        if i > marker_row + 10:
                            break
                else:
                    # If marker not found, print first 10 lines
                    for i, line in enumerate(file):
                        if i < 10:
                            print(f"Line {i}: {line.strip()[:100]}...")
                        else:
                            break
        except Exception as e2:
            print(f"Error examining file: {e2}")
        
except Exception as e:
    print(f"Error processing file: {e}")
    is_gene_available = False

# Update validation information if gene data extraction failed
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    # Update the validation record since gene data isn't available
    is_trait_available = False  # We already determined trait data isn't available in step 2
    validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path,
                                 is_gene_available=is_gene_available, is_trait_available=is_trait_available)


SOFT file: ../../input/GEO/Type_2_Diabetes/GSE281144/GSE281144_family.soft.gz
Matrix file: ../../input/GEO/Type_2_Diabetes/GSE281144/GSE281144_series_matrix.txt.gz
Found the matrix table marker at line 61


Gene data shape: (70523, 34)
First 20 gene/probe identifiers:
['2824546_st', '2824549_st', '2824551_st', '2824554_st', '2827992_st', '2827995_st', '2827996_st', '2828010_st', '2828012_st', '2835442_st', '2835447_st', '2835453_st', '2835456_st', '2835459_st', '2835461_st', '2839509_st', '2839511_st', '2839513_st', '2839515_st', '2839517_st']


### Step 4: Gene Identifier Review

In [5]:
# Examining the gene identifiers in the gene expression data
# The identifiers (e.g., '2824546_st') appear to be probe IDs from a microarray platform
# rather than standard human gene symbols (like 'BRCA1', 'TP53', etc.)
# These probe IDs will need to be mapped to human gene symbols for downstream analysis

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=3))

# Examine the columns to find gene information
print("\nExamining gene mapping columns:")
print("Column 'ID' examples (probe identifiers):")
id_samples = gene_annotation['ID'].head(5).tolist()
for i, sample in enumerate(id_samples):
    print(f"Example {i+1}: {sample}")

# Look at gene_assignment column which contains gene symbols
print("\nColumn 'gene_assignment' examples (contains gene information):")
if 'gene_assignment' in gene_annotation.columns:
    # Display a few examples of the gene_assignment column
    gene_assign_samples = gene_annotation['gene_assignment'].head(5).tolist()
    for i, sample in enumerate(gene_assign_samples):
        print(f"Example {i+1}: {sample[:100]}..." if len(str(sample)) > 100 else f"Example {i+1}: {sample}")
    
    # Extract some gene symbols from gene_assignment to verify
    print("\nExtracting gene symbols from 'gene_assignment' column:")
    gene_symbols = []
    for sample in gene_assign_samples:
        if pd.notna(sample) and '//' in str(sample):
            # The gene symbol is typically the second part after "//"
            parts = str(sample).split('//')
            if len(parts) >= 2:
                symbol = parts[1].strip()
                gene_symbols.append(symbol)
                print(f"From: {sample[:50]}... -> Symbol: {symbol}")
    
    # Identify the columns needed for gene mapping
    print("\nColumns identified for gene mapping:")
    print("- 'ID': Contains probe IDs (e.g., TC01000001.hg.1)")
    print("- 'gene_assignment': Contains gene information including symbols")
    print("\nThe gene information needs to be extracted from the 'gene_assignment' column for mapping.")
else:
    print("Error: 'gene_assignment' column not found in annotation data.")

# Check gene_assignment format and determine parsing strategy
print("\nAnalyzing gene_assignment format:")
if 'gene_assignment' in gene_annotation.columns:
    # Get a small sample of non-null entries
    samples = gene_annotation['gene_assignment'].dropna().head(3).tolist()
    
    # Display the pattern and proposed extraction strategy
    if samples:
        print("The gene_assignment column follows this pattern:")
        print(f"Example: {samples[0][:100]}..." if len(str(samples[0])) > 100 else f"Example: {samples[0]}")
        print("\nExtraction strategy:")
        print("1. Split by '//' delimiter")
        print("2. The second part after splitting typically contains the gene symbol")
        print("3. For cases with multiple genes, we'll need to handle each gene separately")
        
        # Test extraction with a sample
        if '//' in str(samples[0]):
            parts = str(samples[0]).split('//')
            if len(parts) >= 2:
                example_symbol = parts[1].strip()
                print(f"\nExample extraction: '{example_symbol}'")



Gene annotation preview:
Columns in gene annotation: ['ID', 'probeset_id', 'seqname', 'strand', 'start', 'stop', 'total_probes', 'gene_assignment', 'mrna_assignment', 'swissprot', 'unigene', 'category', 'locus type', 'notes', 'SPOT_ID']
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1'], 'probeset_id': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+'], 'start': ['11869', '29554', '69091'], 'stop': ['14409', '31109', '70008'], 'total_probes': [49.0, 60.0, 30.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102', 'ENST00000408384 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000408384 // MIR1302-10 

### Step 6: Gene Identifier Mapping

In [7]:
# First, let's check the format of gene identifiers in the gene expression data
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Get gene expression data
gene_expr_data = get_genetic_data(matrix_file)
print("Gene expression data (first few probe IDs):")
print(gene_expr_data.index[:5].tolist())

# Get gene annotation data
gene_annotation = get_gene_annotation(soft_file)

# Examining both dataframes to identify the correct mapping columns
print("\nChecking gene annotation sample to match with expression data:")
id_column = 'ID'  # This is the column matching probe IDs in gene_expr_data
gene_column = 'gene_assignment'  # This column contains gene symbols that need extraction

# Create a mapping dataframe
# We need to extract the gene symbols from the gene_assignment column
# First, create a copy with just the ID and gene_assignment columns
mapping_df = gene_annotation[[id_column, gene_column]].copy()
mapping_df = mapping_df.dropna(subset=[gene_column])  # Drop rows with no gene information

print(f"\nMapping dataframe shape before processing: {mapping_df.shape}")

# Define a function to extract gene symbols from the gene_assignment column
def extract_genes_from_assignment(assignment):
    if pd.isna(assignment):
        return []
    
    genes = []
    # Split by '///' which separates different gene assignments for the same probe
    assignments = str(assignment).split('///')
    
    for single_assignment in assignments:
        # Split by '//' which separates parts of a single gene assignment
        parts = single_assignment.strip().split('//')
        if len(parts) >= 2:
            # The gene symbol is typically the second part
            gene_symbol = parts[1].strip()
            # Extract human gene symbols from the text
            symbols = extract_human_gene_symbols(gene_symbol)
            if symbols:
                genes.extend(symbols)
    
    # Return unique genes (in case of duplicates)
    return list(dict.fromkeys(genes))

# Apply the extraction function to get gene symbols
mapping_df['Gene'] = mapping_df[gene_column].apply(extract_genes_from_assignment)

# Display the mapping results
print("\nSample of mapping with extracted gene symbols:")
sample_mapping = mapping_df.head(5)
for idx, row in sample_mapping.iterrows():
    print(f"Probe ID: {row[id_column]}")
    print(f"Extracted Genes: {row['Gene']}")
    print("-" * 40)

# Apply gene mapping to convert from probe level to gene level expression
gene_data = apply_gene_mapping(gene_expr_data, mapping_df)

# Print information about the resulting gene expression data
print(f"\nGene expression data shape after mapping: {gene_data.shape}")
print(f"Number of unique genes: {len(gene_data.index)}")
print("Sample of gene symbols after mapping:")
print(gene_data.index[:10].tolist())

# Save the gene expression data to a file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"\nGene expression data saved to: {out_gene_data_file}")

Gene expression data (first few probe IDs):
['2824546_st', '2824549_st', '2824551_st', '2824554_st', '2827992_st']



Checking gene annotation sample to match with expression data:

Mapping dataframe shape before processing: (70753, 2)



Sample of mapping with extracted gene symbols:
Probe ID: TC01000001.hg.1
Extracted Genes: ['DDX11L1', 'DDX11L5']
----------------------------------------
Probe ID: TC01000002.hg.1
Extracted Genes: ['MIR1302-11', 'MIR1302-10', 'MIR1302-9', 'MIR1302-2', 'RP11-34P13']
----------------------------------------
Probe ID: TC01000003.hg.1
Extracted Genes: ['OR4F5']
----------------------------------------
Probe ID: TC01000004.hg.1
Extracted Genes: ['RP11-34P13']
----------------------------------------
Probe ID: TC01000005.hg.1
Extracted Genes: ['RP4-669L17']
----------------------------------------

Gene expression data shape after mapping: (0, 35)
Number of unique genes: 0
Sample of gene symbols after mapping:
[]

Gene expression data saved to: ../../output/preprocess/Type_2_Diabetes/gene_data/GSE281144.csv
