In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Which trait / GEO series this notebook preprocesses
trait, cohort = "Type_2_Diabetes", "GSE98887"

# Input locations: the trait folder and, inside it, the cohort folder
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output locations: linked data, gene data, clinical data, and the shared cohort-info JSON
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Resolve the SOFT file and the series-matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes that select series-level background lines vs. per-sample clinical lines
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']

# Read the matrix file once to obtain both the background text and the clinical table
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table as a per-row dictionary of unique values
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show everything needed to judge trait/age/gender availability in the next step
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"BACH2 inhibition reverses β-cell failure in type 2 diabetes models"
!Series_summary	"Type 2 diabetes is associated with defective insulin secretion and reduced β-cell mass. Available treatments provide a temporary reprieve, but secondary failure rates are high, making insulin supplementation necessary. Reversibility of b-cell failure is a key translational question. Here, we reverse-engineered and interrogated pancreatic islet-specific regulatory networks to discover T2D-specific subpopulations characterized by metabolic-inflexibility and endocrine-progenitor/stem cell features. Single-cell gain- and loss-of-function and glucose-induced Ca++ flux analyses of top candidate MR in islet cells validated transcription factor BACH2 and associated epigenetic effectors as a key driver of T2D cell states. BACH2 knockout in T2D islets reversed cellular features of the disease, restoring a non-diabetic phenotype. BACH2-immunoreactive islet cells increased ~4

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import numpy as np
import os
import json
from typing import Optional, Callable, Dict, Any

# 1. Gene Expression Data Availability
# Preliminary judgment from the series description, which is noted as mentioning
# scRNA-Seq profiles from human islet cells, i.e. gene expression profiling.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
#
# The sample characteristics dictionary is very limited: it has no explicit field
# for the trait (Type_2_Diabetes), for age, or for gender. The background text shows
# the study is about type 2 diabetes, but no per-sample trait variable is recorded.
# All three row indices are therefore left unset.
trait_row = age_row = gender_row = None

# 2.2 Data Type Conversion
# Since none of the variables are available, we'll define conversion functions
# but they won't be used in this case

def convert_trait(value):
    """Convert a trait annotation to binary (0: control, 1: case).

    Args:
        value: Raw characteristics cell, typically ``"<field>: <label>"``. Anything
            that is not a string (``None``, NaN, numbers) is treated as missing.

    Returns:
        1 for a type 2 diabetes label, 0 for a control / negated-diabetes label,
        ``None`` when the label is missing or not recognised.
    """
    import re  # local import, same convention as convert_age

    # Missing cells can arrive as None or as a float NaN; neither supports `in`.
    if not isinstance(value, str):
        return None

    # Keep only the label part after the first colon, if there is one
    if ':' in value:
        value = value.split(':', 1)[1]
    value = value.strip().lower()

    # Negated labels ("non-T2D", "nondiabetic", "no diabetes") must be tested before
    # the positive keywords, otherwise the substring match would call them cases.
    if re.search(r'\b(?:non|no|not|without)[\s_-]*(?:diabet|t2d|type\s*2)', value):
        return 0

    # 'diabet' covers both "diabetes" and "diabetic"
    if 'diabet' in value or 't2d' in value or 'type 2' in value or 'type2' in value:
        return 1
    if 'control' in value or 'healthy' in value or 'normal' in value:
        return 0
    return None

def convert_age(value):
    """Convert an age annotation to a continuous numeric value.

    Args:
        value: Raw characteristics cell, typically ``"age: 45"``. Already-numeric
            values are accepted as well.

    Returns:
        The first number found in the label as a float (decimals preserved),
        or ``None`` when the value is missing (None / NaN) or contains no number.
    """
    import math
    import numbers
    import re

    if value is None or isinstance(value, bool):
        return None

    # Numeric cells (including NaN used as a missing marker) never reach the string path
    if isinstance(value, numbers.Real):
        return None if math.isnan(value) else float(value)
    if not isinstance(value, str):
        return None

    # Keep only the label part after the first colon, if there is one
    if ':' in value:
        value = value.split(':', 1)[1].strip()

    # First number in the text; the optional fractional part keeps "45.5" from becoming 45.0
    match = re.search(r'\d+(?:\.\d+)?', value)
    return float(match.group()) if match else None

def convert_gender(value):
    """Convert a gender annotation to binary (0: female, 1: male).

    Args:
        value: Raw characteristics cell, typically ``"gender: female"`` or ``"Sex: M"``.
            Anything that is not a string (``None``, NaN, numbers) is treated as missing.

    Returns:
        0 for female, 1 for male, ``None`` when missing or not recognised.
    """
    # Missing cells can arrive as None or as a float NaN; neither supports `in`.
    if not isinstance(value, str):
        return None

    # Keep only the label part after the first colon, if there is one
    if ':' in value:
        value = value.split(':', 1)[1]
    # Strip unconditionally so padded single-letter codes (" F ") still match
    value = value.strip().lower()

    # 'female' must be tested first because it contains 'male'
    if 'female' in value or value == 'f':
        return 0
    if 'male' in value or value == 'm':
        return 1
    return None

# 3. Save Metadata
# The trait is considered available only if a characteristics row was identified for it
is_trait_available = not (trait_row is None)

# Record the initial (non-final) filtering result for this cohort.
# Left as a bare expression so the notebook displays the returned value.
validate_and_save_cohort_info(
    cohort=cohort,
    info_path=json_path,
    is_final=False,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction
# Skip this step as trait_row is None, indicating clinical data is not available in the format we need


False

### Step 3: Gene Data Extraction

In [4]:
# gzip is imported explicitly here; previously this cell only worked if the name
# happened to leak in through `from tools.preprocess import *`.
import gzip


def _find_matrix_table_marker(path, marker="!series_matrix_table_begin"):
    """Return the 0-based index of the first line containing `marker`, or None if absent."""
    with gzip.open(path, 'rt') as handle:
        for line_no, line in enumerate(handle):
            if marker in line:
                return line_no
    return None


def _print_matrix_context(path, marker_row, before=2, after=10, head=10, width=100):
    """Print lines around `marker_row`, or the first `head` lines when it is None.

    Each line is stripped and cut to `width` characters; an ellipsis is always appended.
    """
    if marker_row is None:
        first, last = 0, head - 1
    else:
        first, last = marker_row - before, marker_row + after
    with gzip.open(path, 'rt') as handle:
        for line_no, line in enumerate(handle):
            if line_no > last:
                break
            if line_no >= first:
                print(f"Line {line_no}: {line.strip()[:width]}...")


# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"SOFT file: {soft_file}")
print(f"Matrix file: {matrix_file}")

# Start optimistic; any failure below flips this flag
is_gene_available = True
found_marker = False
marker_row = None

try:
    # 2. The expression table must be introduced by the series-matrix begin marker
    marker_row = _find_matrix_table_marker(matrix_file)
    found_marker = marker_row is not None

    if found_marker:
        print(f"Found the matrix table marker at line {marker_row}")
        try:
            # 3. Delegate the actual parsing to the library helper
            gene_data = get_genetic_data(matrix_file)

            if gene_data.shape[0] == 0:
                # A begin marker followed by no data rows yields an empty table
                print("Warning: Extracted gene data has 0 rows.")
                is_gene_available = False
            else:
                print(f"Gene data shape: {gene_data.shape}")
                print("First 20 gene/probe identifiers:")
                print(gene_data.index[:20].tolist())
        except Exception as e:
            # Best-effort: report and mark the cohort as lacking gene data
            print(f"Error extracting gene data with get_genetic_data(): {e}")
            is_gene_available = False
    else:
        print("Warning: Could not find '!series_matrix_table_begin' marker in the file.")
        is_gene_available = False

    # 4. On failure, show the relevant part of the file so the cause can be diagnosed
    if not is_gene_available:
        print("Examining file content to diagnose the issue:")
        try:
            _print_matrix_context(matrix_file, marker_row)
        except Exception as e2:
            print(f"Error examining file: {e2}")

except Exception as e:
    # Reached when the matrix file itself cannot be opened/read during the marker scan
    print(f"Error processing file: {e}")
    is_gene_available = False

# 5. Update the cohort record if gene data extraction failed
if not is_gene_available:
    print("Gene expression data could not be successfully extracted from this dataset.")
    # Trait data was already judged unavailable in step 2
    is_trait_available = False
    validate_and_save_cohort_info(
        is_final=False,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=is_gene_available,
        is_trait_available=is_trait_available,
    )

SOFT file: ../../input/GEO/Type_2_Diabetes/GSE98887/GSE98887_family.soft.gz
Matrix file: ../../input/GEO/Type_2_Diabetes/GSE98887/GSE98887_series_matrix.txt.gz
Found the matrix table marker at line 68


Examining file content to diagnose the issue:
Line 66: !Sample_relation	"SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX2805708"	"SRA: https://www.ncbi.nlm....
Line 67: !Sample_supplementary_file_1	"NONE"	"NONE"	"NONE"	"NONE"	"NONE"	"NONE"	"NONE"	"NONE"	"NONE"	"NONE"	"...
Line 68: !series_matrix_table_begin...
Line 69: "ID_REF"	"GSM2617196"	"GSM2617197"	"GSM2617198"	"GSM2617199"	"GSM2617200"	"GSM2617201"	"GSM2617202"	...
Line 70: !series_matrix_table_end...
Gene expression data could not be successfully extracted from this dataset.
