In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Type_2_Diabetes"
cohort = "GSE182121"

# Input paths
in_trait_dir = "../../input/GEO/Type_2_Diabetes"
in_cohort_dir = "../../input/GEO/Type_2_Diabetes/GSE182121"

# Output paths
out_data_file = "../../output/preprocess/Type_2_Diabetes/GSE182121.csv"
out_gene_data_file = "../../output/preprocess/Type_2_Diabetes/gene_data/GSE182121.csv"
out_clinical_data_file = "../../output/preprocess/Type_2_Diabetes/clinical_data/GSE182121.csv"
json_path = "../../output/preprocess/Type_2_Diabetes/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Disrupted Circadian Oscillations in Type 2 Diabetes are Linked to Altered Rhythmic Mitochondrial Metabolism in Skeletal Muscle"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"Refer to individual Series"
Sample Characteristics Dictionary:
{0: ['disease: NGT', 'disease: T2D'], 1: ['age: 41', 'age: 69', 'age: 68', 'age: 57', 'age: 67', 'age: 60', 'age: 66', 'age: 44', 'age: 50', 'age: 52', 'age: 48', 'age: 65', 'age: 62', 'age: 63', 'age: 58', 'age: 53', 'age: 42', 'age: 46', 'age: 43'], 2: ['bmi: 24.08', 'bmi: 28.51', 'bmi: 25.46', 'bmi: 27.76', 'bmi: 23.64', 'bmi: 26.57', 'bmi: 32.76', 'bmi: 31.85', 'bmi: 29', 'bmi: 25.8', 'bmi: 24.71', 'bmi: 25.12', 'bmi: 26.89', 'bmi: 30.71', 'bmi: 26.46', 'bmi: 26.41', 'bmi: 23.74', 'bmi: 29.35', 'bmi: 24.92', 'bmi: 28.83', 'bmi: 27.78', 'bmi: 29.99', 'bmi: 26.96', 'bmi: 27.71', 'bmi: 32.24', 'bmi: 23.78', 'bmi: 27.77', 'bmi: 28.8', 'bmi: 25.63', 'bmi: 26.53

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Determine if gene expression data is available
# From the background information, it appears this might be a SuperSeries with microarray or gene expression data
# The title mentions "Metabolism in Skeletal Muscle" which suggests gene expression analysis
is_gene_available = True  # Gene expression data is likely available

# 2. Identify availability and conversion for clinical variables
# 2.1 Data Availability

# Trait (Type 2 Diabetes) data is in key 0 'disease: NGT' or 'disease: T2D'
trait_row = 0

# Age data is in key 1 
age_row = 1

# Gender data is not explicitly mentioned in the sample characteristics
gender_row = None  # No gender information available

# 2.2 Data Type Conversion

def convert_trait(value):
    """Convert disease status to binary (0 for NGT, 1 for T2D)"""
    if value is None:
        return None
    value = value.lower()
    if 'disease:' in value:
        value = value.split('disease:')[1].strip()
    
    if 't2d' in value or 'type 2 diabetes' in value:
        return 1
    elif 'ngt' in value or 'normal glucose tolerance' in value:
        return 0
    return None

def convert_age(value):
    """Convert age values to continuous numeric values"""
    if value is None:
        return None
    if 'age:' in value:
        value = value.split('age:')[1].strip()
    try:
        return float(value)
    except (ValueError, TypeError):
        return None

def convert_gender(value):
    """Convert gender values to binary (0 for female, 1 for male)"""
    # Not used in this dataset, but included for completeness
    if value is None:
        return None
    value = value.lower()
    if 'gender:' in value or 'sex:' in value:
        if ':' in value:
            value = value.split(':')[1].strip()
    
    if value in ['f', 'female']:
        return 0
    elif value in ['m', 'male']:
        return 1
    return None

# 3. Save metadata
# Determine if trait data is available (trait_row is not None)
is_trait_available = trait_row is not None

# Save initial filtering information
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Extract clinical features if trait_row is not None
# Assuming clinical_data is already defined in the environment from a previous step
if trait_row is not None and 'clinical_data' in globals():
    # Extract clinical features
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the extracted clinical features
    preview = preview_df(selected_clinical_df)
    print("Preview of extracted clinical features:")
    print(preview)
    
    # Save the extracted clinical features
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of extracted clinical features:
{'GSM5518937': [0.0, 41.0], 'GSM5518938': [1.0, 69.0], 'GSM5518939': [0.0, 68.0], 'GSM5518940': [1.0, 57.0], 'GSM5518941': [1.0, 69.0], 'GSM5518942': [0.0, 69.0], 'GSM5518943': [1.0, 67.0], 'GSM5518944': [0.0, 60.0], 'GSM5518945': [1.0, 66.0], 'GSM5518946': [0.0, 44.0], 'GSM5518947': [0.0, 50.0], 'GSM5518948': [1.0, 60.0], 'GSM5518949': [0.0, 52.0], 'GSM5518950': [1.0, 52.0], 'GSM5518951': [0.0, 48.0], 'GSM5518952': [0.0, 65.0], 'GSM5518953': [1.0, 66.0], 'GSM5518954': [1.0, 62.0], 'GSM5518955': [1.0, 57.0], 'GSM5518956': [0.0, 50.0], 'GSM5518957': [1.0, 68.0], 'GSM5518958': [1.0, 63.0], 'GSM5518959': [0.0, 68.0], 'GSM5518960': [1.0, 63.0], 'GSM5518961': [0.0, 67.0], 'GSM5518962': [1.0, 68.0], 'GSM5518963': [0.0, 65.0], 'GSM5518964': [1.0, 69.0], 'GSM5518965': [0.0, 69.0], 'GSM5518966': [0.0, 66.0], 'GSM5518967': [0.0, 41.0], 'GSM5518968': [1.0, 69.0], 'GSM5518969': [0.0, 69.0], 'GSM5518970': [0.0, 41.0], 'GSM5518971': [0.0, 58.0], 'GSM5518972': 

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs (gene or probe identifiers) for future observation.
print(gene_data.index[:20])


Index(['2824546_st', '2824549_st', '2824551_st', '2824554_st', '2827992_st',
       '2827995_st', '2827996_st', '2828010_st', '2828012_st', '2835442_st',
       '2835447_st', '2835453_st', '2835456_st', '2835459_st', '2835461_st',
       '2839509_st', '2839511_st', '2839513_st', '2839515_st', '2839517_st'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Analyzing the gene identifiers in the dataset
# The identifiers like '2824546_st' appear to be Affymetrix probe IDs, not human gene symbols
# These identifiers need to be mapped to standard gene symbols for biological interpretation

# Concluding with the required format
requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'probeset_id': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['11869', '29554', '69091', '160446', '317811'], 'stop': ['14409', '31109', '70008', '161525', '328581'], 'total_probes': [49.0, 60.0, 30.0, 30.0, 191.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102', 'ENST00000408384 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000408384 // MIR1302-10 // microRNA 1302-10 // --- // 100422834 /// ENST0000040838

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Let's obtain proper gene mapping information
print("Extracting gene mapping information for platform GPL17586...")

# For Affymetrix platforms, the annotation data is often in the 'gene_assignment' field
# Let's extract this from the SOFT file more systematically
probe_to_gene_mapping = {}

with gzip.open(soft_file, 'rt', encoding='utf-8') as f:
    current_probe = None
    for line in f:
        line = line.strip()
        
        # Identify probe lines
        if line.startswith('^'):
            parts = line[1:].split()
            if parts and (parts[0].endswith('_st') or parts[0].endswith('_at')):
                current_probe = parts[0]
        
        # Extract gene assignment for current probe
        elif current_probe and line.startswith('!Sample_platform_id') and 'GPL17586' in line:
            # Confirm we're in the right platform section
            pass
        elif current_probe and line.startswith('gene_assignment'):
            try:
                gene_info = line.split('=', 1)[1].strip()
                probe_to_gene_mapping[current_probe] = gene_info
            except:
                pass
            current_probe = None

print(f"Extracted {len(probe_to_gene_mapping)} probe-to-gene mappings")

# If extraction failed from SOFT file, look for any platform annotation files
if len(probe_to_gene_mapping) < 1000:  # If we got too few mappings
    print("Looking for platform annotation files...")
    
    # Create a synthetic mapping as a fallback, but ensure gene symbols get extracted properly
    platform_files = [f for f in os.listdir(in_cohort_dir) if 'platform' in f.lower() or 'gpl17586' in f.lower()]
    
    if platform_files:
        platform_file_path = os.path.join(in_cohort_dir, platform_files[0])
        print(f"Found platform file: {platform_file_path}")
        
        try:
            platform_df = pd.read_csv(platform_file_path, sep='\t', comment='#')
            if 'ID' in platform_df.columns and 'Gene Symbol' in platform_df.columns:
                mapping_data = platform_df[['ID', 'Gene Symbol']].rename(columns={'Gene Symbol': 'Gene'})
                print(f"Successfully extracted {len(mapping_data)} mappings from platform file")
            else:
                # Try to find the right columns
                possible_id_cols = [col for col in platform_df.columns if 'id' in col.lower() or 'probe' in col.lower()]
                possible_gene_cols = [col for col in platform_df.columns if 'gene' in col.lower() and 'symbol' in col.lower()]
                
                if possible_id_cols and possible_gene_cols:
                    mapping_data = platform_df[[possible_id_cols[0], possible_gene_cols[0]]].rename(
                        columns={possible_id_cols[0]: 'ID', possible_gene_cols[0]: 'Gene'})
                    print(f"Extracted {len(mapping_data)} mappings from detected columns")
        except:
            print("Failed to parse platform file")

# If all extraction methods failed, create mapping based on probe IDs in gene_data
if len(probe_to_gene_mapping) < 1000:
    print("Using probes from gene_data as mapping...")
    # Create mapping dictionary with probe IDs
    mapping_data = pd.DataFrame({
        'ID': gene_data.index,
        'Gene': gene_data.index  # Will be processed by extract_human_gene_symbols
    })
else:
    # Convert the mapping dictionary to a DataFrame
    mapping_data = pd.DataFrame({
        'ID': list(probe_to_gene_mapping.keys()),
        'Gene': list(probe_to_gene_mapping.values())
    })

# Ensure the ID column is string type
mapping_data = mapping_data.astype({'ID': 'str'})

# Preview the mapping data
print("\nMapping data preview:")
print(mapping_data.head())

# 3. Apply gene mapping to convert probe-level measurements to gene expression data
gene_data = apply_gene_mapping(gene_data, mapping_data)

# Normalize gene symbols to ensure consistency
gene_data = normalize_gene_symbols_in_index(gene_data)

# Preview the resulting gene expression data
print("\nGene expression data after mapping:")
print(f"Shape: {gene_data.shape}")
print(f"Number of unique genes: {len(gene_data.index)}")
if len(gene_data) > 0:
    print(f"Sample of gene symbols: {gene_data.index[:10].tolist()}")
else:
    print("No genes were mapped. The gene expression dataframe is empty.")

Extracting gene mapping information for platform GPL17586...


Extracted 0 probe-to-gene mappings
Looking for platform annotation files...
Found platform file: ../../input/GEO/Type_2_Diabetes/GSE182121/GSE182121-GPL17586_series_matrix.txt.gz
Failed to parse platform file
Using probes from gene_data as mapping...

Mapping data preview:
           ID        Gene
0  2824546_st  2824546_st
1  2824549_st  2824549_st
2  2824551_st  2824551_st
3  2824554_st  2824554_st
4  2827992_st  2827992_st



Gene expression data after mapping:
Shape: (0, 49)
Number of unique genes: 0
No genes were mapped. The gene expression dataframe is empty.
