In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
# `trait` and `cohort` are the single source of truth; every path below is
# derived from them so that changing either one cannot leave a stale path behind.
trait = "Type_1_Diabetes"
cohort = "GSE232310"

# Input paths
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Resolve the SOFT file and the series-matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Pull the series-level background lines and the per-sample characteristic
#    lines out of the matrix file, selected by their line prefixes
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a sample characteristics dictionary
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both artefacts so the row indices for trait/age/gender can be chosen by eye
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Monocytes in Type 1 diabetes families exhibit high cytolytic activity and subset abundances that correlate with clinical progression"
!Series_summary	"Monocytes are immune regulators implicated in the pathogenesis of Type 1 diabetes (T1D), an autoimmune disease that targets the insulin-producing pancreatic β-cell.  We determined that monocytes of recent onset (RO) T1D patients and their healthy siblings express proinflammatory/cytolytic transcriptomes and hyper-secrete cytokines in response to lipopolysaccharide exposure compared to unrelated healthy controls (uHC).  Flow cytometry measured elevated circulating abundances of intermediate monocytes and >2-fold more CD14+CD16+HLADR+KLRD1+PRF1+ NK-like monocytes among ROT1D patients compared to uHC.  The intermediate to nonclassical monocyte ratio among ROT1D patients correlated with the decline in functional β-cell mass during the first 24 months post-onset.  Among sibling non-progressors, flow cyto

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import json
from typing import Dict, Any, Optional, Callable

# 1. Gene expression availability
# The series background describes RNA extracted from monocytes, which points to
# gene expression data rather than miRNA-only or methylation-only data.
is_gene_available = True

# 2. Variable availability
# 2.1 Which rows of the sample characteristics dictionary hold each variable

# Trait (Type 1 Diabetes): row 1, the 'sampletype' field, carries the T1D status.
trait_row = 1

# Neither age nor gender is recorded in the sample characteristics.
age_row = gender_row = None

# 2.2 Data Type Conversion
def convert_trait(value: str) -> Optional[int]:
    """
    Convert a sample-type characteristic into a binary trait label.

    Args:
        value: Raw characteristic text, optionally in "key: value" form
            (only the text after the first colon is inspected).

    Returns:
        1 for T1D cases, 0 for controls, and None when the input is not a
        string or matches neither group.
    """
    if not isinstance(value, str):
        return None

    # Keep only the payload after the first colon, if there is one
    if ':' in value:
        value = value.split(':', 1)[1].strip()

    # Case markers are tested before control markers, as in the original logic.
    # 'recent onsite of T1D' was a misspelling that could never match the
    # correctly spelled label; it is retained only for backward compatibility.
    case_markers = ('ROT1D', 'recent onset of T1D', 'recent onsite of T1D')
    control_markers = ('uHC', 'HRS', 'LRS', 'healthy')

    if any(marker in value for marker in case_markers):
        return 1  # T1D case
    if any(marker in value for marker in control_markers):
        return 0  # Control
    return None  # Unknown

def convert_age(value: str) -> Optional[float]:
    """Age stub: no age row exists for this cohort, so every input maps to None."""
    return None

def convert_gender(value: str) -> Optional[int]:
    """Gender stub: no gender row exists for this cohort, so every input maps to None."""
    return None

# 3. Save metadata - initial filtering on usability
# The trait counts as available exactly when a trait row was identified.
is_trait_available = trait_row is not None

validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
if trait_row is None:
    # Nothing to extract without a trait row
    print("Clinical data not available, skipping feature extraction.")
else:
    try:
        # `clinical_data` is the in-memory dataframe produced in Step 1
        selected_clinical_df = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            age_row=age_row,
            convert_age=convert_age,
            gender_row=gender_row,
            convert_gender=convert_gender
        )

        # Show a preview of what was extracted
        preview = preview_df(selected_clinical_df)
        print("Preview of extracted clinical features:", preview, sep="\n")

        # Create the target directory if needed, then persist the features as CSV
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        selected_clinical_df.to_csv(out_clinical_data_file)
        print(f"Clinical data saved to: {out_clinical_data_file}")
    except Exception as err:
        # Best-effort: report the failure and let the notebook continue
        print(
            f"Error processing clinical data: {err}",
            "Unable to extract clinical features. Please ensure clinical_data variable is available.",
            sep="\n",
        )


Preview of extracted clinical features:
{'GSM7325164': [0.0], 'GSM7325165': [0.0], 'GSM7325166': [0.0], 'GSM7325167': [0.0], 'GSM7325168': [0.0], 'GSM7325169': [0.0], 'GSM7325170': [0.0], 'GSM7325171': [0.0], 'GSM7325172': [0.0], 'GSM7325173': [0.0], 'GSM7325174': [0.0], 'GSM7325175': [0.0], 'GSM7325176': [0.0], 'GSM7325177': [0.0], 'GSM7325178': [0.0], 'GSM7325179': [0.0], 'GSM7325180': [0.0], 'GSM7325181': [0.0], 'GSM7325182': [0.0], 'GSM7325183': [0.0], 'GSM7325184': [0.0], 'GSM7325185': [0.0], 'GSM7325186': [0.0], 'GSM7325187': [0.0], 'GSM7325188': [0.0], 'GSM7325189': [0.0], 'GSM7325190': [0.0], 'GSM7325191': [0.0], 'GSM7325192': [0.0], 'GSM7325193': [0.0], 'GSM7325194': [0.0], 'GSM7325195': [0.0], 'GSM7325196': [0.0], 'GSM7325197': [0.0], 'GSM7325198': [0.0], 'GSM7325199': [0.0], 'GSM7325200': [0.0], 'GSM7325201': [0.0], 'GSM7325202': [0.0], 'GSM7325203': [0.0], 'GSM7325204': [0.0], 'GSM7325205': [0.0], 'GSM7325206': [0.0], 'GSM7325207': [0.0], 'GSM7325208': [0.0], 'GSM7325209': 

### Step 3: Gene Data Extraction

In [4]:
# 1. Resolve the SOFT and matrix file paths for this cohort again
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Read the expression table from the matrix file via the library helper,
# 3. then report its shape and the first 20 row IDs (gene or probe identifiers)
try:
    gene_data = get_genetic_data(matrix_file)
    print(
        f"Gene data shape: {gene_data.shape}",
        "First 20 gene/probe identifiers:",
        gene_data.index[:20],
        sep="\n",
    )
except Exception as err:
    # Best-effort: report the failure instead of aborting the notebook.
    # NOTE(review): `gene_data` stays undefined on this path, so later cells will fail.
    print(f"Error extracting gene data: {err}")


Matrix file found: ../../input/GEO/Type_1_Diabetes/GSE232310/GSE232310_series_matrix.txt.gz


Gene data shape: (54675, 62)
First 20 gene/probe identifiers:
Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at',
       '1494_f_at', '1552256_a_at', '1552257_a_at', '1552258_at', '1552261_at',
       '1552263_at', '1552264_a_at', '1552266_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Gene identifier review.
# The row identifiers printed in Step 3 (e.g. '1007_s_at', '1053_at') look like
# Affymetrix probe-set IDs (note the '_at' suffixes), not human gene symbols.
# They therefore have to be mapped to gene symbols before analysis.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# `gzip` and `re` were previously only reachable through the star import from
# tools.preprocess; import them explicitly so this cell does not depend on that.
import gzip
import re

# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Scan the SOFT file once, line by line, collecting:
#   * platform titles ("!Platform_title = ..." lines), and
#   * any line that looks like a probe-ID/gene-symbol table header, plus up to
#     10 lines that follow it, to show the annotation structure.
# Streaming avoids decompressing the file twice and holding its full text in a
# notebook-level variable.
platform_title_pattern = re.compile(r'^!Platform_title\s*=\s*(.+)$')
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)

platform_sections = []
annotation_lines = []
context_remaining = 0  # lines still to capture after the latest header match
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        title_match = platform_title_pattern.match(line.rstrip('\n'))
        if title_match:
            platform_sections.append(title_match.group(1))

        if context_remaining > 0:
            # Context lines are captured verbatim and not re-tested as headers
            annotation_lines.append(line)
            context_remaining -= 1
        elif symbol_pattern.search(line):
            annotation_lines.append(line)
            context_remaining = 10

if platform_sections:
    print(f"Platform title found: {platform_sections[0]}")

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If we need an alternative source of mapping, check if there are any other annotation files in the cohort directory
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


Platform title found: [HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array



Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at', '1316_at', '1320_at', '1405_i_at', '1431_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861', 'L13852', 'X55005', 'X79510', 'M21121', 'J02843'], 'SPOT_ID': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprieta

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Pick the annotation columns used for the mapping:
#    'ID' holds the probe IDs that match the gene_data index, and
#    'Gene Symbol' holds the human gene symbols.
gene_id_col, gene_symbol_col = 'ID', 'Gene Symbol'

print(f"Using {gene_id_col} as probe ID column and {gene_symbol_col} as gene symbol column for mapping")

# 2. Build the probe -> symbol mapping dataframe from those two columns
gene_mapping = get_gene_mapping(gene_annotation, gene_id_col, gene_symbol_col)
print(
    f"Created gene mapping dataframe with shape: {gene_mapping.shape}",
    "First few rows of gene mapping:",
    gene_mapping.head(),
    sep="\n",
)

# 3. Turn probe-level measurements into gene-level expression data.
#    NOTE: `gene_data` is overwritten here, so re-running this cell without
#    re-running Step 3 would feed already-mapped data back into the mapping.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
print(
    f"Converted probe-level data to gene expression data with shape: {gene_data.shape}",
    "First few genes in the expression data:",
    gene_data.index[:10],
    sep="\n",
)

# 4. Normalize the gene symbols in the index via the library helper
gene_data = normalize_gene_symbols_in_index(gene_data)
print(
    f"After normalizing gene symbols, shape: {gene_data.shape}",
    "First few normalized genes:",
    gene_data.index[:10],
    sep="\n",
)

# Persist the gene-level expression table, creating the directory if needed
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to: {out_gene_data_file}")


Using ID as probe ID column and Gene Symbol as gene symbol column for mapping
Created gene mapping dataframe with shape: (45782, 2)
First few rows of gene mapping:
          ID              Gene
0  1007_s_at  DDR1 /// MIR4640
1    1053_at              RFC2
2     117_at             HSPA6
3     121_at              PAX8
4  1255_g_at            GUCA1A


Converted probe-level data to gene expression data with shape: (21278, 62)
First few genes in the expression data:
Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1',
       'A4GALT', 'A4GNT', 'AA06'],
      dtype='object', name='Gene')
After normalizing gene symbols, shape: (19845, 62)
First few normalized genes:
Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1',
       'A4GALT', 'A4GNT', 'AA06'],
      dtype='object', name='Gene')


Gene expression data saved to: ../../output/preprocess/Type_1_Diabetes/gene_data/GSE232310.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Gene symbols were already normalized in the previous step, so that is not repeated here.

# 2. Link clinical and genetic data using the appropriate function from the library.
# Both tables are reloaded from the CSV files written by the earlier steps.
# The trait column is referenced through the `trait` variable from the
# configuration cell rather than a repeated string literal, so this cell cannot
# drift out of sync with the rest of the notebook.
try:
    clinical_data = pd.read_csv(out_clinical_data_file, index_col=0)
    gene_data = pd.read_csv(out_gene_data_file, index_col=0)

    print(f"Loaded clinical data with shape: {clinical_data.shape}")
    print(f"Loaded gene data with shape: {gene_data.shape}")

    # Use the proper function to link clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(clinical_data, gene_data)
    print(f"Linked data shape: {linked_data.shape}")
    print("Linked data preview (first 5 rows, 5 columns):")
    print(linked_data.iloc[:5, :5] if not linked_data.empty else "Linked data is empty")

    # 3. Handle missing values
    linked_data = handle_missing_values(linked_data, trait)
    print(f"Data shape after handling missing values: {linked_data.shape}")

    # 4. Check for bias in features
    is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

    # 5. Validate and save cohort information
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_biased,
        df=linked_data,
        note="Dataset contains gene expression data from monocytes of patients with Type 1 Diabetes and healthy controls."
    )

    # 6. Save the linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")
    else:
        print("Dataset is not usable for analysis. No linked data file saved.")

except Exception as e:
    print(f"An error occurred during data linking and processing: {e}")
    # Ensure metadata is still saved even if processing fails
    validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=True,  # Default to biased if we couldn't process properly
        df=pd.DataFrame(),  # Empty dataframe
        note=f"Error during processing: {str(e)}"
    )
    print("Metadata saved with error information.")

Loaded clinical data with shape: (1, 62)
Loaded gene data with shape: (19845, 62)
Linked data shape: (62, 19846)
Linked data preview (first 5 rows, 5 columns):
            Type_1_Diabetes     A1BG  A1BG-AS1     A1CF      A2M
GSM7325164              0.0  5.42892   4.72224  7.72841  7.80370
GSM7325165              0.0  5.39074   4.75294  7.37895  8.95568
GSM7325166              0.0  5.41052   4.60512  7.31539  7.84736
GSM7325167              0.0  5.08744   4.66810  7.92333  7.48599
GSM7325168              0.0  4.76004   4.54256  8.06764  7.59244


Data shape after handling missing values: (62, 19846)
For the feature 'Type_1_Diabetes', the least common label is '1.0' with 14 occurrences. This represents 22.58% of the dataset.
The distribution of the feature 'Type_1_Diabetes' in this dataset is fine.



Linked data saved to ../../output/preprocess/Type_1_Diabetes/GSE232310.csv
