In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed.
# Every path below is derived from these two values, so switching to another
# trait or cohort only requires editing them here.
trait = "Type_2_Diabetes"
cohort = "GSE250283"

# Input paths (relative to the notebook's working directory)
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_trait_dir = f"../../output/preprocess/{trait}"
out_data_file = f"{out_trait_dir}/{cohort}.csv"
out_gene_data_file = f"{out_trait_dir}/gene_data/{cohort}.csv"
out_clinical_data_file = f"{out_trait_dir}/clinical_data/{cohort}.csv"
json_path = f"{out_trait_dir}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Locate the SOFT file and the series matrix file inside the cohort directory
relevant_paths = geo_get_relevant_filepaths(in_cohort_dir)
soft_file, matrix_file = relevant_paths

# 2. Read the matrix file, selecting lines by prefix: the '!Series_*' prefixes
#    feed the background information, the '!Sample_*' prefixes the clinical data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the sample characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Transcriptional profiles associated with coronary artery disease in Type 2 diabetes mellitus"
!Series_summary	"Coronary artery disease (CAD) is a common complication of Type 2 diabetes mellitus (T2DM). Understanding the pathogenesis of this complication is essential in both diagnosis and management. Thus, this study aimed to characterize the presence of CAD in T2DM using molecular markers and pathway analyses."
!Series_summary	"Total RNA from peripheral blood mononuclear cells (PBMCs) underwent whole transcriptomic profiling using the Illumina HumanHT-12 v4.0 expression beadchip. Differential gene expression with gene ontogeny analyses was performed, with supporting correlational analyses using weighted correlation network analysis (WGCNA)"
!Series_overall_design	"The study is a sex- and age-frequency matched case-control design comparing 23 unrelated adult Filipinos with T2DM-CAD to 23 controls (DM with CAD)."
Sample Characteristics Dictionary:
{

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene expression data availability
# The series summary printed in Step 1 describes whole transcriptomic profiling on the
# Illumina HumanHT-12 v4.0 expression beadchip, so the cohort is flagged as having gene expression data.
is_gene_available = True

# 2. Clinical feature availability and data type conversion
# 2.1 Row indices (keys of the sample characteristics dictionary) for trait, age, and gender
# NOTE(review): the dictionary printout is truncated in the saved output, so the indices
# below cannot be re-checked from this notebook alone - confirm against the matrix file.

# Trait (Type 2 Diabetes):
# Row 2 is the "sample group (dm or no dm)" field, used here as the diabetes status.
trait_row = 2

# Gender:
# Row 1 holds the gender field.
gender_row = 1

# Age:
# No age field was identified in the sample characteristics, so age is not extracted.
age_row = None

# 2.2 Data Type Conversion Functions

def convert_trait(value):
    """Convert a diabetes-status characteristic to a binary label.

    Args:
        value: Raw characteristic string, optionally in "field: value" form.
            Only the text after the first colon is interpreted.

    Returns:
        1 for a diabetes label (e.g. "DM", "T2DM with CAD"), 0 for a
        non-diabetic label ("Healthy", or a negated form such as "No DM" or
        "non-DM"), and None for non-string input or unrecognised text.
    """
    import re  # local import keeps this definition self-contained

    if not isinstance(value, str):
        return None
    if ':' in value:
        value = value.split(':', 1)[1]
    label = value.strip()

    # Negated labels must be tested first: "No DM" contains "DM", so a plain
    # substring test would count non-diabetic samples as cases.
    if re.search(r'\b(?:no|non|not|without)[\s_-]*(?:t2)?dm\b', label, re.IGNORECASE):
        return 0

    # Keep the case-sensitive substring match for upper-case "DM", and also
    # accept other casings when "dm" is not embedded in a longer word.
    if 'DM' in label or re.search(r'(?<![a-z])(?:t2)?dm(?![a-z])', label, re.IGNORECASE):
        return 1  # Has diabetes

    if 'healthy' in label.lower():
        return 0  # No diabetes
    return None

def convert_gender(value):
    """Convert a gender characteristic to a binary label (Female: 0, Male: 1).

    Args:
        value: Raw characteristic string, optionally in "field: value" form.
            Only the text after the first colon is interpreted.

    Returns:
        0 for female, 1 for male, and None for non-string input or text that
        names neither. Matching ignores case, and the single-letter codes
        "F" / "M" are accepted.
    """
    if not isinstance(value, str):
        return None
    if ':' in value:
        value = value.split(':', 1)[1]
    label = value.strip().lower()

    # "female" contains "male", so the female test has to come first.
    if 'female' in label or label == 'f':
        return 0
    if 'male' in label or label == 'm':
        return 1
    return None

def convert_age(value):
    """Parse an age characteristic into a float.

    Accepts a string, optionally in "field: value" form, and interprets the
    text after the first colon. Returns None for non-string input or when the
    text cannot be parsed as a number.
    """
    # age_row is None in this notebook; the converter is still handed to the
    # clinical feature extraction call alongside the other converters.
    if not isinstance(value, str):
        return None

    _, colon, remainder = value.partition(':')
    text = remainder.strip() if colon else value

    try:
        parsed = float(text)
    except ValueError:
        return None
    return parsed

# 3. Save Metadata
# Record the initial (non-final) availability flags for this cohort; the trait
# counts as available whenever a trait row was identified.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction (skipped when no trait row was identified)
if is_trait_available:
    # Gather the row indices and converters, then extract the clinical features
    feature_spec = dict(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender,
    )
    selected_clinical_df = geo_select_clinical_features(**feature_spec)

    # Show a preview of the extracted features
    print("Preview of selected clinical data:")
    preview_data = preview_df(selected_clinical_df)
    print(preview_data)

    # Write the clinical table to CSV, creating the output directory if needed
    clinical_out_dir = os.path.dirname(out_clinical_data_file)
    os.makedirs(clinical_out_dir, exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of selected clinical data:
{'GSM7976778': [1.0, 0.0], 'GSM7976779': [1.0, 0.0], 'GSM7976780': [1.0, 1.0], 'GSM7976781': [1.0, 1.0], 'GSM7976782': [1.0, 0.0], 'GSM7976783': [1.0, 0.0], 'GSM7976784': [1.0, 0.0], 'GSM7976785': [1.0, 0.0], 'GSM7976786': [1.0, 0.0], 'GSM7976787': [1.0, 0.0], 'GSM7976788': [1.0, 0.0], 'GSM7976789': [1.0, 0.0], 'GSM7976790': [1.0, 0.0], 'GSM7976791': [1.0, 0.0], 'GSM7976792': [1.0, 0.0], 'GSM7976793': [1.0, 1.0], 'GSM7976794': [1.0, 1.0], 'GSM7976795': [1.0, 0.0], 'GSM7976796': [1.0, 1.0], 'GSM7976797': [1.0, 1.0], 'GSM7976798': [1.0, 0.0], 'GSM7976799': [1.0, 1.0], 'GSM7976800': [1.0, 1.0], 'GSM7976801': [1.0, 0.0], 'GSM7976802': [1.0, 0.0], 'GSM7976803': [1.0, 0.0], 'GSM7976804': [1.0, 0.0], 'GSM7976805': [0.0, 0.0], 'GSM7976806': [1.0, 1.0], 'GSM7976807': [1.0, 1.0], 'GSM7976808': [0.0, 1.0], 'GSM7976809': [0.0, 0.0], 'GSM7976810': [0.0, 0.0], 'GSM7976811': [0.0, 0.0], 'GSM7976812': [0.0, 0.0], 'GSM7976813': [0.0, 0.0], 'GSM7976814': [1.0, 1.0], 'G

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene expression table from the matrix file located in Step 1
gene_data = get_genetic_data(matrix_file)

# 2. Show the first 20 row identifiers (gene or probe IDs) for later inspection
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209', 'ILMN_1651221',
       'ILMN_1651228', 'ILMN_1651229', 'ILMN_1651230', 'ILMN_1651232',
       'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253',
       'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651268', 'ILMN_1651278',
       'ILMN_1651279', 'ILMN_1651281', 'ILMN_1651282', 'ILMN_1651285'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row identifiers printed in Step 3 (ILMN_*) are Illumina probe IDs, not human gene symbols.
# They therefore have to be mapped to gene symbols before gene-level analysis.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Read the gene annotation table from the SOFT file
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation so the probe-ID and gene-symbol columns can be identified
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['ILMN_1343061', 'ILMN_1343291', 'ILMN_1343295', 'ILMN_1343321', 'ILMN_1343339'], 'ARRAY_ADDRESS_ID': ['2900397', '3450719', '4490161', '5390750', '4780100'], 'TRANSCRIPT': ['ILMN_160461', 'ILMN_137991', 'ILMN_137405', 'ILMN_160027', 'ILMN_160401'], 'ILMN_GENE': ['CY3_HYB:HIGH_1_MM2', 'EEF1A1', 'GAPDH', 'NEGATIVE_0971', 'NEGATIVE_0953'], 'PA_Call': [1.0, 1.0, 1.0, 0.0, 0.0], 'TARGETID': ['CY3_HYB:HIGH_1_MM2', 'EEF1A1', 'GAPDH', 'NEGATIVE_0971', 'NEGATIVE_0953'], 'SPECIES': ['ILMN Controls', 'Homo sapiens', 'Homo sapiens', 'ILMN Controls', 'ILMN Controls'], 'SOURCE': ['ILMN_Controls', 'RefSeq', 'RefSeq', 'ILMN_Controls', 'ILMN_Controls'], 'SEARCH_KEY': ['cy3_hyb:high_1_mm2', 'NM_001402.4', nan, 'negative_0971', 'negative_0953'], 'SOURCE_REFERENCE_ID': ['cy3_hyb:high_1_mm2', 'NM_001402.4', 'NM_002046.2', 'negative_0971', 'negative_0953'], 'REFSEQ_ID': [nan, 'NM_001402.4', 'NM_002046.2', nan, nan], 'UNIGENE_ID': [nan, nan, nan, nan, nan], 'ENTREZ_GENE_ID': 

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Choose the annotation columns used for mapping
# The expression index holds identifiers such as 'ILMN_1343295'; the annotation
# 'ID' column holds the same identifiers and 'SYMBOL' holds the gene symbols.
prob_col, gene_col = 'ID', 'SYMBOL'

# 2. Build the probe-to-gene mapping dataframe from those two columns
gene_mapping = get_gene_mapping(gene_annotation, prob_col, gene_col)

print("Gene mapping preview:")
mapping_preview = preview_df(gene_mapping)
print(mapping_preview)

# 3. Convert probe-level measurements to gene-level expression data
# NOTE: gene_data is rebound to the mapped result, so re-running this cell
# would feed the already-mapped table back into the mapping step.
gene_data = apply_gene_mapping(gene_data, gene_mapping)

mapped_shape = gene_data.shape
leading_genes = gene_data.index[:10].tolist()
print("After mapping, gene expression data shape:", mapped_shape)
print("First 10 genes after mapping:", leading_genes)


Gene mapping preview:
{'ID': ['ILMN_1343061', 'ILMN_1343291', 'ILMN_1343295', 'ILMN_1343321', 'ILMN_1343339'], 'Gene': ['cy3_hyb:high_1_mm2', 'EEF1A1', 'GAPDH', 'negative_0971', 'negative_0953']}
After mapping, gene expression data shape: (19609, 56)
First 10 genes after mapping: ['A1BG', 'A2BP1', 'A2LD1', 'A2M', 'A2ML1', 'A3GALT2', 'A4GALT', 'A4GNT', 'AAA1', 'AAAS']


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize the gene symbols in the index and save the gene-level expression table.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
# Create the target directory first, as is done for the clinical CSV;
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)

# 2. Link the clinical and genetic data with the 'geo_link_clinical_genetic_data' function from the library.
linked_data = geo_link_clinical_genetic_data(selected_clinical_df, normalized_gene_data)

# 3. Handle missing values in the linked data
linked_data = handle_missing_values(linked_data, trait)

# 4. Determine whether the trait and some demographic features are severely biased, and remove biased features.
is_trait_biased, unbiased_linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Conduct quality check and save the cohort information.
# Validate the same dataframe that gets written in step 6, so the recorded
# cohort information describes the saved data.
is_usable = validate_and_save_cohort_info(True, cohort, json_path, True, True, is_trait_biased, unbiased_linked_data)

# 6. If the linked data is usable, save it as a CSV file to 'out_data_file'.
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    unbiased_linked_data.to_csv(out_data_file)

For the feature 'Type_2_Diabetes', the least common label is '0.0' with 15 occurrences. This represents 26.79% of the dataset.
The distribution of the feature 'Type_2_Diabetes' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 20 occurrences. This represents 35.71% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

