In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: every path below is derived from these two values, so
# retargeting the notebook to another trait/cohort means editing only them.
trait = "Type_1_Diabetes"
cohort = "GSE131528"

# Input paths
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
# One JSON file per trait; the cohort name is passed separately when it is written.
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Locate the cohort's SOFT file and series-matrix file inside the cohort directory.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes handed to the matrix-file reader: the first list selects the
# series-level background lines, the second the per-sample accession and
# characteristics lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Per-row summary of the clinical dataframe; the printed result is a dict keyed
# by row index whose values are lists of characteristic strings.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of context so the row choices in the next step can be checked by eye.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"A composite immune signature parallels disease progression across T1D subjects"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"Refer to individual Series"
Sample Characteristics Dictionary:
{0: ['responder cells: UPN727'], 1: ['disease status: T1D', 'serum source - disease status: New onset T1D'], 2: ['subject: A', 'subject: B', 'subject: C', 'subject: D', 'subject: E', 'serum source - bmi: 22.609439839', 'serum source - bmi: 20.028455242', 'serum source - bmi: 16.99179819', 'serum source - bmi: 21.176533107', 'serum source - bmi: 16.617969321', 'serum source - bmi: 15.401032443', 'serum source - bmi: 20.119395077', 'serum source - bmi: 21.620647719', 'serum source - bmi: 20.436491075', 'serum source - bmi: 19.246652339', 'serum source - bmi: 20.976068397', 'serum source - bmi: 26.910153397', 'serum source - bmi: 23.269175524', 'serum source - bmi: 18.444623654', 'serum source - bmi: 20.350672

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import numpy as np
import re
from typing import Optional, Callable, Dict, Any

# 1. Gene expression data availability
# The background text only says this is a SuperSeries ("Refer to individual
# Series"), so availability is assumed here rather than shown by the metadata.
is_gene_available = True

# 2. Rows of the sample-characteristics dictionary used for each clinical variable
#
# trait  -> row 1: its values are 'disease status: T1D' and
#           'serum source - disease status: New onset T1D'.
#           NOTE(review): both values describe T1D, so the trait looks constant
#           for this cohort -- confirm whether any control samples exist.
# age    -> row 3: chosen over row 4; both were judged to carry age fields
#           ('age at enrollment' / 'age at blood draw'). The printed dictionary
#           is truncated before these rows -- TODO confirm against the full output.
# gender -> row 5: chosen over row 4; both were judged to carry sex fields
#           ('F/M' and 'female/male' spellings). Same caveat as for age.
trait_row, age_row, gender_row = 1, 3, 5

# 2.2 Data Type Conversion Functions
def convert_trait(value):
    """Map a raw disease-status string to a binary trait label.

    Args:
        value: A sample-characteristics cell such as 'disease status: T1D',
            or a missing value.

    Returns:
        1 when the text after the first colon contains 'T1D' (case-sensitive)
        or 'diabetes' (case-insensitive); 0 when it contains 'control' or
        'healthy' (case-insensitive); None for missing or unrecognised values.
        The T1D/diabetes test is applied first, so it wins when both match.
    """
    if pd.isna(value):
        return None

    # Keep only the part after the "field name:" prefix when there is one.
    label = value.split(':', 1)[1].strip() if ':' in value else value
    lowered = label.lower()

    if 'T1D' in label or 'diabetes' in lowered:
        return 1
    if any(keyword in lowered for keyword in ('control', 'healthy')):
        return 0
    return None

def convert_age(value):
    """Extract a numeric age from a raw sample-characteristics string.

    Args:
        value: A cell such as 'age at enrollment: 12.1478', or a missing value.

    Returns:
        The first integer or decimal number found after the first colon (or in
        the whole string when there is no colon) as a float; None when the
        value is missing or holds no digits.
    """
    if pd.isna(value):
        return None

    # Drop the "field name:" prefix so digits in the field name are never picked up.
    text = value.split(':', 1)[1].strip() if ':' in value else value

    found = re.search(r'(\d+(\.\d+)?)', text)
    return float(found.group(1)) if found else None

def convert_gender(value):
    """Map a raw sex/gender string to a binary code.

    Args:
        value: A cell such as 'serum source- sex: F', or a missing value.

    Returns:
        0 for 'f'/'female', 1 for 'm'/'male' (case-insensitive, compared
        against the text after the first colon); None for missing or any
        other value.
    """
    if pd.isna(value):
        return None

    # Text after the "field name:" prefix (whole string if no colon), lower-cased.
    label = (value.split(':', 1)[1].strip() if ':' in value else value).lower()

    # Exact-match lookup; anything outside these four spellings yields None.
    return {'f': 0, 'female': 0, 'm': 1, 'male': 1}.get(label)

# 3. Save Metadata
# Trait data counts as available whenever a characteristics row was selected for it.
is_trait_available = trait_row is not None

# Record the initial (non-final) availability flags for this cohort.
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# `clinical_data` is produced by Step 1. There is deliberately no
# try/except NameError around this block: if Step 1 has not run, every later
# step is broken too, so the error should surface here instead of being
# printed and ignored (which would also hide unrelated NameErrors).
if is_trait_available:
    clinical_features_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Preview the extracted features.
    preview = preview_df(clinical_features_df)
    print("Clinical Features DataFrame Preview:")
    print(preview)

    # Make sure the output directory exists, then persist the features as CSV.
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_features_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Clinical Features DataFrame Preview:
{'GSM3785210': [1.0, nan, 0.0], 'GSM3785211': [1.0, nan, 0.0], 'GSM3785212': [1.0, nan, 0.0], 'GSM3785213': [1.0, nan, 0.0], 'GSM3785214': [1.0, nan, 0.0], 'GSM3785215': [1.0, nan, 0.0], 'GSM3785216': [1.0, 12.0, nan], 'GSM3785217': [1.0, 12.0, nan], 'GSM3785218': [1.0, 12.0, nan], 'GSM3785219': [1.0, nan, 1.0], 'GSM3785220': [1.0, nan, 1.0], 'GSM3785221': [1.0, nan, 1.0], 'GSM3785222': [1.0, nan, 0.0], 'GSM3785223': [1.0, nan, 0.0], 'GSM3785224': [1.0, nan, 0.0], 'GSM3785225': [1.0, 12.1478, nan], 'GSM3785226': [1.0, 15.4771, nan], 'GSM3785227': [1.0, 13.1636, nan], 'GSM3785228': [1.0, 17.0157, nan], 'GSM3785229': [1.0, 9.2977, nan], 'GSM3785230': [1.0, 10.7187, nan], 'GSM3785231': [1.0, 25.8645, nan], 'GSM3785232': [1.0, 11.9535, nan], 'GSM3785233': [1.0, 16.0, nan], 'GSM3785234': [1.0, 19.0, nan], 'GSM3785235': [1.0, 23.0, nan], 'GSM3785236': [1.0, 27.0, nan], 'GSM3785237': [1.0, 28.0, nan], 'GSM3785238': [1.0, 15.0, nan], 'GSM3785239': [1.0, 14.

### Step 3: Gene Data Extraction

In [4]:
# 1. Resolve the SOFT and matrix file paths for this cohort.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Load the expression matrix with the library helper.
# A failure is reported and then re-raised: later steps all need `gene_data`,
# and swallowing the error would let them run against a missing variable or a
# stale one left over from an earlier execution of this cell.
try:
    gene_data = get_genetic_data(matrix_file)
except Exception as e:
    print(f"Error extracting gene data: {e}")
    raise

print(f"Gene data shape: {gene_data.shape}")

# 3. Print the first 20 row IDs (gene or probe identifiers).
print("First 20 gene/probe identifiers:")
print(gene_data.index[:20])


Matrix file found: ../../input/GEO/Type_1_Diabetes/GSE131528/GSE131528-GPL570_series_matrix.txt.gz


Gene data shape: (54675, 55)
First 20 gene/probe identifiers:
Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at',
       '1494_f_at', '1552256_a_at', '1552257_a_at', '1552258_at', '1552261_at',
       '1552263_at', '1552264_a_at', '1552266_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '1007_s_at', '1053_at', '1552256_a_at') all
# end in "_at" with optional infixes such as "_s", "_a", "_g", "_i", "_f".
# These are Affymetrix probe IDs from the GPL570 platform (HG-U133_Plus_2); the
# matrix file name carries the same GPL570 tag.
# They are not human gene symbols, so they must be mapped to gene symbols
# before any gene-level processing.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# `gzip` is imported explicitly here; otherwise it is only in scope through the
# star import from tools.preprocess.
import gzip

# 1. Load the gene annotation table from the SOFT file with the library helper.
gene_annotation = get_gene_annotation(soft_file)

# Report the first platform title declared in the SOFT file.
# The file is streamed line by line and scanning stops at the first match, so
# the whole decompressed file is never held in memory.
platform_title_pattern = re.compile(r'^!Platform_title\s*=\s*(.+)$')
platform_title = None
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        title_match = platform_title_pattern.match(line)
        if title_match:
            platform_title = title_match.group(1)
            break
if platform_title is not None:
    print(f"Platform title found: {platform_title}")

# Look for table header lines that pair an ID column with a gene-symbol column,
# and keep each hit plus the 10 lines after it to show the table layout.
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
annotation_lines = []
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        if symbol_pattern.search(line):
            annotation_lines.append(line)
            # next(f, '') consumes the following lines from the same iterator and
            # pads with empty strings if the file ends early.
            for _ in range(10):
                annotation_lines.append(next(f, ''))

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Preview the annotation table returned by the library helper.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# List any extra files in the cohort directory whose names suggest they hold
# annotation or platform data, as a possible alternative mapping source.
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [
    name for name in cohort_files
    if 'annotation' in name.lower() or 'platform' in name.lower()
]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for name in annotation_files:
        print(name)


Platform title found: [HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array



Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at', '1316_at', '1320_at', '1405_i_at', '1431_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861', 'L13852', 'X55005', 'X79510', 'M21121', 'J02843'], 'SPOT_ID': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprieta

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Annotation columns used for the mapping: 'ID' holds the probe identifiers
# that match the expression matrix index, 'Gene Symbol' the corresponding symbols.
prob_col, gene_col = 'ID', 'Gene Symbol'

# 2. Build the probe-to-gene mapping table from those two annotation columns.
mapping_df = get_gene_mapping(gene_annotation, prob_col, gene_col)
print(f"Gene mapping shape: {mapping_df.shape}")
print("First few rows of the mapping data:", mapping_df.head(), sep="\n")

# 3. Convert probe-level measurements to gene-level expression data.
# NOTE(review): `gene_data` is overwritten in place, so re-running this cell
# would presumably feed already-mapped data back through the mapping -- confirm
# and re-run Step 3 first if this cell has to be executed again.
gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Processed gene expression data shape: {gene_data.shape}")
print("First few genes in the processed gene data:", gene_data.index[:10], sep="\n")

# Persist the gene-level matrix, creating the output directory on first use.
gene_output_dir = os.path.dirname(out_gene_data_file)
os.makedirs(gene_output_dir, exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Gene mapping shape: (45782, 2)
First few rows of the mapping data:
          ID              Gene
0  1007_s_at  DDR1 /// MIR4640
1    1053_at              RFC2
2     117_at             HSPA6
3     121_at              PAX8
4  1255_g_at            GUCA1A


Processed gene expression data shape: (21278, 55)
First few genes in the processed gene data:
Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1',
       'A4GALT', 'A4GNT', 'AA06'],
      dtype='object', name='Gene')


Gene expression data saved to ../../output/preprocess/Type_1_Diabetes/gene_data/GSE131528.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data and persist the result
# (this overwrites the gene-level file written in Step 6).
gene_data = normalize_gene_symbols_in_index(gene_data)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# Re-extract the clinical features with the row indices (trait_row, age_row,
# gender_row) and converters (convert_trait, convert_age, convert_gender)
# defined in Step 2. They are reused rather than redefined or retyped here so
# the two steps cannot drift apart.
selected_clinical_df = geo_select_clinical_features(
    clinical_df=clinical_data,
    trait=trait,
    trait_row=trait_row,
    convert_trait=convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender
)

# Save the processed clinical data.
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
selected_clinical_df.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")

# 2. Link clinical and genetic data (samples become rows in the linked table).
linked_data = geo_link_clinical_genetic_data(selected_clinical_df, gene_data)
print(f"Linked data shape: {linked_data.shape}")
print("Linked data preview (first 5 rows, 5 columns):")
print(linked_data.iloc[:5, :5] if not linked_data.empty else "Linked data is empty")

# 3. Handle missing values.
linked_data = handle_missing_values(linked_data, trait)
print(f"Data shape after handling missing values: {linked_data.shape}")

# 4. Check the trait and covariates for bias; the helper returns the bias flag
# together with the (possibly reduced) linked table.
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Validate and save the final cohort record, using the availability flags
# established in Step 2.
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
    is_biased=is_biased,
    df=linked_data,
    note="Dataset contains gene expression data from Type 1 Diabetes patients."
)

# 6. Save the linked data only when the cohort was judged usable.
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset is not usable for analysis. No linked data file saved.")

Normalized gene data saved to ../../output/preprocess/Type_1_Diabetes/gene_data/GSE131528.csv
Clinical data saved to ../../output/preprocess/Type_1_Diabetes/clinical_data/GSE131528.csv
Linked data shape: (55, 19848)
Linked data preview (first 5 rows, 5 columns):
            Type_1_Diabetes  Age  Gender     A1BG  A1BG-AS1
GSM3785210              1.0  NaN     0.0  5.37196   3.82275
GSM3785211              1.0  NaN     0.0  5.64512   3.95980
GSM3785212              1.0  NaN     0.0  5.45900   4.04396
GSM3785213              1.0  NaN     0.0  5.45969   3.88714
GSM3785214              1.0  NaN     0.0  5.48324   3.78641


Data shape after handling missing values: (55, 19848)
Quartiles for 'Type_1_Diabetes':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1.0
Max: 1.0
The distribution of the feature 'Type_1_Diabetes' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 13.581800000000001
  50% (Median): 17.2841
  75%: 18.5
Min: 8.8186
Max: 34.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 3 occurrences. This represents 5.45% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.



Dataset is not usable for analysis. No linked data file saved.
