In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Type_1_Diabetes"
cohort = "GSE162622"

# Input paths
in_trait_dir = "../../input/GEO/Type_1_Diabetes"
in_cohort_dir = "../../input/GEO/Type_1_Diabetes/GSE162622"

# Output paths
out_data_file = "../../output/preprocess/Type_1_Diabetes/GSE162622.csv"
out_gene_data_file = "../../output/preprocess/Type_1_Diabetes/gene_data/GSE162622.csv"
out_clinical_data_file = "../../output/preprocess/Type_1_Diabetes/clinical_data/GSE162622.csv"
json_path = "../../output/preprocess/Type_1_Diabetes/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes handed to the library helper: the first list is used for the
# background information, the second for the per-sample clinical table.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table as the unique values found in each row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces so the next step can decide which rows hold trait/age/gender.
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Probiotic normalization of systemic inflammation in siblings of Type 1 diabetes patients"
!Series_summary	"BACKGROUND.  The incidence of Type 1 Diabetes (T1D) has significantly increased in recent decades and coincides with lifestyle changes that have likely altered the composition of the gut microbiota.  Dysbiosis and gut barrier dysfunction are associated with T1D, and notably, our studies have identified an inflammatory state in T1D families that is consistent microbial antigen exposure.     METHODS.  We conducted a 6-week, single-arm, open-label trial to investigate whether daily multi-strain probiotic (Bifidobacteria, Lactobacillus, and Streptococcus) supplementation could reduce the familial inflammatory state in 25 unaffected siblings of diabetes patients.    RESULTS.  Probiotic supplementation was found safe and well-tolerated; there were no adverse events and participant adherence was 93%.  Bacterial 16S rDNA gene sequencing of stool reve

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import pandas as pd
import os
import json
from typing import Optional, Callable, Dict, Any

# 1. Gene Expression Data Availability
# The series is a stimulation experiment (cells treated with plasma from siblings
# of T1D patients), but what is measured is still gene expression: Step 3 loads a
# 54675-probe Affymetrix matrix and the final validation in Step 7 records
# is_gene_available=True. The flag is set to True here so both records agree;
# the cohort is still filtered out below because no trait information exists.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# The sample characteristics expose no trait, age, or gender field. According to
# the background information every subject is an unaffected sibling of a T1D
# patient, so the trait would be constant even if it were recorded.
trait_row = None  # No direct trait data
age_row = None    # No age data
gender_row = None # No gender data


# The converters are defined for interface completeness only; with all three
# row indices set to None they are never applied to any value.
def convert_trait(value: str) -> Optional[int]:
    """Placeholder trait converter; always returns None because no trait row exists."""
    return None


def convert_age(value: str) -> Optional[float]:
    """Placeholder age converter; always returns None because no age row exists."""
    return None


def convert_gender(value: str) -> Optional[int]:
    """Placeholder gender converter; always returns None because no gender row exists."""
    return None


# 3. Save Metadata
# Initial (non-final) usability filter: trait availability follows directly from
# whether a trait row was identified.
is_trait_available = trait_row is not None

validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Skipped: trait_row is None, so there is no clinical data to extract.


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Resolve the SOFT and matrix file paths again so this cell does not rely on
#    variables left over from Step 1.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Load the expression matrix with the library helper.
try:
    gene_data = get_genetic_data(matrix_file)
except Exception as e:
    print(f"Error extracting gene data: {e}")
    # Re-raise instead of continuing: the mapping and normalisation steps use
    # `gene_data` unconditionally, so swallowing the error here would only
    # surface later as a NameError (or silently reuse a stale `gene_data`
    # from a previous kernel run).
    raise

print(f"Gene data shape: {gene_data.shape}")

# 3. Print the first 20 row IDs (gene or probe identifiers)
print("First 20 gene/probe identifiers:")
print(gene_data.index[:20])


Matrix file found: ../../input/GEO/Type_1_Diabetes/GSE162622/GSE162622_series_matrix.txt.gz


Gene data shape: (54675, 50)
First 20 gene/probe identifiers:
Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at',
       '1494_f_at', '1552256_a_at', '1552257_a_at', '1552258_at', '1552261_at',
       '1552263_at', '1552264_a_at', '1552266_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Gene identifier review:
# the row IDs printed in the previous step (e.g. "1007_s_at") look like
# Affymetrix probe IDs, not human gene symbols, so they have to be translated
# to gene symbols before the data can be interpreted at gene level.
requires_gene_mapping: bool = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Scan the SOFT file once, line by line. Streaming avoids holding the whole
# decompressed file in memory and avoids decompressing it twice.
platform_pattern = re.compile(r'!Platform_title\s*=\s*(.+)$')
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
context_line_count = 10  # lines kept after a matching header to show the annotation structure

platform_sections = []   # every platform title found, in file order
annotation_lines = []    # matching header lines plus their trailing context
pending_context = 0      # how many context lines are still owed to the last match
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        # Platform titles are matched at the start of a line only.
        title_match = platform_pattern.match(line)
        if title_match:
            platform_sections.append(title_match.group(1))

        if pending_context > 0:
            # Context lines are collected verbatim and not re-tested against
            # the symbol pattern.
            annotation_lines.append(line)
            pending_context -= 1
        elif symbol_pattern.search(line):
            annotation_lines.append(line)
            pending_context = context_line_count

if platform_sections:
    print(f"Platform title found: {platform_sections[0]}")

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If an alternative mapping source is needed, list any file in the cohort
# directory whose name mentions 'annotation' or 'platform'.
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


Platform title found: [HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array



Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at', '1316_at', '1320_at', '1405_i_at', '1431_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861', 'L13852', 'X55005', 'X79510', 'M21121', 'J02843'], 'SPOT_ID': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprieta

### Step 6: Gene Identifier Mapping

In [7]:
# Build the probe-to-symbol lookup from the annotation table:
#   'ID'          -> the Affymetrix probe IDs, matching the expression matrix index
#   'Gene Symbol' -> the human gene symbols to map onto
gene_mapping = get_gene_mapping(gene_annotation, prob_col='ID', gene_col='Gene Symbol')

# Inspect the lookup table.
print(f"Gene mapping dataframe shape: {gene_mapping.shape}")
print("First few rows of gene mapping:", gene_mapping.head(), sep="\n")

# Convert the probe-level matrix into a gene-level matrix using the lookup.
gene_data = apply_gene_mapping(gene_data, gene_mapping)

# Inspect the mapped expression matrix (top-left 5x5 corner only).
print(f"Gene expression dataframe shape after mapping: {gene_data.shape}")
print("First few genes and samples:", gene_data.iloc[:5, :5], sep="\n")


Gene mapping dataframe shape: (45782, 2)
First few rows of gene mapping:
          ID              Gene
0  1007_s_at  DDR1 /// MIR4640
1    1053_at              RFC2
2     117_at             HSPA6
3     121_at              PAX8
4  1255_g_at            GUCA1A


Gene expression dataframe shape after mapping: (21278, 50)
First few genes and samples:
          GSM4955523  GSM4955524  GSM4955525  GSM4955526  GSM4955527
Gene                                                                
A1BG         4.76290     4.56200     4.42976     4.34903     4.65943
A1BG-AS1     3.48629     3.62528     3.10284     3.50675     3.21579
A1CF         6.34753     6.16517     6.58335     6.14196     5.91829
A2M          7.97211     7.30928     7.74898     6.64040     7.87064
A2M-AS1      6.58995     5.82725     5.57752     6.33873     6.25530


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalise the gene symbols used as the index, then persist the gene matrix.
gene_data = normalize_gene_symbols_in_index(gene_data)
gene_output_dir = os.path.dirname(out_gene_data_file)
os.makedirs(gene_output_dir, exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# Step 2 found no trait information for this cohort: it is a stimulation
# experiment (UPN119 cells treated with plasma from siblings of T1D patients),
# not expression data taken from Type 1 Diabetes patients. Nothing can be
# linked, so the remainder of this cell only records the cohort as unusable.

# The final validation call is given a dataframe, so pass a small stand-in:
# the top-left 5x5 corner of the gene matrix plus a constant dummy trait column.
df_for_validation = pd.DataFrame(gene_data.iloc[:5, :5])
df_for_validation[trait] = 0

# With no trait data available the cohort is flagged as biased for this trait.
is_biased = True

# Record the final verdict for this cohort in the cohort-info JSON.
cohort_note = "Dataset contains gene expression data from UPN119 cells stimulated with plasma from siblings of T1D patients, not directly from T1D patients."
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,    # expression data exists
    is_trait_available=False,  # no Type 1 Diabetes trait data
    is_biased=is_biased,
    df=df_for_validation,
    note=cohort_note,
)

print("Dataset is not usable for Type 1 Diabetes trait analysis because it lacks relevant clinical data.")
print("Gene expression data has been saved, but no linked data will be produced.")

Normalized gene data saved to ../../output/preprocess/Type_1_Diabetes/gene_data/GSE162622.csv
Dataset is not usable for Type 1 Diabetes trait analysis because it lacks relevant clinical data.
Gene expression data has been saved, but no linked data will be produced.
