In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait / cohort pair from which every path below is derived.
# Changing either value here is enough to retarget all inputs and outputs.
trait = "Type_1_Diabetes"
cohort = "GSE71799"

# Input paths (relative to the notebook's working directory)
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Step 1a: locate the cohort's SOFT file and series-matrix file.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Step 1b: read the matrix file, keeping the series-level background lines and
# the per-sample accession / characteristics lines selected by these prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 1c: derive the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 1d: show both pieces so the next step can decide which rows are usable.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Identification of molecular signatures of cystic fibrosis disease status using plasma-based functional genomics"
!Series_summary	"The complex milieu of inflammatory mediators associated with many diseases is often too dilute to directly measure in the periphery, necessitating development of more sensitive measurements suitable for mechanistic studies, earlier diagnosis, guiding selection of therapy, and monitoring interventions.  Previously, we determined that plasma of recent-onset (RO) Type 1 diabetes (T1D) patients induce a proinflammatory transcriptional signature in fresh peripheral blood mononuclear cells (PBMC) relative to that of unrelated healthy controls (HC).  Here, using an optimized cryopreserved PBMC-based protocol, we compared the signature found between unrelated healthy controls and non-diabetic cystic fibrosis patients possessing Pseudomonas aeruginosa pulmonary tract infection."
!Series_overall_design	"UPN727 cells were stimulat

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene expression availability.
# Judged available from the series background information.
is_gene_available = True

# 2. Rows of the sample characteristics that carry trait, age and gender.
# The cohort was judged to be a cystic fibrosis study rather than a
# Type_1_Diabetes one, so no row provides Type 1 Diabetes status; no age or
# gender row was identified either. All three keys are therefore unset.
trait_row = age_row = gender_row = None

# Define conversion functions (even though they won't be used in this case)
def convert_trait(value):
    """Convert a raw sample-characteristics value to a binary Type 1 Diabetes label.

    Parameters
    ----------
    value : Any
        Raw cell value. It is stringified; when it contains a colon, only the
        text after the first colon is inspected. Matching is case-insensitive.

    Returns
    -------
    int or None
        1 when the text mentions "diabetes" or "t1d";
        0 when it mentions "control" or "healthy", or contains "hc" as a
        standalone token;
        None when ``value`` is None or nothing matches.
    """
    import re  # local import keeps the helper usable on its own

    if value is None:
        return None

    text = str(value)
    if ':' in text:
        text = text.split(':', 1)[1].strip()
    text = text.lower()

    if "diabetes" in text or "t1d" in text:
        return 1

    # "hc" is only two letters, so a plain substring test also fires inside
    # unrelated words such as "healthcare" or "HCl". Require that it is not
    # glued to other letters or digits.
    hc_as_token = re.search(r'(?<![a-z0-9])hc(?![a-z0-9])', text) is not None
    if "control" in text or "healthy" in text or hc_as_token:
        return 0

    return None

def convert_age(value):
    """Convert a raw sample-characteristics value to a numeric age.

    Parameters
    ----------
    value : Any
        Raw cell value. It is stringified; when it contains a colon, only the
        text after the first colon is inspected.

    Returns
    -------
    float or None
        The first number found in the text (e.g. 12.0 for "12-15 years"), or
        None when ``value`` is None or the text contains no number.
    """
    import re  # local import keeps the helper usable on its own

    if value is None:
        return None

    text = str(value)
    if ':' in text:
        text = text.split(':', 1)[1].strip()

    # Take the first numeric token only. Concatenating every digit and dot in
    # the string would turn "12-15" into 1215 and "approx. 45" into 0.45.
    number = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
    if number is None:
        return None
    return float(number.group())

def convert_gender(value):
    """Convert a raw sample-characteristics value to a binary gender code.

    Parameters
    ----------
    value : Any
        Raw cell value. It is stringified and lower-cased; when it contains a
        colon, only the text after the first colon is inspected.

    Returns
    -------
    int or None
        0 for female, 1 for male, None when ``value`` is None or the text
        identifies neither.
    """
    import re  # local import keeps the helper usable on its own

    if value is None:
        return None

    text = str(value).lower()
    if ':' in text:
        text = text.split(':', 1)[1].strip()

    # Full words first; "female" must be tested before "male" because the
    # latter is a substring of the former.
    if 'female' in text:
        return 0
    if 'male' in text:
        return 1

    # Single-letter codes count only as standalone tokens. A substring test
    # would classify "not specified" as female and "missing" as male.
    tokens = re.findall(r'[a-z]+', text)
    if 'f' in tokens:
        return 0
    if 'm' in tokens:
        return 1

    return None

# 3. Record the initial (non-final) usability metadata for this cohort.
# The trait counts as available only if a characteristics row was identified for it.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    cohort=cohort,
    info_path=json_path,
    is_final=False,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. trait_row is None (no Type 1 Diabetes data), so clinical feature extraction is skipped.


False

### Step 3: Gene Data Extraction

In [4]:
# 1. Resolve the SOFT and matrix file paths once more for this cohort.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2-3. Load the expression matrix through the library helper and report its
# shape plus the first 20 row identifiers. Any failure along the way is
# reported on stdout instead of being raised.
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")

    # Row identifiers are gene or probe IDs; the next step inspects them.
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as err:
    print(f"Error extracting gene data: {err}")


Matrix file found: ../../input/GEO/Type_1_Diabetes/GSE71799/GSE71799_series_matrix.txt.gz


Gene data shape: (54675, 134)
First 20 gene/probe identifiers:
Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at',
       '1494_f_at', '1552256_a_at', '1552257_a_at', '1552258_at', '1552261_at',
       '1552263_at', '1552264_a_at', '1552266_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row identifiers printed in the previous step ('1007_s_at', '1053_at', ...)
# follow the Affymetrix probe-ID format and are not human gene symbols, so they
# must be mapped to gene symbols before any gene-level analysis.
requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Scan the SOFT file once, line by line, for
#   (a) platform title lines, and
#   (b) header lines that look like a probe-ID -> gene-symbol table, each kept
#       together with the lines that follow it.
# Streaming avoids holding the whole decompressed file in memory and avoids
# decompressing it twice.
platform_title_pattern = re.compile(r'^!Platform_title\s*=\s*(.+)$')
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
n_context_lines = 10  # lines kept after each matching header to show the table layout

platform_sections = []
annotation_lines = []
pending_context = 0  # context lines still to be collected for the latest header
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        # Cheap prefix test first so the regex only runs on candidate lines.
        if line.startswith('!Platform_title'):
            title_match = platform_title_pattern.match(line)
            if title_match:
                platform_sections.append(title_match.group(1))

        if pending_context > 0:
            # Lines following a header are collected verbatim and are not
            # themselves tested against the header pattern.
            annotation_lines.append(line)
            pending_context -= 1
        elif symbol_pattern.search(line):
            annotation_lines.append(line)
            pending_context = n_context_lines

# If the file ended before the context window was full, pad with empty strings
# so the printed block has the same length as a full window.
annotation_lines.extend([''] * pending_context)

if platform_sections:
    print(f"Platform title found: {platform_sections[0]}")

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If we need an alternative source of mapping, check if there are any other annotation files in the cohort directory
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


Platform title found: [HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array



Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at', '1316_at', '1320_at', '1405_i_at', '1431_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861', 'L13852', 'X55005', 'X79510', 'M21121', 'J02843'], 'SPOT_ID': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprieta

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Columns of gene_annotation used for the mapping.
# Per the annotation preview, 'ID' holds probe IDs such as '1007_s_at' and
# 'Gene Symbol' holds the gene symbols (e.g. 'DDR1 /// MIR4640').
prob_col, gene_col = 'ID', 'Gene Symbol'
print(f"Using '{prob_col}' as probe ID column and '{gene_col}' as gene symbol column")

# 2. Build the probe -> gene mapping dataframe and show its head.
mapping_df = get_gene_mapping(gene_annotation, prob_col, gene_col)
print(f"Gene mapping shape: {mapping_df.shape}")
print("First few rows of gene mapping:", mapping_df.head(), sep="\n")

# 3. Convert probe-level data to gene-level expression data.
# Note: gene_data is rebound here, so from this point on it is gene-level.
gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Gene-level expression data shape: {gene_data.shape}")
# Preview: first 5 genes, first 3 samples.
print("First few gene symbols with their expression values:", gene_data.iloc[:5, :3], sep="\n")


Using 'ID' as probe ID column and 'Gene Symbol' as gene symbol column


Gene mapping shape: (45782, 2)
First few rows of gene mapping:
          ID              Gene
0  1007_s_at  DDR1 /// MIR4640
1    1053_at              RFC2
2     117_at             HSPA6
3     121_at              PAX8
4  1255_g_at            GUCA1A


Gene-level expression data shape: (21278, 134)
First few gene symbols with their expression values:
          GSM1845886  GSM1845887  GSM1845888
Gene                                        
A1BG         4.89781     5.10467     4.84837
A1BG-AS1     3.52356     4.29494     3.88388
A1CF         5.75930     5.74441     5.66063
A2M          5.27707     5.52795     5.21202
A2M-AS1      4.92566     5.22810     4.97152


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize the gene symbols in the expression data index, then persist the
# result, creating the output directory first if it does not exist yet.
gene_data = normalize_gene_symbols_in_index(gene_data)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# No clinical data exists to link, so a one-row frame holding only the trait
# column stands in for the `df` argument of the final validation call.
# NOTE(review): the recorded run prints "Abnormality detected in the cohort"
# for this placeholder - confirm what the helper records in that case.
placeholder_df = pd.DataFrame(data={trait: [None]})

# Final validation: store the cohort record with the trait marked unavailable.
is_usable = validate_and_save_cohort_info(
    cohort=cohort,
    info_path=json_path,
    is_final=True,
    is_gene_available=True,
    is_trait_available=False,
    is_biased=False,  # default used because there is no trait to assess
    df=placeholder_df,
    note="Dataset contains gene expression data from cystic fibrosis patients, not Type 1 Diabetes. No clinical Type 1 Diabetes data is available.",
)

print("Dataset is not usable for Type_1_Diabetes analysis. No linked data file saved.")

Normalized gene data saved to ../../output/preprocess/Type_1_Diabetes/gene_data/GSE71799.csv
Abnormality detected in the cohort: GSE71799. Preprocessing failed.
Dataset is not usable for Type_1_Diabetes analysis. No linked data file saved.
