In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
# `trait` names the phenotype column handed to the feature-selection, missing-value
# and bias helpers later in the notebook; `cohort` is the key passed to
# validate_and_save_cohort_info.
trait = "Type_1_Diabetes"
cohort = "GSE123088"

# Input paths
# All paths are relative, so they resolve against the kernel's current working directory.
# `in_cohort_dir` is the directory searched for the SOFT and matrix files.
# `in_trait_dir` is not referenced anywhere else in the visible notebook.
in_trait_dir = "../../input/GEO/Type_1_Diabetes"
in_cohort_dir = "../../input/GEO/Type_1_Diabetes/GSE123088"

# Output paths
# out_data_file          -> linked clinical + gene table (written only when the cohort is usable)
# out_gene_data_file     -> gene expression table (written in Step 6 and overwritten in Step 7)
# out_clinical_data_file -> selected clinical features
# json_path              -> cohort metadata file passed as `info_path` to validate_and_save_cohort_info
out_data_file = "../../output/preprocess/Type_1_Diabetes/GSE123088.csv"
out_gene_data_file = "../../output/preprocess/Type_1_Diabetes/gene_data/GSE123088.csv"
out_clinical_data_file = "../../output/preprocess/Type_1_Diabetes/clinical_data/GSE123088.csv"
json_path = "../../output/preprocess/Type_1_Diabetes/cohort_info.json"


### Step 1: Initial Data Loading

In [None]:
from tools.preprocess import *
# 1. Locate the SOFT file and the matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read background information and sample characteristics from the matrix file,
#    selecting lines by the prefixes listed here
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information, then the sample characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
import pandas as pd
import numpy as np
import os
import json
from typing import Callable, Optional, Dict, Any, List, Union

# 1. Gene expression availability
# Being a SuperSeries does not by itself guarantee gene expression data, so the flag
# starts out conservatively as False (Step 6 reassigns it after mapping).
is_gene_available = False

# Trait availability is tied to the presence of a clinical CSV inside the cohort directory.
# NOTE(review): Step 1 already loads `clinical_data` from the matrix file and Step 7
# extracts features from that in-memory frame; confirm this on-disk check is the
# intended availability signal.
clinical_data_path = os.path.join(in_cohort_dir, 'clinical_data.csv')
clinical_data_exists = os.path.exists(clinical_data_path)
is_trait_available = clinical_data_exists

# 2.1 Row indices of the trait / age / gender entries in the sample characteristics.
# They are only populated when the clinical CSV exists; otherwise all three stay None.
# NOTE(review): Step 7 passes gender_row=5 while this cell uses 2 - verify against the
# sample characteristics dictionary printed in Step 1.
if clinical_data_exists:
    trait_row, age_row, gender_row = 1, 3, 2
else:
    trait_row = age_row = gender_row = None

# 2.2 Data Type Conversion Functions (for documentation purposes)
def convert_trait(value: str) -> Union[int, None]:
    """Map a sample-characteristics entry to the binary Type 1 Diabetes label.

    Args:
        value: Raw cell text, optionally in ``"<field>: <value>"`` form; only the
            part after the first colon is inspected when a colon is present.

    Returns:
        1 when the text contains ``TYPE_1_DIABETES`` or ``T1D`` (case-insensitive),
        0 when it contains ``CONTROL`` (which also covers ``HEALTHY_CONTROL``),
        and None for missing values or any other diagnosis.
    """
    if pd.isna(value):
        return None

    # Keep only the payload after the first colon, when there is one
    label = value.split(":", 1)[1].strip() if ":" in value else value
    label = label.upper()

    # Case markers are tested first, so they win over control markers
    if any(marker in label for marker in ("TYPE_1_DIABETES", "T1D")):
        return 1
    if any(marker in label for marker in ("HEALTHY_CONTROL", "CONTROL")):
        return 0
    # Other diagnoses are not relevant for a Type 1 Diabetes study
    return None

def convert_age(value: str) -> Union[float, None]:
    """Convert a sample-characteristics entry to a numeric age.

    Args:
        value: Raw cell text, optionally in ``"<field>: <value>"`` form; only the
            part after the first colon is parsed when a colon is present.

    Returns:
        The parsed value as a float, or None when the value is missing or the
        text is not a valid float literal.
    """
    if pd.isna(value):
        return None

    # Extract the value after the colon
    if ":" in value:
        value = value.split(":", 1)[1].strip()

    try:
        return float(value)
    except (TypeError, ValueError):
        # Only parse failures mean "no age"; anything else (e.g. KeyboardInterrupt)
        # must propagate instead of being silently turned into None.
        return None

def convert_gender(value: str) -> Union[int, None]:
    """Map a sample-characteristics entry to a binary gender code.

    Args:
        value: Raw cell text, optionally in ``"<field>: <value>"`` form; only the
            part after the first colon is inspected when a colon is present.

    Returns:
        0 for female, 1 for male, None for missing or unrecognised values.
    """
    if pd.isna(value):
        return None

    # Keep only the payload after the first colon, when there is one
    label = (value.split(":", 1)[1].strip() if ":" in value else value).upper()

    # "FEMALE" contains "MALE", so the female test has to run first
    if "FEMALE" in label or label == "F":
        return 0
    if "MALE" in label or label == "M":
        return 1
    return None

# 3. Save Metadata - initial (non-final) filtering record for this cohort
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction - runs only when the clinical CSV exists and a trait row is set
if not clinical_data_exists or trait_row is None:
    print(f"Clinical data file not found at {clinical_data_path}. Skipping clinical feature extraction.")
else:
    # Note: this rebinds `clinical_data`, replacing the frame loaded in Step 1
    clinical_data = pd.read_csv(clinical_data_path, index_col=0)

    # Pull the trait / age / gender rows through their converters
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender,
    )

    # Show a preview of what was extracted
    preview = preview_df(selected_clinical_df)
    print("Preview of clinical features:")
    print(preview)

    # Make sure the output directory exists, then write the CSV
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical features saved to {out_clinical_data_file}")


### Step 3: Gene Data Extraction

In [None]:
# 1. Get the SOFT and matrix file paths again
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")

    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")
    # Later cells read `gene_data` unconditionally. Swallowing the error here would
    # only defer the failure to a NameError (fresh kernel) or silently reuse a stale
    # `gene_data` (re-run), so report it and stop at the real point of failure.
    raise


### Step 4: Gene Identifier Review

In [None]:
# Gene identifier review.
# The row identifiers are plain numbers (1, 2, 3, 9, 10, 12, etc.), not human gene
# symbols such as BRCA1, TP53 or IL6, so they are flagged as needing a mapping step
# before gene-level analysis.
# NOTE(review): Step 6 describes this dataset as using Entrez Gene IDs rather than
# probe IDs - confirm which it is. This flag is not read by any later cell in the
# visible notebook.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [None]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Inspect the SOFT file in a single streaming pass. The decompressed file is never
# held in memory as a whole, and it is read once instead of twice.
# NOTE(review): `gzip` and `re` are not imported explicitly in this notebook; they are
# presumably provided by `from tools.preprocess import *` - confirm.
platform_title_pattern = re.compile(r'^!Platform_title\s*=\s*(.+)$')
# Header lines that look like the start of an ID -> gene symbol table
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
context_line_count = 10  # lines echoed after each matching header to show the table layout

platform_sections = []
annotation_lines = []


def _collect_platform_title(text_line):
    """Record the platform title if `text_line` is a '!Platform_title = ...' line."""
    title_match = platform_title_pattern.match(text_line)
    if title_match:
        platform_sections.append(title_match.group(1))


with gzip.open(soft_file, 'rt') as f:
    for line in f:
        _collect_platform_title(line)
        if symbol_pattern.search(line):
            annotation_lines.append(line)
            # Collect the next few lines to see the annotation structure. They are
            # consumed from the same iterator, so they are still checked for a
            # platform title but not re-tested as table headers.
            for _ in range(context_line_count):
                following_line = next(f, '')
                _collect_platform_title(following_line)
                annotation_lines.append(following_line)

if platform_sections:
    print(f"Platform title found: {platform_sections[0]}")

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If we need an alternative source of mapping, check if there are any other annotation files in the cohort directory
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


### Step 6: Gene Identifier Mapping

In [None]:
# 1. Observe the annotation data to identify relevant columns
print("Column names in gene_annotation:", gene_annotation.columns.tolist())

# Report the dtypes on both sides; a str/int mismatch would make every lookup miss
print("Gene annotation ID column type:", gene_annotation['ID'].dtype)
print("Gene data index type:", gene_data.index.dtype)

# Compare identifiers as strings on BOTH sides so the overlap count is meaningful
# regardless of how each table was parsed
probe_ids = pd.Series(gene_data.index.astype(str))
ids_in_gene_data = set(probe_ids)
ids_in_annotation = set(gene_annotation['ID'].dropna().astype(str))
matching_ids = ids_in_gene_data.intersection(ids_in_annotation)
print(f"IDs in gene data: {len(ids_in_gene_data)}")
print(f"IDs in annotation: {len(ids_in_annotation)}")
print(f"Matching IDs: {len(matching_ids)}")

# Prepare a clean mapping table with string IDs for reliable matching.
# Missing values are dropped BEFORE the cast to str: casting first turns NaN into the
# literal string 'nan', which dropna() can no longer remove and which would then be
# treated as a real gene.
# NOTE(review): the 'Gene' column holds ENTREZ_GENE_ID values, not gene symbols -
# confirm that the downstream symbol normalization accepts them.
mapping_data = (
    gene_annotation[['ID', 'ENTREZ_GENE_ID']]
    .rename(columns={'ENTREZ_GENE_ID': 'Gene'})
    .dropna()
    .astype(str)
)

print(f"Prepared mapping data shape: {mapping_data.shape}")
print("Sample of mapping data:")
print(mapping_data.head())

# Direct probe ID -> gene ID lookup (if an ID is listed more than once, the last entry wins)
probe_to_gene = dict(zip(mapping_data['ID'], mapping_data['Gene']))

# Label every expression row with its gene and keep only the rows that have one
gene_labels = probe_ids.map(probe_to_gene)
has_gene = gene_labels.notna().to_numpy()
mapped_expression = gene_data.loc[has_gene]
mapped_genes = gene_labels[has_gene].to_numpy()

# Sum the probes of each gene in one vectorized pass. A gene whose probes contain a
# missing value for a sample gets NaN for that sample (same as accumulating with `+=`).
# The result has one row per gene that actually has expression data, in sorted,
# reproducible order.
probe_sums = mapped_expression.groupby(mapped_genes).sum()
has_missing = mapped_expression.isna().groupby(mapped_genes).any()
gene_expression = probe_sums.mask(has_missing)

# Check the result
print(f"Gene expression data shape after mapping: {gene_expression.shape}")
print("First few gene identifiers after mapping:")
if gene_expression.shape[0] > 0:
    print(gene_expression.index[:10])
else:
    print("No gene identifiers found after mapping.")

# Update is_gene_available flag based on the successful mapping: it is True only when
# at least one probe in gene_data was matched to a gene
is_gene_available = (gene_expression.shape[0] > 0)
print(f"Gene data available: {is_gene_available}")

# If gene data is available, use the mapped data for further processing
if is_gene_available:
    # Rebinds `gene_data`: re-running this cell without re-running Step 3 would try to
    # map the already-mapped gene IDs
    gene_data = gene_expression

    # Save the gene data to file for future use
    os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
    gene_data.to_csv(out_gene_data_file)
    print(f"Gene expression data saved to {out_gene_data_file}")
else:
    print("No gene data available after mapping. Skipping gene data save.")


### Step 7: Data Normalization and Linking

In [None]:
# 1. Normalize gene symbols in the gene expression data
# `gene_data` is rebound to the normalized frame, and the CSV written in Step 6 at the
# same path is overwritten.
# NOTE(review): after Step 6 the index holds ENTREZ_GENE_ID values cast to strings,
# not gene symbols - confirm normalize_gene_symbols_in_index handles such an index.
gene_data = normalize_gene_symbols_in_index(gene_data)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# Trait converter used for the final extraction. It replaces (shadows) the Step 2
# definition of the same name and follows the same rule: only Type 1 Diabetes counts
# as a case.
def convert_trait(value):
    """Convert trait value to binary (0 for control, 1 for T1D).

    Args:
        value: Raw cell text, optionally in ``"<field>: <value>"`` form; only the
            part after the first colon is inspected when a colon is present.

    Returns:
        1 for a Type 1 Diabetes label (``T1D``, ``TYPE_1_DIABETES``,
        ``Type 1 diabetes``, ``type I diabetes``; case-insensitive),
        0 for a label containing ``control`` or ``healthy``,
        None for missing values and every other diagnosis, including other
        kinds of diabetes such as ``TYPE_2_DIABETES``.
    """
    if pd.isna(value):
        return None

    # Extract the value after the colon if present
    if ':' in value:
        value = value.split(':', 1)[1].strip()

    # Normalise case and separators so TYPE_1_DIABETES, type-1-diabetes and
    # "Type 1  Diabetes" all compare equal
    label = ' '.join(value.lower().replace('_', ' ').replace('-', ' ').split())

    # A bare 'diabetes' substring is deliberately NOT enough: it would also match
    # TYPE_2_DIABETES and label those samples as Type 1 cases.
    if 't1d' in label or 'type 1 diabetes' in label or 'type i diabetes' in label:
        return 1
    if 'control' in label or 'healthy' in label:
        return 0
    return None

def convert_age(value):
    """Extract a numeric age from a sample-characteristics entry.

    Args:
        value: Raw cell text, optionally in ``"<field>: <value>"`` form; only the
            part after the first colon is searched when a colon is present.

    Returns:
        The first integer or decimal number found in the text, as a float, or
        None when the value is missing or contains no digits.
    """
    if pd.isna(value):
        return None

    # Work on the payload after the first colon, when there is one
    text = value.split(':', 1)[1].strip() if ':' in value else value

    # First run of digits, with an optional fractional part
    number = re.search(r'(\d+(\.\d+)?)', text)
    return float(number.group(1)) if number else None

def convert_gender(value):
    """Map a sample-characteristics entry to a binary gender code.

    Args:
        value: Raw cell text, optionally in ``"<field>: <value>"`` form; only the
            part after the first colon is inspected when a colon is present.

    Returns:
        0 for ``f`` / ``female``, 1 for ``m`` / ``male`` (case-insensitive, exact
        match), None for missing values or anything else.
    """
    if pd.isna(value):
        return None

    # Work on the payload after the first colon, when there is one
    token = (value.split(':', 1)[1].strip() if ':' in value else value).lower()

    # Exact-match lookup; unknown tokens fall through to None
    codes = {'f': 0, 'female': 0, 'm': 1, 'male': 1}
    return codes.get(token)

# Re-extract clinical features from the in-memory `clinical_data` frame using the
# converters defined above and hard-coded row indices.
# NOTE(review): Step 2 assigns gender_row = 2, but 5 is used here - verify the correct
# row against the sample characteristics dictionary printed in Step 1.
selected_clinical_df = geo_select_clinical_features(
    clinical_df=clinical_data,
    trait=trait,
    trait_row=1,   # Same trait row as Step 2
    convert_trait=convert_trait,
    age_row=3,     # Same age row as Step 2
    convert_age=convert_age,
    gender_row=5,  # Differs from Step 2 (which uses 2) - see note above
    convert_gender=convert_gender
)

# Save the processed clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
selected_clinical_df.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")

# 2. Link clinical and genetic data
linked_data = geo_link_clinical_genetic_data(selected_clinical_df, gene_data)
print(f"Linked data shape: {linked_data.shape}")
print("Linked data preview (first 5 rows, 5 columns):")
print(linked_data.iloc[:5, :5] if not linked_data.empty else "Linked data is empty")

# 3. Handle missing values
linked_data = handle_missing_values(linked_data, trait)
print(f"Data shape after handling missing values: {linked_data.shape}")

# 4. Check for bias in features
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Validate and save cohort information
# Gene availability comes from the flag computed in Step 6 instead of a hard-coded
# True, so a failed mapping is not recorded as usable gene data.
# NOTE(review): is_trait_available stays hard-coded because the Step 2 flag only
# reflects whether a clinical CSV exists on disk, not whether the trait was extracted.
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=True,
    is_biased=is_biased,
    df=linked_data,
    note="Dataset contains gene expression data from Type 1 Diabetes patients."
)

# 6. Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset is not usable for analysis. No linked data file saved.")