In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed
trait = "Type_1_Diabetes"
cohort = "GSE193273"

# Input paths (built from the trait / cohort identifiers above)
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths (all under the per-trait preprocess directory)
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [None]:
from tools.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes handed to the loader: the first list selects series-level background
# text, the second selects the per-sample accession and characteristics lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both artefacts, each under its own heading, for manual inspection.
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
# Step 1: Assess gene expression data availability
# Judged from the background information printed in Step 1: this is a gene expression
# study on PBMC, and the Series_overall_design mentions "Whole genome expression analysis".
is_gene_available = True

# Step 2: Identify variable availability and create conversion functions

# 2.1 & 2.2: Trait data
# In the sample characteristics dictionary printed in Step 1, row 1 holds the diagnosis
# field, with the values 'healthy control' and 'recent-onset T1D'.
# trait_row is the row index of that field in the clinical dataframe.
trait_row = 1

def convert_trait(value):
    """Map a raw 'diagnosis: ...' characteristic string to a binary trait label.

    Args:
        value: One cell of the sample-characteristics table, e.g.
            'diagnosis: recent-onset T1D'. Non-string cells (NaN, None, numbers)
            are tolerated.

    Returns:
        1 for recent-onset Type 1 Diabetes, 0 for a healthy control, and None when
        the cell is not a string, lacks the 'diagnosis:' prefix, or carries an
        unrecognised diagnosis.
    """
    # Missing cells arrive as NaN/None; a substring test on them would raise TypeError.
    if not isinstance(value, str) or 'diagnosis:' not in value:
        return None

    diagnosis = value.split('diagnosis:')[1].strip().lower()
    if 'recent-onset t1d' in diagnosis:
        return 1  # Has Type 1 Diabetes
    if 'healthy control' in diagnosis:
        return 0  # Doesn't have Type 1 Diabetes
    return None

# No row of the sample characteristics carries age for this cohort
age_row = None

def convert_age(value):
    """Placeholder age converter.

    Exists only because the pipeline expects a converter to be defined; with
    age_row set to None it is not expected to be called. Always returns None.
    """
    return None

# No row of the sample characteristics carries gender for this cohort
gender_row = None

def convert_gender(value):
    """Placeholder gender converter.

    Exists only because the pipeline expects a converter to be defined; with
    gender_row set to None it is not expected to be called. Always returns None.
    """
    return None

# Step 3: Save metadata
# The trait counts as available exactly when a trait row was identified above.
is_trait_available = trait_row is not None
# Record the initial (non-final) availability flags for this cohort at json_path.
# NOTE(review): what is_final=False changes is decided inside
# validate_and_save_cohort_info, which is not visible here — confirm against the helper.
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# Step 4: Extract clinical features if trait_row is not None
if trait_row is not None:
    try:
        # Use the per-sample clinical table loaded in Step 1 as-is. It must not be rebuilt
        # from the dictionary of unique values: that dictionary has one entry per DISTINCT
        # value rather than one per sample, so a frame built from it loses the sample
        # columns and would also overwrite `clinical_data` for every later cell.
        clinical_features = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            age_row=age_row,
            convert_age=convert_age,
            gender_row=gender_row,
            convert_gender=convert_gender
        )

        # Preview the resulting dataframe
        preview = preview_df(clinical_features)
        print("Preview of clinical features:")
        print(preview)

        # Save the clinical features to a CSV file (creating the directory if needed)
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        clinical_features.to_csv(out_clinical_data_file)
        print(f"Clinical features saved to {out_clinical_data_file}")

    except Exception as e:
        # Best-effort by design: report the problem and let the notebook continue.
        print(f"Error processing clinical data: {e}")
        print("Continuing execution despite clinical data processing error.")


### Step 3: Gene Data Extraction

In [None]:
# 1. Get the SOFT and matrix file paths again (same helper call as in Step 1)
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    # NOTE(review): the error is only printed. If extraction fails, `gene_data` is never
    # assigned here, and the later cells that read it will fail with a NameError instead.
    print(f"Error extracting gene data: {e}")


### Step 4: Gene Identifier Review

In [None]:
# Examining the gene identifiers shown in the output
# The identifiers appear to be numeric values ('4', '5', '6', etc.) which are not human gene symbols
# These are likely probe IDs or some other numeric identifiers that need to be mapped to gene symbols

# Conclusion based on biomedical knowledge:
requires_gene_mapping = True


### Step 5: Gene Annotation

In [None]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Look for the platform title in the SOFT file. The file is streamed line by line and the
# scan stops at the first hit, instead of decompressing the whole file into one string
# just to locate a single header line.
platform_title_pattern = re.compile(r'^!Platform_title\s*=\s*(.+)$')
platform_sections = []
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        title_match = platform_title_pattern.match(line)
        if title_match:
            platform_sections.append(title_match.group(1))
            break
if platform_sections:
    print(f"Platform title found: {platform_sections[0]}")

# Try to extract more annotation data by reading directly from the SOFT file
# Look for lines that might contain gene symbol mappings
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
annotation_lines = []
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        if symbol_pattern.search(line):
            annotation_lines.append(line)
            # Collect the next few lines to see the annotation structure.
            # next(f, '') advances the same file iterator, so these lines are not
            # re-examined by the outer loop; '' is appended once the file is exhausted.
            for _ in range(10):
                annotation_lines.append(next(f, ''))

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If we need an alternative source of mapping, check if there are any other annotation files in the cohort directory
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


### Step 6: Gene Identifier Mapping

In [None]:
# Map probe-level expression rows to gene symbols, normalize the symbols, and save the result.
# Observations carried over from the previous cells' outputs:
# - Gene expression data has numeric index values like "4", "5"...
# - Gene annotation dataframe has "ID" that contains these numeric values and "GENE_SYMBOL" with gene names

# 1. Identify relevant columns in the gene annotation for mapping
# The "ID" column contains the probe identifiers that match the index of the gene expression data
# The "GENE_SYMBOL" column contains the gene symbols we want to map to
probe_col = "ID"
gene_col = "GENE_SYMBOL"

# 2. Extract mapping dataframe with the two relevant columns
mapping_df = get_gene_mapping(gene_annotation, probe_col, gene_col)
print(f"Created mapping dataframe with shape: {mapping_df.shape}")
print("Sample of mapping data:")
print(mapping_df.head())

# 3. Apply gene mapping to convert probe-level measurements to gene expression data
# NOTE: gene_data is reassigned here, so re-running this cell without first re-running
# Step 3 would feed the already-mapped frame back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Created gene expression dataframe with shape: {gene_data.shape}")
print("Sample of gene expression data:")
print(gene_data.head())

# Normalize gene symbols (to handle synonyms and variant representations)
gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"After normalizing gene symbols, gene data shape: {gene_data.shape}")
print("Sample of normalized gene expression data:")
print(gene_data.head())

# Save the gene data to a CSV file (creating the output directory if it does not exist)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


### Step 7: Data Normalization and Linking

In [None]:
# 1. Normalize gene symbols in the gene expression data
# Note: gene_data was already normalized in Step 6, so we don't need to normalize it again

# 2. Link clinical and genetic data
# First, let's examine our data to understand the structure.
# `clinical_features` is the frame produced by geo_select_clinical_features in Step 2; the
# name `selected_clinical_df` that this cell used before is never assigned in the notebook.
print("Clinical data structure:")
print(clinical_data.head())
print("\nSelected clinical data:")
print(clinical_features.head())
print("\nGene data columns (first 5):")
print(gene_data.columns[:5])

# trait_row selects a ROW of the clinical table, so samples are its columns. Match samples
# by column label instead of pairing two lists by position, and keep gene_data's column
# order so the result is deterministic (a set intersection has no stable order).
# NOTE(review): assumes the clinical column labels are the same GSM accessions that label
# gene_data's columns — confirm against the helper's output.
clinical_sample_ids = set(clinical_features.columns.astype(str))
common_samples = [sample for sample in gene_data.columns if sample in clinical_sample_ids]
print(f"\nFound {len(common_samples)} common samples between clinical and gene data")

if common_samples and len(clinical_features) > 0:
    # The first row of clinical_features holds the trait values, one per sample
    trait_by_sample = pd.Series(
        clinical_features.iloc[0].to_numpy(),
        index=clinical_features.columns.astype(str),
        name=trait,
    )

    # Clinical dataframe with sample IDs as index and the trait as its only column
    new_clinical_df = trait_by_sample.loc[common_samples].to_frame()
    print("\nNew clinical dataframe:")
    print(new_clinical_df.head())

    # Transpose gene data to have samples as rows and genes as columns, then attach the
    # trait column; the assignment aligns on the shared sample index.
    linked_data = gene_data[common_samples].T
    linked_data[trait] = new_clinical_df[trait]

    print(f"\nLinked data shape: {linked_data.shape}")
    print("Linked data preview (first 5 rows, 5 columns):")
    print(linked_data.iloc[:5, :5])

else:
    print("ERROR: Could not find GSM IDs in clinical data!")
    # Create a dummy linked_data as a fallback
    linked_data = pd.DataFrame({trait: []})

# 3. Handle missing values
linked_data = handle_missing_values(linked_data, trait)
print(f"Data shape after handling missing values: {linked_data.shape}")

# 4. Check for bias in features
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Validate and save cohort information
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True,
    is_biased=is_biased,
    df=linked_data,
    note="Dataset contains gene expression data from peripheral blood mononuclear cells of recent-onset Type 1 Diabetes patients and healthy controls."
)

# 6. Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset is not usable for analysis. No linked data file saved.")

### Step 8: Data Normalization and Linking

In [None]:
# 1. Normalize gene symbols in the gene expression data
# Note: gene_data was already normalized in Step 6, so we don't need to normalize it again

# 2. Link clinical and genetic data
# Load the clinical data file that was saved in a previous step
try:
    clinical_features = pd.read_csv(out_clinical_data_file, index_col=0)
    print("Loaded clinical features from file")
    print(clinical_features.head())
except Exception as e:
    print(f"Error loading clinical features: {e}")
    clinical_features = None

# Re-extract from the matrix file when the saved file could not be read, or when none of
# its column labels is a sample present in gene_data (such a file cannot be aligned).
needs_extraction = clinical_features is None or not (
    set(clinical_features.columns.astype(str)) & set(gene_data.columns)
)
if needs_extraction:
    print("Extracting clinical features from scratch")

    # Get the SOFT and matrix file paths
    soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

    # Get background info and clinical data
    background_info, clinical_data = get_background_and_clinical_data(matrix_file)

    # Reuse the trait row and converter defined in Step 2 rather than redefining them here
    clinical_features = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=None,  # Age row not available
        convert_age=None,
        gender_row=None,  # Gender row not available
        convert_gender=None
    )

    # Save the clinical data
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_features.to_csv(out_clinical_data_file)
    print(f"Clinical features saved to {out_clinical_data_file}")

# Match samples by label, in gene_data's column order (deterministic row order).
# NOTE(review): assumes the clinical column labels are the same GSM accessions that label
# gene_data's columns — confirm against the helper's output.
clinical_sample_ids = set(clinical_features.columns.astype(str))
common_samples = [sample for sample in gene_data.columns if sample in clinical_sample_ids]
print(f"Found {len(common_samples)} common samples")

# Never fall back to pairing trait values with samples by position: when the labels do not
# match, that assigns diagnoses to arbitrary samples and silently corrupts the output.
if len(clinical_features) == 0 or not common_samples:
    raise ValueError(
        "No sample IDs are shared between the clinical features and the gene expression "
        "data; refusing to assign trait labels by position."
    )

# The first row of clinical_features holds the trait values, one per sample
trait_by_sample = pd.Series(
    clinical_features.iloc[0].to_numpy(),
    index=clinical_features.columns.astype(str),
    name=trait,
)
print(f"Trait values: {trait_by_sample.loc[common_samples].to_numpy()}")

# Add gene expression data (genes as columns)
gene_data_t = gene_data.T.loc[common_samples]
print(f"Transposed gene data shape: {gene_data_t.shape}")

# Combine trait and expression in one concat (trait first, then genes), aligned on the
# sample index, instead of inserting gene columns one at a time.
linked_data = pd.concat([trait_by_sample.loc[common_samples].to_frame(), gene_data_t], axis=1)

print(f"Linked data final shape: {linked_data.shape}")
print(linked_data.head(3))

# 3. Handle missing values
linked_data = handle_missing_values(linked_data, trait)
print(f"Data shape after handling missing values: {linked_data.shape}")

# 4. Check for bias in features
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Validate and save cohort information
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True,
    is_biased=is_biased,
    df=linked_data,
    note="Dataset contains gene expression data from peripheral blood mononuclear cells of recent-onset Type 1 Diabetes patients and healthy controls."
)

# 6. Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset is not usable for analysis. No linked data file saved.")