In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the cohort accession string.
# Both values are passed to the project helpers in later cells.
trait = "Type_1_Diabetes"
cohort = "GSE123086"

# Input paths (relative to the notebook's working directory).
# in_cohort_dir is the directory searched for the SOFT and matrix files.
in_trait_dir = "../../input/GEO/Type_1_Diabetes"
in_cohort_dir = "../../input/GEO/Type_1_Diabetes/GSE123086"

# Output paths.
# out_gene_data_file and out_clinical_data_file are written by later cells;
# json_path is handed to validate_and_save_cohort_info as `info_path`.
# NOTE(review): out_data_file is not used by any cell visible here — presumably
# it is the target for the final linked dataset; confirm.
out_data_file = "../../output/preprocess/Type_1_Diabetes/GSE123086.csv"
out_gene_data_file = "../../output/preprocess/Type_1_Diabetes/gene_data/GSE123086.csv"
out_clinical_data_file = "../../output/preprocess/Type_1_Diabetes/clinical_data/GSE123086.csv"
json_path = "../../output/preprocess/Type_1_Diabetes/cohort_info.json"


### Step 1: Initial Data Loading

In [None]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
# (both names are reused by the gene-data and annotation cells further down).
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefixes select which '!Series_*' / '!Sample_*' lines of the matrix file are collected.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# NOTE(review): presumably maps each characteristics row index to its list of
# unique values — confirm against get_unique_values_by_row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
import pandas as pd
import os
import numpy as np
import re

# 1. Gene Expression Data Availability
# Based on the background information, this dataset has gene expression microarray data
is_gene_available = True

# 2.1 Data Availability
# Each *_row value is the index of the sample-characteristics row that is
# handed to geo_select_clinical_features for that variable.
# Trait is recorded in key 1
trait_row = 1
# Age is recorded in key 3 and key 4
# NOTE(review): only key 3 is selected, so samples whose age sits in key 4
# will get no age value from a single-row extraction — confirm this is acceptable.
age_row = 3
# Gender is recorded in key 2 and key 3
# NOTE(review): only key 2 is selected, so samples whose 'Sex:' entry sits in
# key 3 will get no gender value — confirm this is acceptable.
gender_row = 2

# 2.2 Data Type Conversion Functions
def convert_trait(value):
    """Map a 'primary diagnosis: ...' entry to a binary trait label.

    Returns 1 for TYPE_1_DIABETES, 0 for HEALTHY_CONTROL, and None for
    missing values, entries without the 'primary diagnosis:' prefix, and
    every other diagnosis (those samples are not relevant for this trait).
    The diagnosis text is compared case-insensitively after trimming.
    """
    if pd.isna(value):
        return None

    found = re.search(r'primary diagnosis:\s*(.*)', value)
    if found is None:
        return None

    label = found.group(1).strip().upper()
    # Anything outside the two recognised labels falls through to None.
    return {"TYPE_1_DIABETES": 1, "HEALTHY_CONTROL": 0}.get(label)

def convert_age(value):
    """Extract the integer age from an 'age: N' entry.

    Returns the leading run of digits after 'age:' as an int, or None when
    the value is missing or is not an age entry.
    """
    if pd.isna(value):
        return None

    found = re.search(r'age:\s*(\d+)', value)
    return int(found.group(1)) if found else None

def convert_gender(value):
    """Map a 'Sex: ...' entry to a binary gender label.

    Returns 0 for female and 1 for male (compared case-insensitively after
    trimming). Missing values, entries without the 'Sex:' prefix, and any
    other text yield None.
    """
    if pd.isna(value):
        return None

    found = re.search(r'Sex:\s*(.*)', value)
    if not found:
        return None

    label = found.group(1).strip().upper()
    return {"FEMALE": 0, "MALE": 1}.get(label)

# 3. Save Metadata
# Trait data counts as available when a characteristics row was identified for it.
is_trait_available = trait_row is not None

# Initial (non-final) validation: record gene/trait availability for this cohort.
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
if trait_row is not None:
    # Pass the per-sample clinical table loaded in Step 1 (`clinical_data`).
    # The previous version passed a hand-typed dict of *unique* values per row;
    # that carries no per-sample information and is not the table that the
    # `clinical_df` parameter is given elsewhere in this notebook.
    # NOTE(review): per the row-selection comments above, age and gender are
    # each spread over two characteristics rows, so extracting a single row
    # leaves some samples without age/gender — confirm or coalesce the rows.
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Preview the extracted clinical features
    preview = preview_df(selected_clinical_df)
    print(f"Clinical data preview: {preview}")

    # Create the output directory if it doesn't exist, then persist the features
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical features saved to {out_clinical_data_file}")


### Step 3: Dataset Analysis and Clinical Feature Extraction

In [None]:
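# NOTE: This cell holds a pasted assistant reply, not vetted notebook code.
# Original preamble: "I'll fix the syntax and provide a complete, executable solution for the current step:"
# The reply is cut off mid-call at the end of this cell, so the code below is incomplete.

# ```python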
# 1. Review the dataset information to determine if gene expression data is available
import pandas as pd
import os
import json

# First, check what files are actually available in the input directory
print("Available files in input directory:")
print(os.listdir(in_cohort_dir))

# Load sample characteristics if available
# (stays an empty dict when the JSON file is absent, in which case the
# row-detection loops below never run and all *_row values remain None)
sample_characteristics = {}
sample_characteristics_path = os.path.join(in_cohort_dir, "sample_characteristics.json")
if os.path.exists(sample_characteristics_path):
    with open(sample_characteristics_path, 'r') as f:
        sample_characteristics = json.load(f)
    print("Sample characteristics loaded successfully")
else:
    print("Sample characteristics file not found")

# Load metadata if available for more context
metadata = {}
metadata_path = os.path.join(in_cohort_dir, "metadata.json")
if os.path.exists(metadata_path):
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
    print("Metadata loaded successfully")
else:
    print("Metadata file not found")

# Try to load clinical data if available
# NOTE: this rebinds `clinical_data`, replacing whatever an earlier cell stored
# under that name (None if none of the candidate files exists).
clinical_data = None
for possible_filename in ["clinical_data.csv", "clinical.csv", "characteristics.csv"]:
    clinical_data_path = os.path.join(in_cohort_dir, possible_filename)
    if os.path.exists(clinical_data_path):
        clinical_data = pd.read_csv(clinical_data_path)
        print(f"Clinical data loaded from {possible_filename}")
        break

# If clinical data wasn't found in CSV, try to find it in other formats
if clinical_data is None and 'clinical_data.txt' in os.listdir(in_cohort_dir):
    clinical_data_path = os.path.join(in_cohort_dir, "clinical_data.txt")
    clinical_data = pd.read_csv(clinical_data_path, sep='\t')
    print("Clinical data loaded from clinical_data.txt")

# Check if any expression data files exist (which would indicate gene expression data availability)
# Any file ending in .txt/.csv, or containing 'expression'/'matrix' in its name,
# counts, provided its name does not start with 'clinical' or 'metadata'.
is_gene_available = any(f.endswith('.txt') or f.endswith('.csv') or 'expression' in f or 'matrix' in f 
                        for f in os.listdir(in_cohort_dir) if not f.startswith('clinical') and not f.startswith('metadata'))

# Print sample characteristics to analyze
print("Sample Characteristics Keys and Values:")
for key, values in sample_characteristics.items():
    print(f"Key {key}: {values}")
    print(f"Unique values: {set(values)}")

# Identifying rows for trait, age, and gender
trait_row = None
age_row = None
gender_row = None

# Inspect each key in sample characteristics to identify relevant rows
# The if/elif chain assigns each key to at most one variable (trait wins over
# age, age over gender); a later matching key overwrites an earlier one.
for key, values in sample_characteristics.items():
    unique_values = set(values)
    # NOTE: str_values is computed but never read below.
    str_values = [str(v).lower() for v in unique_values]
    
    # Look for disease status/trait indicators
    if any(['diabetes' in str(v).lower() for v in unique_values]) or \
       any(['t1d' in str(v).lower() for v in unique_values]) or \
       any(['type 1' in str(v).lower() for v in unique_values]) or \
       any(['control' in str(v).lower() for v in unique_values]) or \
       any(['case' in str(v).lower() for v in unique_values]) or \
       any(['patient' in str(v).lower() for v in unique_values]) or \
       any(['status' in str(v).lower() for v in unique_values]):
        trait_row = key
        print(f"Found trait row at key {key} with values: {unique_values}")
    
    # Look for age indicators
    # (the last test also matches any value that is all digits once dots are removed)
    elif any(['age' in str(v).lower() for v in unique_values]) or \
         any(['years' in str(v).lower() for v in unique_values]) or \
         any([str(v).replace('.', '').isdigit() for v in unique_values if str(v)]):
        age_row = key
        print(f"Found age row at key {key} with values: {unique_values}")
    
    # Look for gender indicators
    elif any(['gender' in str(v).lower() for v in unique_values]) or \
         any(['sex' in str(v).lower() for v in unique_values]) or \
         any(['male' in str(v).lower() for v in unique_values]) or \
         any(['female' in str(v).lower() for v in unique_values]) or \
         any(['m' == str(v).lower() or 'f' == str(v).lower() for v in unique_values]):
        gender_row = key
        print(f"Found gender row at key {key} with values: {unique_values}")

# Define conversion functions for trait, age, and gender
def convert_trait(value):
    """Classify a trait entry as control (0) or case (1) by keyword.

    For strings containing ':', only the text after the first colon is
    inspected. Matching is case-insensitive substring search; control
    keywords are checked before case keywords. Returns None for None input
    or when no keyword matches.
    """
    if value is None:
        return None

    # Keep only the part after the first colon, when there is one
    if isinstance(value, str) and ':' in value:
        value = value.split(':', 1)[1].strip()

    text = str(value).lower()

    control_words = ('control', 'healthy', 'normal', 'non-diabetic', 'negative')
    case_words = ('case', 'patient', 'diabetes', 't1d', 'type 1', 'positive')

    if any(word in text for word in control_words):
        return 0
    if any(word in text for word in case_words):
        return 1
    return None

def convert_age(value):
    """Parse an age entry into a float.

    For strings containing ':', only the text after the first colon is used.
    If the text mentions 'year', every digit and '.' character is joined and
    parsed; otherwise the value is passed to float() directly. Returns None
    for None input or when parsing fails.
    """
    if value is None:
        return None

    # Keep only the part after the first colon, when there is one
    if isinstance(value, str) and ':' in value:
        _, _, value = value.partition(':')
        value = value.strip()

    try:
        text = str(value)
        if 'year' not in text.lower():
            # Plain numeric value
            return float(value)
        # "X years"-style value: keep digits and dots only
        numeric_chars = [ch for ch in text if ch.isdigit() or ch == '.']
        return float(''.join(numeric_chars)) if numeric_chars else None
    except (ValueError, TypeError):
        return None

def convert_gender(value):
    """Classify a gender entry as female (0) or male (1) by keyword.

    For strings containing ':', only the text after the first colon is
    inspected. Matching is case-insensitive; the female test runs first, so
    'female' and 'woman' are never classified as male. Returns None for None
    input or when nothing matches.
    """
    if value is None:
        return None

    # Keep only the part after the first colon, when there is one
    if isinstance(value, str) and ':' in value:
        value = value.split(':', 1)[1].strip()

    label = str(value).lower()

    if label == 'f' or any(token in label for token in ('female', 'woman')):
        return 0
    if label == 'm' or any(token in label for token in ('male', 'man')):
        return 1
    return None

# Check if trait data is available
# (True only when the keyword heuristics in this cell picked a trait row)
is_trait_available = trait_row is not None

# Report what this cell detected
print(f"Gene data available: {is_gene_available}")
print(f"Trait data available: {is_trait_available}")
print(f"Trait row: {trait_row}")
print(f"Age row: {age_row}")
print(f"Gender row: {gender_row}")

# Validate and save cohort info
# NOTE(review): this writes to the same info_path as the earlier
# validate_and_save_cohort_info call in this notebook, using the values
# detected in this cell — confirm the overwrite is intended.
is_usable = validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# If trait data is available and clinical data is loaded, extract clinical features
if is_trait_available and clinical_data is not None:
    clinical_features = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert


### Step 4: Dataset Analysis and Clinical Feature Extraction

In [None]:
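# NOTE: This cell holds a pasted assistant reply, not vetted notebook code.
# Original preamble: "I'll implement a more robust code for the current step that handles the data exploration better."
# The reply is cut off in the middle of convert_gender at the end of this cell, so the code below is incomplete.

# ```python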
# Load the data files
import pandas as pd
import os
import json
from typing import Optional, Callable, Dict, Any, List
import re
import glob

# Check if the cohort directory exists
# (diagnostic listing only; nothing below is skipped when the directory is missing)
if not os.path.exists(in_cohort_dir):
    print(f"Error: Cohort directory does not exist: {in_cohort_dir}")
    print(f"Checking parent directory {in_trait_dir}")
    if os.path.exists(in_trait_dir):
        print(f"Files in {in_trait_dir}:")
        print(os.listdir(in_trait_dir))
    else:
        print(f"Parent directory also does not exist: {in_trait_dir}")
else:
    print(f"Files in {in_cohort_dir}:")
    print(os.listdir(in_cohort_dir))

# Search for clinical data files with various possible names
clinical_file_patterns = [
    os.path.join(in_cohort_dir, "*characteristics*.csv"),
    os.path.join(in_cohort_dir, "*clinical*.csv"),
    os.path.join(in_cohort_dir, "*phenotype*.csv"),
    os.path.join(in_cohort_dir, "*metadata*.csv")
]

clinical_files = []
for pattern in clinical_file_patterns:
    clinical_files.extend(glob.glob(pattern))

# NOTE: this rebinds `clinical_data`; only the first glob hit is loaded, and
# the name is None when no pattern matches.
clinical_data = None
if clinical_files:
    clinical_data_path = clinical_files[0]
    print(f"Loading clinical data from: {clinical_data_path}")
    clinical_data = pd.read_csv(clinical_data_path)
    print("Clinical data shape:", clinical_data.shape)
else:
    print("No clinical data files found.")

# Search for background information
background_patterns = [
    os.path.join(in_cohort_dir, "*background*.txt"),
    os.path.join(in_cohort_dir, "*info*.txt"),
    os.path.join(in_cohort_dir, "*.soft"),
    os.path.join(in_cohort_dir, "README*")
]

background_files = []
for pattern in background_patterns:
    background_files.extend(glob.glob(pattern))

# NOTE: this rebinds `background_info` to the full text of the first match
# (or to an empty string when nothing matches).
background_info = ""
if background_files:
    background_path = background_files[0]
    print(f"Loading background information from: {background_path}")
    with open(background_path, 'r') as file:
        background_info = file.read()
    print("Background information snippet:")
    # Show at most the first 500 characters, with an ellipsis when truncated
    print(background_info[:500] + "..." if len(background_info) > 500 else background_info)
else:
    print("No background information files found.")

# Search for gene expression data files
gene_file_patterns = [
    os.path.join(in_cohort_dir, "*expression*.csv"),
    os.path.join(in_cohort_dir, "*matrix*.csv"),
    os.path.join(in_cohort_dir, "*gene*.csv"),
    os.path.join(in_cohort_dir, "*.txt") # Some gene expression files might be .txt
]

gene_files = []
for pattern in gene_file_patterns:
    gene_files.extend(glob.glob(pattern))

# Exclude files that are clearly not gene expression data
# (the exclusion test is applied to the lower-cased full path, not just the file name)
gene_files = [f for f in gene_files if not any(exclude in f.lower() for exclude in 
                                               ["background", "readme", "clinical", "characteristics"])]

# Gene data is considered available as soon as any candidate file remains
is_gene_available = len(gene_files) > 0
if gene_files:
    print(f"Potential gene expression files found: {gene_files}")
else:
    print("No gene expression files found.")

# Initialize trait, age, and gender row identifiers
trait_row = None
age_row = None
gender_row = None

# Analyze clinical data if available
# NOTE(review): the values stored in trait_row/age_row/gender_row here are
# *column* positions of clinical_data, not row indices — confirm this matches
# what the consumer of these variables expects.
if clinical_data is not None:
    print("\nAnalyzing clinical data to identify trait, age, and gender information:")
    
    # First check column names for clues
    # (independent `if`s: one column may be assigned to several variables, and
    # a later matching column overwrites an earlier one)
    for i, col in enumerate(clinical_data.columns):
        col_lower = str(col).lower()
        
        # Look for trait-related column names
        if any(term in col_lower for term in ['diabetes', 't1d', 'condition', 'disease', 'status', 'diagnosis']):
            trait_row = i
            print(f"Found potential trait column: {col} (index {i})")
            
        # Look for age-related column names
        if any(term in col_lower for term in ['age', 'year']):
            age_row = i
            print(f"Found potential age column: {col} (index {i})")
            
        # Look for gender-related column names
        if any(term in col_lower for term in ['gender', 'sex']):
            gender_row = i
            print(f"Found potential gender column: {col} (index {i})")
    
    # If we couldn't identify columns from names, analyze content
    # (this loop always runs and prints every column's unique values; only the
    # assignments are guarded by the `is None` checks)
    for idx, column in enumerate(clinical_data.columns):
        unique_vals = clinical_data[column].dropna().unique()
        
        if len(unique_vals) <= 0:
            continue  # Skip empty columns
            
        print(f"\nColumn {idx} unique values:")
        print(unique_vals)
        
        # Convert to string for analysis
        value_str = ' '.join([str(v) for v in unique_vals]).lower()
        
        # Identify trait row (Type 1 Diabetes related)
        if trait_row is None:
            diabetes_terms = ['diabetes', 't1d', 'type 1', 'diabetic', 'case', 'control', 'patient', 'healthy']
            if any(term in value_str for term in diabetes_terms):
                # Make sure it's not just one value (which would be useless for analysis)
                if len(unique_vals) > 1:
                    trait_row = idx
                    print(f"Identified trait row at index {idx}")
        
        # Identify age row
        # ('age' anywhere in the joined values, or a number followed by 'year'/'yr')
        if age_row is None and ('age' in value_str or any(re.search(r'\d+\s*(?:year|yr)', str(v), re.IGNORECASE) for v in unique_vals)):
            age_row = idx
            print(f"Identified age row at index {idx}")
        
        # Identify gender row
        if gender_row is None and any(term in value_str for term in ['male', 'female', 'gender', 'sex']):
            gender_row = idx
            print(f"Identified gender row at index {idx}")

# Define conversion functions for each variable
def convert_trait(value):
    """Classify a trait value as control (0) or case (1) by keyword.

    The value is lower-cased; if it contains ':', only the text after the
    first colon is inspected. Control keywords are checked before case
    keywords. Returns None for missing values or when no keyword matches.
    """
    if pd.isna(value):
        return None

    text = str(value).lower()
    _, colon, tail = text.partition(':')
    if colon:
        text = tail.strip()

    control_terms = ['control', 'healthy', 'normal', 'non-diabetic', 'nondiabetic', 'negative']
    case_terms = ['diabetes', 't1d', 'patient', 'case', 'type 1', 'positive', 'diabetic']

    for term in control_terms:
        if term in text:
            return 0
    for term in case_terms:
        if term in text:
            return 1
    return None

def convert_age(value):
    """Extract the first number in an age value as a float.

    If the string form contains ':', only the text after the first colon is
    searched. Returns None for missing values or when no number is present.
    """
    if pd.isna(value):
        return None

    text = str(value)
    if ':' in text:
        text = text.split(':', 1)[1].strip()

    # First integer or decimal number in the remaining text
    number = re.search(r'(\d+(?:\.\d+)?)', text)
    return float(number.group(1)) if number else None

def convert_gender(value):
    """Convert gender value to binary (0=female, 1=male)"""
    if pd.isna(value):
        return None
    
    value =


### Step 5: Gene Data Extraction

In [None]:
# 1. Get the SOFT and matrix file paths again 
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
# NOTE: any exception here is only printed; in that case `gene_data` is not
# (re)bound by this cell, so later cells that read it will fail or see a
# stale value from a previous run.
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")


### Step 6: Gene Identifier Review

In [None]:
# The gene identifiers shown are just numeric values (1, 2, 3, 9, 10, etc.)
# These appear to be Entrez Gene IDs/NCBI Gene IDs rather than human gene symbols
# Standard human gene symbols would be alphanumeric like GAPDH, TP53, TNF, etc.
# Therefore, we need to map these IDs to human gene symbols for better interpretability

# Manual decision based on inspecting the identifiers printed by the previous cell.
requires_gene_mapping = True


### Step 7: Gene Annotation

In [None]:
# Imported here so this cell does not depend on `gzip`/`re` leaking in from the
# wildcard import or from an earlier cell.
import gzip
import re

# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Scan the SOFT file once, line by line, collecting:
#   * every platform title ('!Platform_title = ...'), and
#   * each header line that looks like an ID/symbol table header, together
#     with the lines that follow it, to show the annotation structure.
# Streaming avoids holding the whole decompressed file in memory and avoids
# decompressing it twice.
platform_title_pattern = re.compile(r'!Platform_title\s*=\s*(.+)$')
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
context_lines_after_header = 10

platform_sections = []
annotation_lines = []
remaining_context = 0
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        title_match = platform_title_pattern.match(line)
        if title_match:
            platform_sections.append(title_match.group(1))

        if remaining_context > 0:
            # Still inside the context window that follows a header line
            annotation_lines.append(line)
            remaining_context -= 1
        elif symbol_pattern.search(line):
            annotation_lines.append(line)
            remaining_context = context_lines_after_header

if platform_sections:
    print(f"Platform title found: {platform_sections[0]}")

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If we need an alternative source of mapping, check if there are any other annotation files in the cohort directory
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


### Step 8: Gene Identifier Mapping

In [None]:
# Imported here so this cell does not depend on `gzip`/`re` leaking in from the
# wildcard import or from an earlier cell.
import gzip
import re

# 1. Identify which columns in the gene annotation contain the identifiers and symbols
# From the preview, we can see the gene annotation has ID and ENTREZ_GENE_ID columns
# The gene expression data uses numeric IDs as shown in a previous step

# Based on the platform title "Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray 039381 (Entrez Gene ID version)"
# This confirms we need to map Entrez Gene IDs to gene symbols
# However, I don't see gene symbols in the current annotation preview

# Let's try to find gene symbols from the full annotation dataframe
print("Columns in gene annotation:")
print(gene_annotation.columns)

# Check if there's a column that might contain gene symbols
symbol_columns = [col for col in gene_annotation.columns if 'SYMBOL' in col.upper() or 'GENE' in col.upper()]
print(f"Potential symbol columns: {symbol_columns}")

# If we don't find gene symbols, we need to extract additional annotation from the SOFT file
# Let's search specifically for gene symbols in the SOFT file
gene_symbols = {}
entrez_pattern = re.compile(r'^(\d+)\s+.*?(?:GENE_SYMBOL|GENE NAME)[\s=]+([\w\-\.]+)', re.IGNORECASE)
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        match = entrez_pattern.search(line)
        if match:
            entrez_id, symbol = match.groups()
            gene_symbols[entrez_id] = symbol

# If we still don't have gene symbols, use the extract_human_gene_symbols function on any text descriptions
if not gene_symbols and 'DESCRIPTION' in gene_annotation.columns:
    for _, row in gene_annotation.iterrows():
        entrez_id = str(row['ID'])
        description = str(row['DESCRIPTION'])
        symbols = extract_human_gene_symbols(description)
        if symbols:
            gene_symbols[entrez_id] = symbols[0]

# If we have found gene symbols, create a mapping dataframe
if gene_symbols:
    print(f"Found {len(gene_symbols)} gene symbol mappings")
    mapping_df = pd.DataFrame({'ID': list(gene_symbols.keys()), 'Gene': list(gene_symbols.values())})
elif 'GENE_SYMBOL' in gene_annotation.columns:
    # If GENE_SYMBOL column exists, use it directly
    mapping_df = gene_annotation[['ID', 'GENE_SYMBOL']].rename(columns={'GENE_SYMBOL': 'Gene'})
else:
    # If we still don't have gene symbols, use Entrez ID to gene symbol mapping from external resources
    try:
        # Read a pre-existing mapping file if available.
        # `read_csv(..., squeeze=True)` no longer exists in pandas 2.x; the
        # supported spelling is to squeeze the resulting one-column frame.
        gene_id_to_symbol = (
            pd.read_csv("./metadata/entrez_to_symbol.csv", index_col=0)
            .squeeze("columns")
            .to_dict()
        )
        # Compare IDs as strings on both sides so an integer CSV index still
        # matches annotation IDs that were read as text.
        gene_id_to_symbol = {str(key): symbol for key, symbol in gene_id_to_symbol.items()}
        mapping_df = pd.DataFrame({
            'ID': gene_annotation['ID'],
            'Gene': gene_annotation['ID'].astype(str).map(gene_id_to_symbol)
        })
        mapping_df = mapping_df.dropna()  # Drop rows where we couldn't map the ID
        if mapping_df.empty:
            raise ValueError("no annotation IDs matched the Entrez-to-symbol table")
    except Exception as mapping_error:
        # Best-effort fallback is kept, but the reason is now reported instead
        # of being swallowed by a bare `except:`.
        print(f"Entrez-to-symbol mapping file could not be used: {mapping_error}")
        # As a last resort, use ID as both identifier and gene symbol
        print("WARNING: Could not find gene symbols. Using Entrez IDs as gene symbols.")
        mapping_df = gene_annotation[['ID']].copy()
        mapping_df['Gene'] = mapping_df['ID']

# Print a preview of the mapping dataframe
print("\nGene mapping preview:")
print(mapping_df.head())

# 2. Apply the gene mapping to convert probe-level measurements to gene expression data
# Use the apply_gene_mapping function from the library
# NOTE: this rebinds `gene_data`, so re-running this cell without re-running the
# gene data extraction cell would map already-mapped data.
gene_data = apply_gene_mapping(gene_data, mapping_df)

# 3. Print information about the resulting gene expression dataframe
print(f"\nGene expression data after mapping:")
print(f"Shape: {gene_data.shape}")
print(f"First few gene symbols: {list(gene_data.index[:10])}")

# 4. Save the gene data to the specified output file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")