In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed.
trait = "Type_2_Diabetes"
cohort = "GSE180395"

# Input paths. Built from `trait` / `cohort` so that changing either name above
# updates every path consistently instead of leaving stale hand-copied literals.
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths: linked data, gene-level expression, clinical features, and the
# JSON file that collects per-cohort metadata for this trait.
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [None]:
from tools.preprocess import *
# 1. Locate the SOFT file and the series matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read background information and sample characteristics from the matrix file.
#    The two prefix lists name the matrix-file fields requested for each part.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical frame as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both pieces so the trait / age / gender rows can be picked by inspection
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
# 1. Gene expression availability.
# The series looks like a transcriptome dataset, so gene data is assumed to be present.
is_gene_available = True

# 2.1 Rows of the sample characteristics that carry each clinical variable.
# Row 0 holds the sample groups, which include "DN" (Diabetic Nephropathy), a condition
# related to Type 2 Diabetes. Age and gender do not appear in the sample characteristics,
# so their rows are left unset.
trait_row, age_row, gender_row = 0, None, None

# 2.2 Defining conversion functions
def convert_trait(value):
    """Turn a sample-group description into a binary Type 2 Diabetes label.

    Args:
        value: Raw sample-characteristics cell, e.g. ``"sample group: DN"``.

    Returns:
        ``None`` when ``value`` is not a string; otherwise ``1`` if the text after
        the first colon (or the whole text when there is no colon) contains ``"DN"``
        (Diabetic Nephropathy), else ``0``.
    """
    if not isinstance(value, str):
        return None

    # Keep only what follows the first colon; without a colon the text is used as-is.
    _, colon, remainder = value.partition(':')
    label = remainder.strip() if colon else value

    # DN (Diabetic Nephropathy) is treated as the Type 2 Diabetes positive group.
    return int('DN' in label)

def convert_age(value):
    """Placeholder age converter.

    Age is not available for this cohort, so every input maps to ``None``.
    """
    return None

def convert_gender(value):
    """Placeholder gender converter (intended coding: 0 for female, 1 for male).

    Gender is not available for this cohort, so every input maps to ``None``.
    """
    return None

# 3. Initial filtering: record whether this cohort has gene and trait data.
# The trait counts as available exactly when a trait row was identified above.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,  # initial filtering pass, not the final validation
    cohort=cohort,
    info_path=json_path,
    is_trait_available=is_trait_available,
    is_gene_available=is_gene_available,
)

# 4. Extract clinical features if trait data is available
if trait_row is not None:
    # Work from the sample-characteristics frame that Step 1 parsed out of the series
    # matrix: trait_row was chosen by inspecting exactly that frame, so its row
    # numbering is the one the row indices refer to. Looking for a separate
    # "clinical_data.csv" in the input directory would skip the extraction without
    # any message whenever that file is absent, and would replace `clinical_data`
    # with a differently laid-out frame whenever it is present.
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Preview the extracted clinical data
    preview = preview_df(selected_clinical_df)
    print(f"Clinical data preview: {preview}")

    # Save the selected clinical features to CSV, creating the output folder if needed
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    # NOTE(review): index=False drops the frame's index from the file. If
    # geo_select_clinical_features labels its rows with the feature names, those
    # labels are lost on disk - confirm whether the index should be written.
    selected_clinical_df.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


### Step 3: Gene Data Extraction

In [None]:
# 1. Load the expression matrix from the series matrix file located in Step 1
gene_data = get_genetic_data(matrix_file)

# 2. Show the first 20 row identifiers (gene or probe IDs) for later inspection
first_row_ids = gene_data.index[:20]
print(first_row_ids)


### Step 4: Gene Identifier Review

In [None]:
# Review of the row identifiers printed in the previous step:
# - they appear to follow the "number_at" pattern commonly used by Affymetrix
#   microarray platforms;
# - they are not standard human gene symbols (such as BRCA1 or TP53);
# - they are therefore treated as probe IDs that still have to be mapped to genes.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [None]:
# 1. Read the gene annotation table from the SOFT file located in Step 1
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a preview of the annotation so its columns can be inspected
annotation_preview = preview_df(gene_annotation)
print("Gene annotation preview:")
print(annotation_preview)


### Step 6: Gene Identifier Mapping

In [None]:
# 1. Inspect the annotation table: column names, size, and its first ten rows
print("Gene annotation columns:", gene_annotation.columns.tolist())
print("Gene annotation shape:", gene_annotation.shape)
print("More rows of gene annotation:")
print(gene_annotation.head(10).to_string())

# 2. ENTREZ_GENE_ID holds numeric gene IDs rather than gene symbols, so the mapping
# frame uses those IDs directly: it is the annotation table with that column
# exposed under the name 'Gene'.
mapping_df = gene_annotation.rename(columns={'ENTREZ_GENE_ID': 'Gene'})

# Show the mapping frame for verification
print("Gene mapping dataframe preview:")
print(preview_df(mapping_df))
print("Number of mappings:", len(mapping_df))

# Overlap between the probes measured in gene_data and the probes that are annotated
probe_ids_in_gene_data = set(gene_data.index)
probe_ids_in_mapping = set(mapping_df['ID'])
common_ids = probe_ids_in_gene_data & probe_ids_in_mapping
print(f"Number of probe IDs in gene_data: {len(probe_ids_in_gene_data)}")
print(f"Number of probe IDs in mapping_df: {len(probe_ids_in_mapping)}")
print(f"Number of common probe IDs: {len(common_ids)}")

# 3. Map probe-level expression onto Entrez gene IDs.
# The 'Gene' column holds Entrez IDs rather than symbols, so the mapping is done
# directly with pandas instead of going through extract_human_gene_symbols.

# Keep a reference to the probe-level matrix: `gene_data` is rebound below, and the
# fallback at the end must be able to restore the probe-level values.
probe_level_data = gene_data

# Annotation rows that actually carry a gene ID. The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice of mapping_df.
mapping_data = mapping_df.dropna(subset=['Gene']).copy()
# astype(str) renders a float-typed column as "1234.0"; Entrez IDs are integers, so a
# trailing ".0" is never part of a real ID and is stripped. String IDs are unaffected.
mapping_data['Gene'] = mapping_data['Gene'].astype(str).str.replace(r'\.0$', '', regex=True)

# Only (probe, gene) pairs whose probe is present in the expression matrix contribute.
measured = mapping_data['ID'].isin(probe_level_data.index)
probe_gene_pairs = mapping_data.loc[measured, ['ID', 'Gene']]

# A probe annotated with k rows contributes 1/k of its signal to each of them.
rows_per_probe = probe_gene_pairs.groupby('ID')['Gene'].transform('count')
probe_gene_pairs = probe_gene_pairs.assign(_share=1.0 / rows_per_probe)

# Attach every pair's probe expression row, scale it by the probe's share, and add the
# shares up per gene. Summing with groupby (rather than accumulating with `+=` into a
# NaN-initialised frame) matters: NaN + x is NaN, so such an accumulator never fills.
sample_columns = list(probe_level_data.columns)
paired_expression = probe_gene_pairs.join(probe_level_data, on='ID', how='inner')
weighted_expression = paired_expression[sample_columns].mul(paired_expression['_share'], axis=0)
# min_count=1 keeps a cell NaN when none of the gene's probes has a value for that sample.
gene_expression = weighted_expression.groupby(paired_expression['Gene']).sum(min_count=1)

# Remove rows with all NaN values
gene_data = gene_expression.dropna(how='all')

# Print the shape of the resulting gene expression dataframe
print(f"Gene expression data shape: {gene_data.shape}")
print("Gene expression data preview (first few genes):")
print(preview_df(gene_data.head()))

# If the mapping produced nothing, fall back to the probe-level matrix saved above
if gene_data.shape[0] == 0:
    print("Warning: Gene mapping produced empty results. Using probe IDs directly.")
    gene_data = probe_level_data.copy()
    # Print diagnostic information
    print(f"Original gene_data shape: {gene_data.shape}")