In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed.
trait = "Type_2_Diabetes"
cohort = "GSE182120"

# Input paths. These are derived from `trait` / `cohort` so that the two
# identifiers are spelled exactly once and the paths cannot drift from them.
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths (linked dataset, gene-level matrix, clinical features) and the
# JSON file that stores cohort usability metadata.
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [None]:
from tools.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory.
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Pull the series-level background lines and the per-sample characteristic
#    lines out of the matrix file; the prefixes select which lines are kept.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print both summaries so the following cells can be written against them.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
# 1. Gene expression data availability
# The background information / series title were judged to describe an
# Affymetrix array dataset, so gene expression data is treated as present.
is_gene_available = True

# 2. Variable availability
# 2.1 Row keys of the sample characteristics dictionary that hold each variable.

# Trait (Type 2 Diabetes): key 0 was observed to hold 'disease: NGT'
# (Normal Glucose Tolerance) and 'disease: T2D' (Type 2 Diabetes).
trait_row = 0

# Age: key 1 was observed to hold the subjects' ages.
age_row = 1

# Gender: no gender field was found in the sample characteristics.
gender_row = None

# 2.2 Data Type Conversion

# Trait converter (binary outcome)
def convert_trait(value):
    """Map a 'disease: ...' sample characteristic to a binary trait code.

    Args:
        value: Raw characteristic cell, e.g. ``'disease: T2D'``.

    Returns:
        1 if the text mentions 't2d' (Type 2 Diabetes), 0 if it mentions
        'ngt' (Normal Glucose Tolerance), otherwise None. Non-string input
        also yields None. Matching is case-insensitive and 't2d' takes
        precedence when both keywords appear.
    """
    if not isinstance(value, str):
        return None

    label = value.lower().strip()
    if 'disease:' in label:
        # Keep only the text that follows the first 'disease:' marker.
        label = label.split('disease:')[1].strip()

    # Order matters: 't2d' is tested before 'ngt'.
    for keyword, code in (('t2d', 1), ('ngt', 0)):
        if keyword in label:
            return code
    return None

# Age conversion function (continuous)
def convert_age(value):
    """Parse an 'age: <number>' sample characteristic into a numeric age.

    Args:
        value: Raw characteristic cell, e.g. ``'age: 45'``.

    Returns:
        An ``int`` when the text after 'age:' is a whole number, a ``float``
        when it is a finite decimal (e.g. ``'age: 45.5'``), and None when the
        input is not a string, lacks the 'age:' marker, or does not hold a
        finite number.
    """
    import math

    if not isinstance(value, str):
        return None

    value = value.lower().strip()
    if 'age:' not in value:
        return None

    raw = value.split('age:')[1].strip()

    # Whole-number ages keep their integer type, as before.
    try:
        return int(raw)
    except ValueError:
        pass

    # Decimal ages used to be discarded as None; accept them as floats.
    try:
        age = float(raw)
    except ValueError:
        return None

    # float() also accepts 'nan' / 'inf', which are not usable ages.
    return age if math.isfinite(age) else None

# Gender converter: only a stand-in, because `gender_row` is None for this cohort.
def convert_gender(value):
    """Placeholder gender converter; ignores ``value`` and always returns None."""
    return None

# 3. Save Metadata
# Record the initial (non-final) usability flags for this cohort.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction
if trait_row is not None:
    # Convert the selected sample-characteristic rows with the converters above.
    clinical_features = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender,
    )

    # Preview the data
    preview = preview_df(clinical_features)
    print("Clinical Features Preview:")
    print(preview)

    # Save the clinical data to CSV.
    # The index is written out (pandas default) instead of being dropped with
    # index=False: without it the row labels are lost and the file cannot be
    # matched back to the trait by name, and it keeps this file consistent
    # with the gene and linked CSVs, which are also saved with their index.
    # NOTE(review): presumably the index holds the feature names (trait, age)
    # — confirm against geo_select_clinical_features.
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_features.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to: {out_clinical_data_file}")


### Step 3: Gene Data Extraction

In [None]:
# 1. Load the expression data from the matrix file located in Step 1, using
#    the project helper `get_genetic_data`.
#    NOTE(review): presumably one row per probe/gene and one column per
#    sample — confirm against tools.preprocess.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row identifiers so the next step can judge whether
#    they are gene symbols or platform probe IDs.
print(gene_data.index[:20])


### Step 4: Gene Identifier Review

In [None]:
# The row identifiers printed in the previous step (e.g. '2824546_st',
# '2824549_st') were judged to be microarray probe identifiers in an
# Affymetrix-style format, not standard human gene symbols. They therefore
# have to be mapped to gene symbols before any gene-level analysis.
requires_gene_mapping = True


### Step 5: Gene Annotation

In [None]:
# 1. Extract the probe annotation from the SOFT file located in Step 1, using
#    the project helper `get_gene_annotation`.
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a preview of the annotation so the mapping step can pick the
#    probe-ID column and the gene-symbol column.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


### Step 6: Gene Identifier Mapping

In [None]:
# 1. Column choice, based on the two previews printed earlier:
#    - gene_data.index holds identifiers such as '2824546_st';
#    - the annotation's 'ID' column is expected to hold those same probe IDs;
#    - the annotation's 'gene_assignment' column appears to carry the gene
#      symbol information.

# 2. Build the probe -> gene mapping table from the chosen columns.
mapping_data = get_gene_mapping(
    gene_annotation,
    prob_col='ID',
    gene_col='gene_assignment',
)

# 3. Convert the probe-level measurements into gene-level expression data.
gene_data = apply_gene_mapping(gene_data, mapping_data)

# Report the size of the mapped matrix and its first few row labels.
print(f"Number of genes after mapping: {len(gene_data)}")
print("First 10 gene symbols:", gene_data.index[:10], sep="\n")


### Step 7: Data Normalization and Linking

In [None]:
# 1. Normalize the gene symbols in the index and save the gene-level matrix.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)

# 2. Link the clinical and genetic data.
# The `clinical_features` frame built during clinical feature extraction is
# used directly. Re-reading it from `out_clinical_data_file` is unreliable:
# that file may have been written without its index, and the previous
# workaround (renaming whatever the first column was to the trait, then calling
# set_index with the frame's own index, which changes nothing) never restored a
# `trait` column, which the missing-value and bias steps below look up by name.
# NOTE(review): presumably geo_link_clinical_genetic_data expects clinical
# features as labelled rows and samples as columns — confirm against
# tools.preprocess.
print("Clinical data columns:", clinical_features.columns.tolist())
linked_data = geo_link_clinical_genetic_data(clinical_features, normalized_gene_data)

# Check the linked data structure
print("Linked data shape:", linked_data.shape)
print("Linked data columns sample:", list(linked_data.columns[:5]))

# 3. Handle missing values in the linked data
linked_data = handle_missing_values(linked_data, trait)

# 4. Determine whether the trait and some demographic features are severely
#    biased, and remove biased features.
is_trait_biased, unbiased_linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Conduct the final quality check and save the cohort information.
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True,
    is_biased=is_trait_biased,
    df=linked_data,
)

# 6. If the linked data is usable, save it as a CSV file to 'out_data_file'.
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    unbiased_linked_data.to_csv(out_data_file)


### Step 8: Data Normalization and Linking

In [None]:
# 1. Normalize the gene symbols in the index and save the gene-level matrix.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)

# 2. Link the clinical and genetic data with 'geo_link_clinical_genetic_data'.
# `selected_clinical_df` is not assigned anywhere earlier in the visible
# notebook, so it is rebuilt here from the raw `clinical_data` with the same
# rows and converters used for clinical feature extraction. Rebuilding, rather
# than reusing `clinical_features`, keeps this cell independent of any later
# reassignment of that name.
selected_clinical_df = geo_select_clinical_features(
    clinical_df=clinical_data,
    trait=trait,
    trait_row=trait_row,
    convert_trait=convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender,
)
linked_data = geo_link_clinical_genetic_data(selected_clinical_df, normalized_gene_data)

# 3. Handle missing values in the linked data
linked_data = handle_missing_values(linked_data, trait)

# 4. Determine whether the trait and some demographic features are severely
#    biased, and remove biased features.
is_trait_biased, unbiased_linked_data = judge_and_remove_biased_features(linked_data, trait)

# 5. Conduct the final quality check and save the cohort information.
#    Keyword arguments mirror the other validate_and_save_cohort_info calls in
#    this notebook and make the meaning of each flag explicit.
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True,
    is_biased=is_trait_biased,
    df=linked_data,
)

# 6. If the linked data is usable, save it as a CSV file to 'out_data_file'.
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    unbiased_linked_data.to_csv(out_data_file)