### Step 1: Initial Data Loading

In [None]:
from utils.preprocess import *
# Resolve the SOFT file and the matrix file that belong to this cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Post-Traumatic_Stress_Disorder/GSE85399'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes passed to the reader: the first list selects the background
# information, the second selects the sample characteristics data.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both artefacts, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
is_gene_available = False
trait_row = age_row = gender_row = None  # each stays None unless the field is usable
convert_trait = convert_age = convert_gender = None  # replaced by real converters when applicable

# Excerpt of the dataset description used to decide whether gene data is available.
dataset_information = """
Emerging knowledge suggests that post-traumatic stress disorder (PTSD) is causally associated with epigenetic changes 
although its molecular underpinnings are still largely elusive...
Agilent whole genome array detected ~5,600 differentially methylated CpG islands (CpGI) annotated to ~2,800 differentially 
methylated genes (DMG)... In this study, further validation was conducted by an independent test set comprising of 31/29 PTSD+/- veteran.
"""
# NOTE(review): the excerpt above talks about differentially methylated CpG
# islands, i.e. it reads like a methylation array rather than an expression
# array. The flag is kept as the original set it; confirm this is intended.
if 'whole genome array' in dataset_information:
    is_gene_available = True

# Unique values observed for each sample-characteristics row of this cohort.
sample_characteristics = {
    0: ['subject status: post-traumatic stress disorder (PTSD) negative (-)', 'subject status: post-traumatic stress disorder (PTSD) positive (+)'],
    1: ['gender: Male'],
    2: ['age (yrs): 27', 'age (yrs): 40', 'age (yrs): 29', 'age (yrs): 32', 'age (yrs): 30', 'age (yrs): 24', 'age (yrs): 26', 'age (yrs): 31', 'age (yrs): 37', 'age (yrs): 34', 'age (yrs): 44', 'age (yrs): 28', 'age (yrs): 33', 'age (yrs): 23', 'age (yrs): 35', 'age (yrs): 45', 'age (yrs): 36', 'age (yrs): 48'],
    3: ['ethnicity: Hispanic', 'ethnicity: Non-Hispanic Black', 'ethnicity: Non-Hispanic Asian', 'ethnicity: Non-Hispanic white', 'ethnicity: Non-Hispanic Other'],
    4: ['caps: 3', 'caps: 0', 'caps: 18', 'caps: 6', 'caps: 10', 'caps: 8', 'caps: 21', 'caps: 4', 'caps: 2', 'caps: 19', 'caps: 7', 'caps: 9', 'caps: 70', 'caps: 82', 'caps: 68', 'caps: 58', 'caps: 71', 'caps: 46', 'caps: 95', 'caps: 90', 'caps: 54', 'caps: 50', 'caps: 92', 'caps: 81', 'caps: 44', 'caps: 87', 'caps: 64', 'caps: 77', 'caps: 51', 'caps: 75'],
    5: ['tissue: Whole blood']
}

# Candidate rows for trait, gender and age.
trait_values = sample_characteristics.get(0, [])
gender_values = sample_characteristics.get(1, [])
age_values = sample_characteristics.get(2, [])

# A row is only usable when it takes more than one distinct value; a constant
# field cannot distinguish samples, so its row index is left as None.
if len(trait_values) > 1:
    trait_row = 0

# Gender is 'Male' for every sample here, so gender_row stays None. The
# original assigned gender_row = 1 in both branches, which made the check
# meaningless and marked a constant field as available.
if len(gender_values) > 1:
    gender_row = 1

if len(age_values) > 1:
    age_row = 2

# Define conversion functions
def convert_trait(value):
    """Encode a trait entry as 1 (positive), 0 (negative) or None.

    Only the text after the last ':' is inspected, case-insensitively.
    'positive' is checked first, so it wins if both words are present.
    """
    label = value.rsplit(":", 1)[-1].strip().lower()
    if "positive" in label:
        return 1
    if "negative" in label:
        return 0
    return None

def convert_age(value):
    """Return the number after the last ':' as a float, or None if it is not numeric."""
    _, _, tail = value.rpartition(":")
    try:
        age = float(tail.strip())
    except ValueError:
        return None
    return age

def convert_gender(value):
    """Encode the text after the last ':' as 1 for male, 0 for female, else None.

    The comparison is case-insensitive and ignores surrounding whitespace.
    """
    gender_codes = {'male': 1, 'female': 0}
    label = value.split(":")[-1].strip().lower()
    return gender_codes.get(label)

# Record the cohort metadata; the last argument is True only when a trait row was found.
save_cohort_info(
    'GSE85399',
    './preprocessed/Post-Traumatic_Stress_Disorder/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Select the clinical features only when a trait row exists, then save and preview them.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Post-Traumatic_Stress_Disorder',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Post-Traumatic_Stress_Disorder/trait_data/GSE85399.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


### Step 3: Gene Data Extraction

In [None]:
# Obtain gene_data from the matrix file identified earlier.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row IDs; the next step reviews what kind of identifier they are.
row_id_preview = gene_data.index[:20]
print(row_id_preview)

# Observation: the indices are sequential numbers, so they are neither human
# gene symbols nor standard gene identifiers.


### Step 4: Gene Identifier Review

In [None]:
# The row IDs of the gene data were observed to be sequential numbers rather than
# gene symbols, so they have to be mapped to gene symbols in a later step.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [None]:
# Obtain the gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Print a heading, then the preview of the annotation data.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


### Step 6: Gene Identifier Mapping

In [None]:
# 1. Determine the keys for identifiers and gene symbols
# NOTE(review): the row IDs of gene_data were observed to be sequential numbers.
# Confirm against the annotation preview that the 'SPOT_ID' column really holds
# those same values (the numeric probe index may instead live in another column
# such as 'ID'); if the keys do not match, the mapping may match no rows.
identifier_key = 'SPOT_ID'
gene_symbol_key = 'GENE_SYMBOL'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to obtain the gene expression dataframe
# (gene_data is overwritten with the result of the mapping)
gene_data = apply_gene_mapping(gene_data, mapping_df)
