### Step 1: Initial Data Loading

In [None]:
from utils.preprocess import *
# Locate the two input files of this cohort; the helper's result is unpacked
# as (soft_file, matrix_file).
cohort_dir = '/media/techt/DATA/GEO/Kidney_Chromophobe/GSE11024'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the loader: the first list selects series-level
# background lines, the second selects per-sample clinical lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table so the available fields can be inspected by eye.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Each heading is followed by its payload on the next line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
import numpy as np

# Defaults: nothing is assumed available until the analysis below says so.
is_gene_available = False
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Snapshot of the sample characteristics, keyed by characteristics row index.
# NOTE(review): this literal replaces any dictionary computed earlier in the
# notebook - confirm it still matches the data in the matrix file.
sample_characteristics_dict = {
    0: [
        'Age:46', 'Age:61', 'Age:67', 'Age:47', 'Age:38', 'Age:72',
        'Age:34', 'Age:63', 'Age:45', 'Age:53', 'Age:79', 'Age:52',
        'Age:73', 'Age:37', 'Age:44', 'Age:74', 'Age:2', 'Age:12',
        'Age:4', 'Age:58', 'Age:1', 'Age:36', 'Age:59', 'Age:68',
        'Age:77', 'Age:70', 'Age: Unknown', 'Age:69', 'Age:86', 'Age:55',
    ],
    1: ['Gender: F', 'Gender: M', 'Gender: Unknown'],
    2: [
        'Stage: 1', 'Stage: 2', 'Stage: 3', 'Renal Tumor', 'Normal kidney',
        'Stage: 4', 'Stage: Unknown',
    ],
    3: ['Renal Tumor', None, 'Normal kidney'],
}

# The dataset is treated as a microarray study, so gene expression data is
# expected to be present.
is_gene_available = True

# Rows 0 and 1 hold the 'Age:' and 'Gender:' fields respectively.
age_row, gender_row = 0, 1

# Prefer a row that mentions the trait name verbatim ...
trait_row = next(
    (
        row_index
        for row_index, row_values in sample_characteristics_dict.items()
        if 'Kidney_Chromophobe' in str(row_values)
    ),
    None,
)

# ... otherwise fall back to row 3, on the assumption that 'Renal Tumor'
# stands in for Kidney_Chromophobe.
# NOTE(review): row 2 also carries 'Renal Tumor' / 'Normal kidney' values for
# some samples - confirm that row 3 alone covers every sample of interest.
if trait_row is None:
    trait_row = 3
# Data type conversion functions
def convert_trait(value):
    """Convert a raw sample-characteristics cell to a binary trait label.

    Args:
        value: Raw cell content, e.g. ``'Renal Tumor'`` or ``'Normal kidney'``.
            Anything after the last ``':'`` is used, so ``'key: label'`` cells
            are handled too.

    Returns:
        ``1`` for ``'Renal Tumor'``, ``0`` for ``'Normal kidney'``, and
        ``None`` for every other input (missing, non-string or unrecognised).
    """
    # Missing cells may arrive as None or as float NaN. NaN is truthy, so a
    # plain truthiness test would let it through and crash on ``.split``.
    if not isinstance(value, str):
        return None

    label = value.split(':')[-1].strip()
    # NOTE(review): every 'Renal Tumor' sample is counted as a positive case -
    # confirm that this is the intended definition of the trait.
    if label == 'Renal Tumor':
        return 1
    if label == 'Normal kidney':
        return 0
    return None

def convert_age(value):
    """Convert a raw ``'Age:<n>'`` cell to an integer age.

    Args:
        value: Raw cell content, e.g. ``'Age:46'`` or ``'Age: Unknown'``.

    Returns:
        The age as an ``int`` when the text after the last ``':'`` consists
        only of digits; ``None`` otherwise (missing, non-string or
        non-numeric such as ``'Unknown'``).
    """
    # Missing cells may arrive as None or as float NaN. NaN is truthy, so a
    # plain truthiness test would let it through and crash on ``.split``.
    if not isinstance(value, str):
        return None

    age_text = value.split(':')[-1].strip()
    return int(age_text) if age_text.isdigit() else None

def convert_gender(value):
    """Convert a raw ``'Gender: <x>'`` cell to a binary code.

    Args:
        value: Raw cell content, e.g. ``'Gender: M'`` or ``'Gender: F'``.

    Returns:
        ``1`` for male (``M`` / ``male``), ``0`` for female (``F`` /
        ``female``), compared case-insensitively; ``None`` for every other
        input (missing, non-string or unrecognised such as ``'Unknown'``).
    """
    # Missing cells may arrive as None or as float NaN. NaN is truthy, so a
    # plain truthiness test would let it through and crash on ``.split``.
    if not isinstance(value, str):
        return None

    token = value.split(':')[-1].strip().lower()
    if token in ('m', 'male'):
        return 1
    if token in ('f', 'female'):
        return 0
    return None

import os

# Save cohort info
cohort_info_path = './preprocessed/Kidney_Chromophobe/cohort_info.json'
# Make sure the output folder exists before anything is written into it.
# Harmless if save_cohort_info already creates it (not visible from here).
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE11024', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical Feature Extraction: only possible when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Kidney_Chromophobe',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Kidney_Chromophobe/trait_data/GSE11024.csv'
    # DataFrame.to_csv does not create missing parent directories, so create
    # them first to avoid an OSError on a fresh checkout.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


### Step 3: Gene Data Extraction

In [None]:
# Load the expression table from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers so their type can be judged in the next step.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


### Step 4: Gene Identifier Review

In [None]:
# Manual verdict of the identifier review: the row identifiers need to be
# mapped to gene symbols before use.
# NOTE(review): presumably the IDs printed in the previous step are probe IDs
# rather than gene symbols - confirm against that output.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [None]:
# Load the gene annotation table from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Print a heading, then a preview of the annotation so that the identifier
# and gene-symbol columns can be chosen.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


### Step 6: Gene Identifier Mapping

In [None]:
# Annotation columns used for the mapping: the first holds the identifiers,
# the second holds the gene symbols.
# NOTE(review): confirm both column names against the annotation preview.
identifier_key, gene_symbol_key = 'ID', 'Gene'

# Build the identifier -> gene mapping table from the annotation.
gene_mapping = get_gene_mapping(
    gene_annotation,
    identifier_key,
    gene_symbol_key,
)

# Replace the raw expression table with its mapped counterpart.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
