### Step 1: Initial Data Loading

In [None]:
from utils.preprocess import *
# 1. Locate the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Adrenocortical_Cancer/GSE33371'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to get the background information and the
#    sample characteristics data, selected by these line prefixes
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the characteristics dictionary,
#    each preceded by its heading on a separate line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check if the dataset contains gene expression data
series_overall_design = "Human samples of 33 adrenocortical carcinomas, 22 adrenocortical adenomas, and 10 normal adrenal cortex samples, each from a different patient, had mRNA assays performed using Affymetrix HG_U133_plus_2 arrays, with 54675 probe-sets."
if "mRNA" in series_overall_design:
    is_gene_available = True

# Variable Availability and Data Type Conversion

# Data availability
# Hand-copied snapshot of the sample characteristics. The bare `...` entries
# are Ellipsis objects standing in for values that were elided when the
# snapshot was written; they are NOT strings and must be skipped when scanning.
# NOTE(review): because values are elided, a trait string that only occurs in
# an elided position cannot be found here. Consider scanning the
# `sample_characteristics_dict` built in Step 1 instead -- confirm its layout
# matches this snapshot before switching.
sample_characteristics = {
    0: ['age: 71', 'age: 58', 'age: 44', 'age: 32', 'age: 28', ..., 'age: 31'],
    1: ['Sex: F', 'Sex: M'],
    2: ['side of body: Left', ...],
    3: ['clinical characteristics: Adrenalectomy for metastatic lung carcinoma', ...],
    4: ['tumor diameter (cm): not applicable', ...],
    5: ['tumor weight (gm): not applicable', ...],
    6: ['weiss score of tumor: not applicable', ...],
    7: ['mitotic rate of tumor: not applicable', ...],
    8: ['tumor stage: not applicable', ...],
    9: ['years to last followup: unknown', ...],
    10: ['dead or alive at last followup: unknown', ...],
    11: ['beta-catenin staining: Membrane', ...],
    12: ['beta-catenin sequence result: unknown', ...],
    13: ['hormone expression: unknown', ...],
}


def _first_row_matching(rows, matches):
    """Return the key of the first row with a string value accepted by *matches*.

    Non-string entries (such as the Ellipsis placeholders in the snapshot) are
    ignored rather than passed to *matches*. Returns None when no row matches.
    """
    for key, values in rows.items():
        if any(isinstance(v, str) and matches(v) for v in values):
            return key
    return None


# Check if age is available. Match on the field-name prefix so that fields that
# merely contain the letters "age" (e.g. "tumor stage") are not mistaken for age.
age_row = _first_row_matching(sample_characteristics, lambda v: v.startswith("age"))

# Check if gender is available
gender_row = _first_row_matching(sample_characteristics, lambda v: v.startswith("Sex"))

# Check if Adrenocortical_Cancer is available
trait_row = _first_row_matching(
    sample_characteristics, lambda v: "Adrenocortical carcinoma" in v
)

# Data Type Conversion Functions

def convert_trait(value):
    """Encode the trait as 1 when *value* mentions "Adrenocortical carcinoma", else 0."""
    if "Adrenocortical carcinoma" in value:
        return 1
    return 0

def convert_age(value):
    """Parse the age from a characteristic string such as ``'age: 71'``.

    Args:
        value: Raw characteristic cell, expected to look like ``'<field>: <number>'``.

    Returns:
        The age as a float, or None when *value* is not a string, has no
        ``': '`` separator, or the part after the separator is not numeric
        (e.g. ``'age: unknown'``).
    """
    # Missing cells may arrive as None/NaN rather than as text.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    # No separator means there is nothing after the field name to parse.
    if len(parts) < 2:
        return None
    try:
        return float(parts[1])
    except ValueError:
        return None

def convert_gender(value):
    """Encode the sex from a characteristic string such as ``'Sex: M'``.

    Args:
        value: Raw characteristic cell, expected to look like ``'<field>: <sex>'``.

    Returns:
        1 for male (``M``/``male``), 0 for female (``F``/``female``), matched
        case-insensitively; None when *value* is not a string, has no ``': '``
        separator, or holds any other label (e.g. ``'unknown'``), so that
        unrecognized values are not silently counted as female.
    """
    # Missing cells may arrive as None/NaN rather than as text.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    if len(parts) < 2:
        return None
    gender = parts[1].strip().upper()
    if gender in ('M', 'MALE'):
        return 1
    if gender in ('F', 'FEMALE'):
        return 0
    return None

# Record the cohort's status: whether gene data is available and whether a
# trait row was identified.
save_cohort_info(
    'GSE33371',
    './preprocessed/Adrenocortical_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are only selected and written out when a trait row exists.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Adrenocortical_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Adrenocortical_Cancer/trait_data/GSE33371.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
