### Step 1: Initial Data Loading

In [None]:
from utils.preprocess import *
# Load the raw inputs for cohort GSE11138. The helper functions called below are
# not defined in this file; presumably they come from the wildcard import of
# utils.preprocess — TODO confirm.

# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
cohort_dir = '/media/techt/DATA/GEO/Hypothyroidism/GSE11138'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# Line prefixes selecting the series-level description fields.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
# Line prefixes selecting the per-sample accession and characteristics fields.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
# The printed output is inspected by hand to choose the trait/age/gender rows later on.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [None]:
# Default values for the dataset analysis; the cells that follow overwrite them
# once the cohort has been inspected.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable


### Step 2.1: Determine if the dataset contains gene expression data

In [None]:
# Based on the description, this dataset does contain gene expression data because it involves "differentially expressed genes"
# This is a manual judgement; it replaces the False default assigned earlier.
is_gene_available = True


### Step 2.2: Determine the availability of the variables

In [None]:
# Variables in the sample characteristics dictionary
# NOTE(review): this hard-coded literal rebinds `sample_characteristics_dict`,
# replacing the value computed from `clinical_data` in the loading step.
# Keys are row indices; values are the distinct "<field>: <value>" strings of that row.
sample_characteristics_dict = {
    0: ['patient number: 1', 'lad coronary plaque segment derived from patient 1 and different from which used for experiments 1 and 2 (ex1 and ex2) was used in this experiment. patient number: 1', 
        'lad coronary plaque segment derived from patient 1 and different from which used for experiments 1 and 2 (ex1 and ex2) was used in this experiment. this is technical replicate for the ex3. patient number: 1', 
        'patient number: 4', 'patient number: 6', 'patient number: 2', 'lad coronary plaque segment derived from patient 2 and different from which used for experiments 9 and 10 (ex9 and ex10) was used in this experiment. patient number: 2', 
        'patient number: 8', 'patient number: 5', 'lad coronary plaque segment derived from patient 8 and different from which used for experiments 17 and 18 (ex17 and ex18) was used in this experiment. patient number: 8', 
        'patient number: 3', 'patient number: 7', 'lad coronary plaque segment derived from patient 8 and different from which used for experiments 17 and 18 (ex17 and ex18) was used in this experiment. patient number: 8'], 
    1: ['sex: M', 'sex: F'], 
    2: ['age (years): 50', 'age (years): 64', 'age (years): 42', 'age (years): 68', 'age (years): 66', 'age (years): 53'], 
    3: ['tissue: LAD'], 
    4: ['pathology: Ischemic'], 
    5: ['comorbidities: Hypothyroidism', 'comorbidities: -', 'comorbidities: Pulmonary arterial hypertension (PAH)', 'comorbidities: Cholecystectomy', 'comorbidities: Stone Kidney'], 
    6: ['pharm. therapies: Diuretics, Anti-arrhythmics, Anti-coagulants, Hypertension regulators', 'pharm. therapies: Diuretics, Hypertension regulators', 'pharm. therapies: Diuretics, Anti-coagulants', 
        'pharm. therapies: Diuretics, Anti-arrhythmics, Hypertension regulators', 'pharm. therapies: Diuretics, Anti-arrhythmics, Hypertension regulators, Statins', 'pharm. therapies: Anti-arrhythmics, Hypertension regulators'], 
    7: ['plasma lipid/stenosis: +/ >75%', 'plasma lipid/stenosis: =/ >75%']
}

# Determine the values for trait_row, age_row, and gender_row
# Each value is a key of the dictionary above, naming the row that carries the variable.
trait_row = 5  # 'comorbidities' feature contains 'Hypothyroidism'
age_row = 2  # 'age (years)'
gender_row = 1  # 'sex'


### Step 2.3: Data type conversion functions

In [None]:

def convert_trait(value):
    """Encode a 'comorbidities: <label>' string as a binary hypothyroidism flag.

    Args:
        value: Raw characteristic string such as 'comorbidities: Hypothyroidism'.

    Returns:
        1 when the label mentions hypothyroidism (case-insensitive), 0 when the
        label is exactly '-', and None for any other label or for input that
        cannot be parsed.
    """
    try:
        label = value.split(': ')[1]
        label = label.strip().lower()  # normalize case
    except Exception:
        # Non-string input or a string without the ': ' separator.
        return None

    if 'hypothyroidism' in label:
        return 1
    return 0 if label == '-' else None

def convert_age(value):
    """Parse the number out of an 'age (years): <number>' string.

    Args:
        value: Raw characteristic string such as 'age (years): 50'.

    Returns:
        The age as a float, or None when the input is not a string, lacks the
        ': ' separator, or the text after it is not a valid number.
    """
    try:
        pieces = value.split(': ')
        age_text = pieces[1].strip()
        age = float(age_text)
    except Exception:
        return None
    else:
        return age

def convert_gender(value):
    """Encode a 'sex: <label>' string as a binary gender code.

    The label is compared case-insensitively, in line with how convert_trait
    normalizes its input, and both the short ('M'/'F') and the spelled-out
    ('male'/'female') forms are accepted.

    Args:
        value: Raw characteristic string such as 'sex: M'.

    Returns:
        1 for male, 0 for female, and None when the input is not a string,
        lacks the ': ' separator, or carries an unrecognized label.
    """
    try:
        label = value.split(': ')[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Non-string input (e.g. None/NaN) or no ': ' separator present.
        return None

    if label in ('m', 'male'):
        return 1
    if label in ('f', 'female'):
        return 0
    return None


### Step 2.4: Save metadata

In [None]:
# Record the initial availability flags for cohort GSE11138: whether gene
# expression data is present, and whether a trait row was identified.
# Presumably the helper writes them into cohort_info.json — TODO confirm.
save_cohort_info('GSE11138', './preprocessed/Hypothyroidism/cohort_info.json', is_gene_available, trait_row is not None)


### Step 2.5: Clinical feature extraction

In [None]:
# Extract the clinical features only when a trait row was identified; without
# it `selected_clinical_data` is never created.
if trait_row is not None:
    # Pass each row index together with the converter that decodes that row.
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Hypothyroidism', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    # NOTE(review): the target directory is assumed to exist already — confirm.
    csv_path = './preprocessed/Hypothyroidism/trait_data/GSE11138.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


### Step 3: Gene Data Extraction

In [None]:

# Redefine the 'get_genetic_data' function
def get_genetic_data(file_path, marker="!series_matrix_table_begin"):
    """Load the expression table of a gzipped matrix file into a dataframe.

    The file is scanned for the first line containing `marker`; everything up
    to and including that line is skipped, and the remainder is parsed as a
    tab-separated table. Lines starting with '!' are treated as comments, rows
    containing any missing value are dropped, and the identifier column is
    renamed to 'ID', cast to string and used as the index.

    Args:
        file_path: Path to the gzip-compressed matrix file.
        marker: Text identifying the line that precedes the table header.

    Returns:
        A dataframe indexed by 'ID' holding the remaining table columns.

    Raises:
        ValueError: If `marker` never occurs in the file, or if the table has
            none of the columns 'ID_REF', 'ID' or 'gene_id'.
    """
    # Locate the marker; counting from 1 makes the line number equal to the
    # number of rows to skip (the marker row itself included).
    skip_rows = None
    with gzip.open(file_path, 'rt') as handle:
        for line_number, text in enumerate(handle, start=1):
            if marker in text:
                skip_rows = line_number
                break
    if skip_rows is None:
        raise ValueError(f"Marker '{marker}' not found in the file.")

    # Parse the table that follows the marker and discard incomplete rows.
    table = pd.read_csv(
        file_path,
        compression='gzip',
        skiprows=skip_rows,
        comment='!',
        delimiter='\t',
        on_bad_lines='skip',
    )
    table = table.dropna()

    # Use the first identifier column that is present, in priority order.
    for candidate in ('ID_REF', 'ID', 'gene_id'):
        if candidate in table.columns:
            break
    else:
        raise ValueError("None of the expected identifier columns ('ID_REF', 'ID', 'gene_id') were found in the genetic data file.")

    table = table.rename(columns={candidate: 'ID'}).astype({'ID': 'str'})
    table.set_index('ID', inplace=True)
    return table

# 1. Use the get_genetic_data function redefined above (not the library version) to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
# These ids are reviewed to decide whether identifier mapping is needed.
print(gene_data.index[:20])


# Stand-in for the row ids printed by the gene data extraction step
row_headers = ["ENSG00000141510", "ENSG00000171298", "ENSG00000165029"]  # Example from the gene expression dataset

# Decide from the row headers whether identifier mapping is needed
def requires_gene_mapping(row_headers):
    """Tell whether the row headers need mapping to human gene symbols.

    Args:
        row_headers: Iterable of identifier strings.

    Returns:
        True when every header starts with "ENSG" (Ensembl gene IDs, which
        need mapping); False as soon as one header does not. An empty
        iterable yields True.
    """
    for header in row_headers:
        if not header.startswith("ENSG"):
            # At least one header is not an Ensembl gene ID: treated as already
            # being in gene symbol format, so no mapping is needed.
            return False
    return True

# Execute function to determine if mapping is required


### Step 4: Gene Identifier Review

In [None]:
# Evaluate the check and keep the result under the name the later cells read.
# The result shares its name with the checker function, so after the first run
# the name refers to a bool. Guard the call so that re-running this cell does
# not fail with "TypeError: 'bool' object is not callable".
if callable(requires_gene_mapping):
    requires_gene_mapping = requires_gene_mapping(row_headers)


### Step 5: Gene Annotation (Conditional)

In [None]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
# The preview is used to pick the identifier and gene symbol columns for the mapping step.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


### Step 6: Gene Identifier Mapping

In [None]:

# This placeholder value should be set based on the actual determination


### Step 6 (continued): Gene Identifier Mapping, Normalization and Merging

In [None]:
# NOTE(review): this hard-coded flag replaces whatever value the identifier
# review assigned to `requires_gene_mapping` earlier.
requires_gene_mapping = True  # Example value, set this according to previous determination

if requires_gene_mapping:
    # Make sure gene_data is defined
    # Reloading the probe-level matrix also keeps this cell safe to re-run,
    # because `gene_data` is overwritten with the mapped result below.
    gene_data = get_genetic_data(matrix_file)
    
    # 1. Determine key names for identifiers and gene symbols
    # NOTE(review): both names must be columns of `gene_annotation`; check them
    # against the annotation preview — TODO confirm 'Gene_Symbol'.
    identifier_key = 'ID'
    gene_symbol_key = 'Gene_Symbol'

    # 2. Get the dataframe storing the mapping between probe IDs and genes
    probe_gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

    # 3. Apply the mapping to get the gene expression dataframe
    gene_data = apply_gene_mapping(gene_data, probe_gene_mapping)

    # Print the first 20 row ids to verify the mapping
    print(gene_data.index[:20])


# Normalize the gene data, merge it with the clinical data and save the results.
#
# `gene_data` must be the dataframe left by the identifier mapping step. Do not
# reload the matrix with get_genetic_data() here: that would bring back the raw
# probe-level table and throw away the probe-to-gene mapping, so the symbol
# normalization below would run on probe IDs instead of gene symbols.
#
# This step is not conditional on `requires_gene_mapping`: a dataset whose rows
# are already gene symbols still has to be normalized, merged and saved.

# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Hypothyroidism/gene_data/GSE11138.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
# `selected_clinical_data` exists only when a trait row was identified earlier.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Hypothyroidism')

# 4. Save the cohort information (recorded whether or not the trait is biased).
save_cohort_info('GSE11138', './preprocessed/Hypothyroidism/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Hypothyroidism/GSE11138.csv'
    unbiased_merged_data.to_csv(csv_path)
