In [1]:

import sys
# Extend the module search path with the project checkout; presumably this is what lets the
# later `from utils.preprocess import *` resolve -- verify against the repository layout.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will not import
# on another machine unless the same directory exists there.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1a: locate the soft file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Hypertrophic_Cardiomyopathy/GSE9800'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 1b: read the matrix file, keeping the series-level background lines and the
# per-sample accession / characteristics lines selected by these prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 1c: build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 1d: print a heading followed by its payload, background first, for manual inspection.
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Expression signature of cardiac muscle as a potential diagnostic or prognostic tool for dilated cardiomyopathy"
!Series_summary	"There is an emerging hypothesis that dilated cardiomyopathy (DCM) is a manifestation of end-stage heart failure (ESHF) resulting from “final common pathway” despite heterogeneous primary etiologies. We performed genome-wide expression profiling by means of high-density oligonucleotide microarrays using cardiac muscles from patients with DCM or specific cardiomyopathy as well as non-disease control hearts. Differentially expressed genes between ESHF and non-disease samples should include both genes reactive to heart failure (HF) and those responsible for ESHF. With the aid of samples with acute HF without DCM and those with DCM without HF (corrected with left ventricular assist device), we successfully distinguished ESHF genes from HF genes. Our findings implicate that transcriptional signature of cardiac muscle can be po

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Template defaults; each is overwritten below once the cohort has been inspected.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Hard-coded copy of the sample characteristics dictionary, keyed by characteristics row.
sample_characteristics = {
    0: ['Normal myocardium sold commercially', 'myocardium sold commercially', 'Left ventricular myocardium in patients with Dilated cardiomyopathy', 'Left ventricular myocardium in patients with Eosinophilic myocarditis', 'Left ventricular myocardium in patients with Cardiac sarcoidosis', 'Left ventricular myocardium in patients with Ischemic cardiomyopathy', 'Left ventricular myocardium in patients with Peripartal cardiomyopathy', 'Left ventricular myocardium in patients with Alcoholic myopathy', 'Pooled normal myocardium sold commercially', 'Left ventricular myocardium in patients with Hypertrophic cardiomyopathy', 'Non-disease heart']
}

# Gene Expression Data Availability
is_gene_available = True  # Based on the description, gene expression data is available

# Variable Availability and Data Type Conversion
# Row 0 carries the trait if any of its values mentions hypertrophic cardiomyopathy.
# The needle must be lower-case because each candidate is lower-cased before the
# substring test; a capitalised needle can never match lower-cased text, which would
# leave trait_row as None and skip the clinical feature extraction further down.
trait_row = 0 if any('hypertrophic cardiomyopathy' in s.lower() for s in sample_characteristics[0]) else None
age_row = None  # Assuming no age information is found
gender_row = None  # Assuming no gender information is found

# Converting trait data
def convert_trait(value):
    """Map a sample-characteristics string to a binary trait label.

    Args:
        value: The characteristics text for one sample. Matching is
            case-insensitive and substring-based.

    Returns:
        1 if the text mentions hypertrophic cardiomyopathy, 0 if it mentions
        'non-disease heart' or 'normal myocardium', and None for anything else
        (including non-string input such as None or NaN).
    """
    # Missing cells are not strings; treat them as unlabelled instead of
    # raising AttributeError on .lower().
    if not isinstance(value, str):
        return None
    value = value.lower()
    if 'hypertrophic cardiomyopathy' in value:
        return 1
    elif 'non-disease heart' in value or 'normal myocardium' in value:
        return 0
    # Other diagnoses and unlabelled commercial tissue are neither cases nor controls.
    return None

# No age or gender rows were identified above, so neither converter is defined.
convert_age = convert_gender = None

# Save Metadata: the last argument records whether a trait row was identified.
trait_available = trait_row is not None
save_cohort_info(
    'GSE9800',
    './preprocessed/Hypertrophic_Cardiomyopathy/cohort_info.json',
    is_gene_available,
    trait_available,
)

# Clinical Feature Extraction: only attempted when a trait row was identified.
if trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Hypertrophic_Cardiomyopathy',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Hypertrophic_Cardiomyopathy/trait_data/GSE9800.csv'
    # Write and preview the table only when it actually contains data.
    if not selected_clinical_data.empty:
        selected_clinical_data.to_csv(csv_path)
        print(preview_df(selected_clinical_data))
