In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file for this cohort directory
cohort_dir = '/media/techt/DATA/GEO/Age-Related_Macular_Degeneration/GSE43176'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background information and the sample characteristics data
#    out of the matrix file, selected by the line prefixes listed here
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each result under its own heading line
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Wild-Type Nras Lacks Tumor Suppressor Activity and Nras Oncogene Dosage Strongly Modulates Hematopoietic Transformation"
!Series_summary	"Contemporary treatment of pediatric acute myeloid leukemia (AML) requires the assignment of patients to specific risk groups. To explore whether expression profiling of leukemic blasts could accurately distinguish between the known risk groups of AML, we analyzed 130 pediatric and 20 adult AML diagnostic bone marrow or peripheral blood samples using the Affymetrix U133A microarray.  Class discriminating genes were identified for each of the major prognostic subtypes of pediatric AML, including t(15;17)[PML-RARalpha], t(8;21)[AML1-ETO], inv(16) [CBFbeta-MYH11], MLL chimeric fusion genes, and cases classified as FAB-M7. When subsets of these genes were used in supervised learning algorithms, an overall classification accuracy of more than 93% was achieved.  Moreover, we were able to use the expression signatures g

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until the checks below say so.
is_gene_available = False
trait_row, age_row, gender_row = None, None, None  # set to different values when applicable
convert_trait, convert_age, convert_gender = None, None, None  # define the functions when applicable

# Gene expression availability: decided by whether the (hand-copied) series
# summary mentions the microarray platform by name.
series_summary = """Contemporary treatment of pediatric acute myeloid leukemia (AML) requires the assignment of patients to specific risk groups. To explore whether expression profiling of leukemic blasts could accurately distinguish between the known risk groups of AML, we analyzed 130 pediatric and 20 adult AML diagnostic bone marrow or peripheral blood samples using the Affymetrix U133A microarray..."""
is_gene_available = series_summary.find("Affymetrix U133A microarray") != -1

# Sample characteristics used for the variable lookup below.
# NOTE(review): this is a hand-written excerpt, not the dictionary computed in
# the previous cell; only row 0 is filled in.
sample_characteristics = {}
sample_characteristics[0] = [
    'disease state: AML',
    'disease state: normal bone marrow sample',
]
# rest of the dictionary...

def find_variable_key(characteristics_dict, search_terms):
    """Return the first key whose values mention any of the search terms.

    Keys are visited in the dictionary's iteration order and the first key
    with a matching value wins.

    Matching is a case-sensitive substring test against the whole value
    string, so a short term such as "age" also matches a value like
    "stage: 3". Callers should choose terms with that in mind.

    Args:
        characteristics_dict: Mapping from a key to an iterable of values.
            Values that are not strings (for example NaN or None) are
            skipped instead of raising ``TypeError``.
        search_terms: Iterable of strings to look for. It is materialised
            once, so a one-shot iterator works as well as a list.

    Returns:
        The first matching key, or None when no value contains any term.
    """
    # Materialise once: a generator would otherwise be exhausted after the
    # first value has been checked.
    terms = tuple(search_terms)
    for key, values in characteristics_dict.items():
        for value in values:
            # `term in value` is only meaningful for strings; a float NaN or
            # None here would raise TypeError.
            if not isinstance(value, str):
                continue
            if any(term in value for term in terms):
                return key
    return None

# For trait 'Age-Related_Macular_Degeneration'
trait_terms = ["Age-Related Macular Degeneration", "AMD"]
trait_row = find_variable_key(sample_characteristics, trait_terms)


def convert_trait(val):
    """Placeholder trait converter: always returns None (assuming no valid data found)."""
    return None


# For 'age'
age_terms = ["age"]
age_row = find_variable_key(sample_characteristics, age_terms)


def convert_age(val):
    """Convert a 'field: value' string to a float age.

    Returns None when no age row was found (``age_row`` is None), or when
    ``val`` is not a string, has no ':' separator, or the text after the
    first ':' is not a number.
    """
    if age_row is None:
        return None
    try:
        return float(val.split(":")[1])
    except (AttributeError, IndexError, ValueError):
        # Non-string input, missing separator, or non-numeric text such as "n/a".
        return None


# For 'gender'
gender_terms = ["gender", "sex"]
gender_row = find_variable_key(sample_characteristics, gender_terms)


def convert_gender(val):
    """Map a gender string to 0 (female) or 1 (male); None when neither appears.

    'female' is tested first because 'male' is a substring of it.
    Non-string input yields None.
    """
    if not isinstance(val, str):
        return None
    lowered = val.lower()
    if 'female' in lowered:
        return 0
    if 'male' in lowered:
        return 1
    return None


# Rows that were not found are already None at this point; no further reset is needed.

# Save Metadata
save_cohort_info('GSE43176', './preprocessed/Age-Related_Macular_Degeneration/cohort_info.json', is_gene_available, trait_row is not None)

# Since trait_row is None, we do not have clinical data processing
