In [1]:

# Extend the module search path with a local project directory.
# NOTE(review): presumably this is what makes the `utils` package used by
# later cells importable — confirm, and consider a configurable path instead
# of a machine-specific absolute one.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the soft file and the matrix file for this cohort ---
cohort_dir = '/media/techt/DATA/GEO/Longevity_and_Aging/GSE44147'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Pull background lines and sample characteristics out of the matrix file ---
# The prefixes select which matrix-file lines feed each of the two results.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the clinical dataframe as a dictionary of unique values per row ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show both results, each preceded by its heading on a separate line ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Transcriptomic effects of caloric restriction reflected in primate difference"
!Series_summary	"Caloric restriction (CR) can delay morbidity and mortality in a broad range of species, including mice and macaques. Mutations and chemical agents, such as resveratrol or rapamycin that partly mimic the CR effect, can similarly increase survival or extend lifespan. In humans, however, the effects of CR or other life-extending agents have not been investigated systematically. Humans already display lower mortality and greater maximal lifespan compared to closely related species, including chimpanzees and macaques. It is thus possible that humans, during their evolution, have acquired genetic mutations mimicking the CR effect. To address this question, we compared transcriptome differences between humans and other primates, with transcriptome changes observed in mice subjected to CR [see references below]. We show that the human transcriptome state examin

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import re  # imported here so this cell stays runnable on its own

# Defaults; overwritten below (rows) or by the definitions that follow (converters).
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Determine if the dataset contains gene expression data
# NOTE(review): the justification below cites *Human* Gene 1.0 ST arrays, while
# the samples listed in the dictionary are strain C57BL/6 — confirm the
# platform/species before relying on this flag.
is_gene_available = True  # Based on the RNA hybridization to Affymetrix® Human Gene 1.0 ST arrays

# Sample Characteristics Dictionary from Output of Step 1
# (hard-coded copy; it rebinds the variable of the same name computed in Step 1)
sample_characteristics_dict = {
    0: ['strain: C57BL/6'],
    1: ['tissue: prefrontal cortex of the brain'],
    2: ['age: 2 days', 'age: 5 days', 'age: 11 days', 'age: 20 days', 'age: 32 days', 'age: 61 days', 'age: 122 days', 'age: 184 days', 'age: 365 days', 'age: 649 days', 'age: 904 days']
}

# Matches entries whose field name starts with the word "age" (e.g. "age:",
# "Age (years):") but not words that merely contain it ("stage:", "agent:").
_AGE_FIELD = re.compile(r'\s*age(?![a-z])', re.IGNORECASE)


def find_age_row(characteristics):
    """Return the key of the first row whose entries are all age fields.

    Args:
        characteristics: mapping of row key -> list of 'field: value' strings.

    Returns:
        The first key whose value list is non-empty and whose every entry is a
        string starting with the field name "age"; None if no row qualifies.
    """
    for row_key, row_values in characteristics.items():
        # An empty row must not qualify: all() over nothing is vacuously True.
        if row_values and all(
            isinstance(entry, str) and _AGE_FIELD.match(entry)
            for entry in row_values
        ):
            return row_key
    return None


# Identify the rows where the variables are recorded
age_row = find_age_row(sample_characteristics_dict)

# No data related to 'Longevity_and_Aging' or 'gender' available in the dictionary,
# so trait_row and gender_row keep their None defaults.

# Define conversion functions
def convert_age(value):
    """Parse the leading number from an ``'age: <number> <unit>'`` entry.

    The text after the last ':' is split on whitespace and its first token is
    converted to float. Any unit token that follows is ignored, so the result
    is expressed in whatever unit the entry itself used.

    Args:
        value: a sample-characteristics string such as ``'age: 32 days'``.

    Returns:
        The age as a float, or None when the input is not a string (e.g. a
        missing cell), has no text after the colon, or does not start with a
        parseable number.
    """
    # Non-string cells (None, NaN) have no .split(); treat them as missing
    # instead of letting an AttributeError escape.
    if not isinstance(value, str):
        return None
    tokens = value.rsplit(':', 1)[-1].split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None

def convert_trait(value):
    """Placeholder trait converter.

    No trait data is found in the current sample_characteristics_dict, so
    every input maps to None.
    """
    return None

def convert_gender(value):
    """Encode a ``'<field>: female|male'`` entry as 0 or 1.

    The text after the last ':' is stripped and lower-cased before lookup.

    Args:
        value: a sample-characteristics string such as ``'gender: Female'``.

    Returns:
        0 for 'female', 1 for 'male', and None for any other label or for
        non-string input (e.g. a missing cell).
    """
    # Non-string cells (None, NaN) have no .split(); treat them as missing
    # instead of raising AttributeError.
    if not isinstance(value, str):
        return None
    label = value.rsplit(':', 1)[-1].strip().lower()
    return {'female': 0, 'male': 1}.get(label)

# Save cohort information: the gene-data flag and whether a trait row was found
save_cohort_info(
    'GSE44147',
    './preprocessed/Longevity_and_Aging/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical Feature Extraction (to be done only if we could find trait_row, which is None here)
