In [1]:

# Extend the module search path with a local checkout so project code can be imported.
# NOTE(review): presumably this directory holds the `utils` package imported in the next
# cell — confirm; the absolute path is machine-specific.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's input files -------------------------------------------
# The helper is given the cohort directory and hands back two paths
# (SOFT file first, series-matrix file second).
cohort_dir = '/media/techt/DATA/GEO/Cognitive_Processing/GSE201248'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Pull background text and per-sample clinical rows out of the matrix file --
# Lines are selected by prefix: series-level fields for the background text,
# sample accession + characteristics fields for the clinical data.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the clinical data ------------------------------------------------
# NOTE(review): judging by its name, the helper collects the unique values found
# in each row of the clinical data — confirm against utils.preprocess.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show both pieces so the next step can be written against real values ------
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Individual Variability in Human Cell Type Transcriptomes and Epigenomes [YF1A]"
!Series_summary	"Diversity and individual variability are essential to human cognitive function. Identifying the conserved and variable (epi)genomic signatures of the brain’s cellular components is critical for understanding the neurobiological basis of individual variation in brain function.  We applied single nucleus methylome and transcriptome sequence (snmCT-seq) to neurons from the frontal cortex of 11 adult human donors spanning a range of ages from 23 to 74, including males and females (Broadmann Area BA46). We clustered cells into brain cell types based on methylation features. We then examined the transcriptome and epigenome features in each cell type between and within individual donors. Taking advantage of the multimodal measurements in single cells, we also identified the relation between RNA expression and methylation level.These data with multiomics measu

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: everything is "not available" until one of the checks below says otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Inspect the dictionary produced in Step 1 instead of a hand-copied excerpt of it.
# The excerpt previously used here listed a single value per row, which made the
# "more than one distinct value" tests below fail unconditionally and turned the
# gene-availability test into a comparison of a literal with itself.
# NOTE(review): assumes sample_characteristics_dict maps integer row index -> list of
# '<label>: <value>' strings, as the excerpt suggested — confirm against Step 1 output.
sample_characteristics = sample_characteristics_dict


def _distinct_field_values(row_key):
    """Return the distinct values recorded in one sample-characteristics row.

    Each entry is expected to look like ``'<label>: <value>'``; the part after the
    first ``': '`` is stripped and lower-cased before being collected.  Entries that
    are not strings or that lack the separator are skipped instead of raising.

    Args:
        row_key: Key of the row to inspect in ``sample_characteristics``.

    Returns:
        set[str]: Normalised values found in that row (empty if the row is absent).
    """
    distinct = set()
    for entry in sample_characteristics.get(row_key, []):
        if not isinstance(entry, str):
            continue  # non-string placeholders carry no usable value
        pieces = entry.split(': ')
        if len(pieces) > 1:
            distinct.add(pieces[1].strip().lower())
    return distinct


# Check if the dataset contains gene expression data: row 3 must actually report
# the RNA-containing molecule subtype.
if 'molecule subtype: nuclear RNA + genomic DNA' in sample_characteristics.get(3, []):
    is_gene_available = True

# A row is only useful as a covariate when it varies across samples.
age_row = 1 if len(_distinct_field_values(1)) > 1 else None
gender_row = 2 if len(_distinct_field_values(2)) > 1 else None

# 'Cognitive_Processing' data is not provided in sample characteristics,
# so trait_row keeps its default of None.

# Define functions for data conversion
def convert_age(value):
    """Convert a characteristics entry such as ``'age: 23'`` to an integer age.

    Args:
        value: Raw entry from the age row, expected in the form
            ``'<label>: <integer>'``.  Anything else is tolerated.

    Returns:
        int | None: The parsed age, or ``None`` when ``value`` is not a string,
        has no ``': '`` separator, or the text after it is not an integer.
    """
    # Missing entries (None, float NaN, ...) have no .split(); report them as unknown
    # rather than letting an AttributeError escape.
    if not isinstance(value, str):
        return None
    try:
        # int() tolerates surrounding whitespace, so no explicit strip is needed.
        return int(value.split(': ')[1])
    except (ValueError, IndexError):
        return None

def convert_gender(value):
    """Convert a characteristics entry such as ``'Sex: Female'`` to a binary code.

    Args:
        value: Raw entry from the gender row, expected in the form
            ``'<label>: <male|female>'`` (case-insensitive).

    Returns:
        int | None: ``1`` for male, ``0`` for female, ``None`` when ``value`` is not
        a string, has no ``': '`` separator, or names anything else.
    """
    # Missing entries (None, float NaN, ...) have no .split(); report them as unknown
    # rather than letting an AttributeError escape.
    if not isinstance(value, str):
        return None
    try:
        # Strip so that trailing/leading whitespace does not defeat the comparison.
        val = value.split(': ')[1].strip().lower()
    except IndexError:
        return None
    return 1 if val == 'male' else 0 if val == 'female' else None

# With no 'Cognitive_Processing' field to read, there is nothing for a trait converter to do.
convert_trait = None

# Persist the cohort's usability record: cohort id, destination JSON path,
# gene-data flag, and whether a trait row was identified.
save_cohort_info(
    'GSE201248',
    './preprocessed/Cognitive_Processing/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Skip clinical feature extraction since trait_row is None
