In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Cognitive_Processing/GSE249238'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pull series-level background text and per-sample
# clinical annotations out of the matrix file.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces so the trait/age/gender rows can be identified by eye.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Individual Variability in Human Cell Type Transcriptomes and Epigenomes [OM1B]"
!Series_summary	"Diversity and individual variability are essential to human cognitive function. Identifying the conserved and variable (epi)genomic signatures of the brain’s cellular components is critical for understanding the neurobiological basis of individual variation in brain function. We applied single nucleus methylome and transcriptome sequence (snmCT-seq) to neurons from the frontal cortex of 11 adult human donors spanning a range of ages from 23 to 74, including males and females (Broadmann Area BA46). We clustered cells into brain cell types based on methylation features. We then examined the transcriptome and epigenome features in each cell type between and within individual donors. Taking advantage of the multimodal measurements in single cells, we also identified the relation between RNA expression and methylation level.These data with multiomics measur

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; the function definitions below replace them.
convert_trait, convert_age, convert_gender = None, None, None

# The dataset description mentions transcriptome data, so gene expression
# data is treated as likely available.
is_gene_available = True

# Row keys in the sample characteristics dictionary for each variable.
# Cognitive_Processing is neither stated nor inferable from the given data.
trait_row = None
# 'age' information was found at key 1 and 'Sex' information at key 2.
age_row, gender_row = 1, 2

# Define the data type conversion functions
def convert_trait(value):
    """Placeholder trait converter: always returns None.

    No 'Cognitive_Processing' information is provided for this cohort, so
    every input maps to None regardless of its content.
    """
    return None

def convert_age(value):
    """Parse the age from a 'label: value' sample-characteristics string.

    Args:
        value: Raw cell text such as 'age: 23'.

    Returns:
        The part after the first ': ' as a float, or None when the input is
        not a string, has no ': ' separator, or the value part is not numeric.
    """
    try:
        return float(value.split(': ')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a str; IndexError: no ': '
        # separator; ValueError: non-numeric value. Anything else (for
        # example KeyboardInterrupt) is deliberately allowed to propagate
        # instead of being silently swallowed by a bare except.
        return None

def convert_gender(value):
    """Map a 'label: value' sex string to a binary code.

    Args:
        value: Raw cell text such as 'Sex: Male'.

    Returns:
        1 for 'male', 0 for 'female' (case-insensitive, surrounding
        whitespace ignored), or None for any other value, for text without a
        ': ' separator, and for non-string input.
    """
    try:
        gender_value = value.split(': ')[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Malformed or missing cell: report "unknown" instead of raising,
        # matching how convert_age treats unparseable input.
        return None
    return {'male': 1, 'female': 0}.get(gender_value)

# Save metadata: cohort ID, output path, gene-data flag and trait-data flag.
save_cohort_info(
    'GSE249238',
    './preprocessed/Cognitive_Processing/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted and written out only when a trait row exists.
if trait_row is not None:
    csv_path = './preprocessed/Cognitive_Processing/trait_data/GSE249238.csv'
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Cognitive_Processing',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
