In [1]:

import os
import sys
# Extend the module search path so that project-local packages (such as the
# `utils.preprocess` import used in a later cell) can be imported.
# NOTE(review): machine-specific absolute path, presumably the repository
# root -- confirm, and consider making it configurable.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the SOFT file and the series matrix file of this cohort ---
cohort_dir = '/media/techt/DATA/GEO/Epilepsy/GSE156374'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Read background information and sample characteristics from the matrix file ---
# The prefixes below name the matrix-file fields of interest: series-level
# fields for the background text, sample-level fields for the clinical table.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Build the sample characteristics dictionary from the clinical dataframe ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Print both so they can be inspected when choosing the trait/age/gender rows ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"DNA methylation and copy number profiling in polymicrogyria"
!Series_summary	"Polymicrogyria (PMG) is a developmental cortical malformation characterized by an excess of small and frustrane gyration and abnormal cortical lamination. PMG frequently associates with seizures. The molecular pathomechanisms underlying PMG development are not yet understood. About 40 genes have been associated with PMG, and small copy number variations have also been described in selected patients. We recently provided evidence that epilepsy-associated structural brain lesions can be classified based on genomic DNA methylation patterns. Here we analyzed 27 PMG patients employing array-based DNA-methylation profiling on formalin-fixed paraffin-embedded material. A series of 63 well-characterized non-PMG cortical malformations (focal cortical dysplasia type 2a/b and hemimegalencephaly), temporal lobe epilepsy, and non-epilepsy autopsy controls was used as reference cohort

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Data availability for cohort GSE156374.
#
# Gene expression: not available. According to the background information
# printed in Step 1, this series is a DNA methylation / copy number profiling
# study, not a gene expression study.
is_gene_available = False

# Sample characteristics dictionary printed in Step 1:
# {0: ['tissue: Cortex', 'tissue: White Matter'],
#  1: ['diagnosis: PMG_1q', 'diagnosis: PMG', 'diagnosis: CTRL - NCx',
#      'diagnosis: CTRL - WM', 'diagnosis: FCD 2A', 'diagnosis: FCD 2B',
#      'diagnosis: HME', 'diagnosis: TLE']}
#
# Trait (Epilepsy): there is no dedicated epilepsy field, so the trait is
# derived from the diagnosis labels in row 1.
trait_row = 1

# Age and gender: neither appears in the sample characteristics dictionary.
age_row = None
gender_row = None

def convert_trait(value):
    """Convert a raw ``diagnosis: <label>`` characteristic into a binary Epilepsy label.

    Parameters
    ----------
    value : str
        A sample characteristic string such as ``'diagnosis: TLE'``.

    Returns
    -------
    int or None
        ``0`` for the control labels (``CTRL - NCx``, ``CTRL - WM``), ``1`` for
        the lesion / epilepsy labels, and ``None`` when ``value`` is not a
        string, carries no ``:`` separator, or holds an unrecognised label.
    """
    mapping = {
        'CTRL - NCx': 0,
        'CTRL - WM': 0,
        'PMG_1q': 1,
        'PMG': 1,
        # Assumption: these cortical malformations are treated as
        # epilepsy-associated, following the series summary.
        'FCD 2A': 1,
        'FCD 2B': 1,
        'HME': 1,
        'TLE': 1,
    }
    # Missing cells (None / NaN) and malformed entries yield None rather than
    # raising AttributeError / IndexError in the middle of the extraction.
    if not isinstance(value, str) or ':' not in value:
        return None
    # Split on the first colon only and strip, so stray whitespace around the
    # label does not prevent a match.
    label = value.partition(':')[2].strip()
    return mapping.get(label)

# Age and gender are not recorded for this cohort (age_row and gender_row are
# None), so both converters are placeholders that map every value to None.
def convert_age(value):
    """Placeholder converter: age is unavailable, so always return None."""
    return None


def convert_gender(value):
    """Placeholder converter: gender is unavailable, so always return None."""
    return None


# Save cohort info: gene-data availability and whether a trait row was found.
save_cohort_info('GSE156374', './preprocessed/Epilepsy/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction (only possible when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Epilepsy',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Epilepsy/trait_data/GSE156374.csv'
    # to_csv does not create missing parent directories; make sure the output
    # folder exists so a run from a fresh checkout does not fail here.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4729724': [1], 'GSM4729725': [1], 'GSM4729726': [1], 'GSM4729727': [1], 'GSM4729728': [1], 'GSM4729729': [1], 'GSM4729730': [1], 'GSM4729731': [1], 'GSM4729732': [1], 'GSM4729733': [1], 'GSM4729734': [1], 'GSM4729735': [1], 'GSM4729736': [1], 'GSM4729737': [1], 'GSM4729738': [1], 'GSM4729739': [1], 'GSM4729740': [1], 'GSM4729741': [1], 'GSM4729742': [1], 'GSM4729743': [1], 'GSM4729744': [1], 'GSM4729745': [1], 'GSM4729746': [1], 'GSM4729747': [1], 'GSM4729748': [1], 'GSM4729749': [1], 'GSM4729750': [0], 'GSM4729751': [0], 'GSM4729752': [0], 'GSM4729753': [0], 'GSM4729754': [0], 'GSM4729755': [0], 'GSM4729756': [0], 'GSM4729757': [0], 'GSM4729758': [0], 'GSM4729759': [0], 'GSM4729760': [0], 'GSM4729761': [0], 'GSM4729762': [0], 'GSM4729763': [1], 'GSM4729764': [1], 'GSM4729765': [1], 'GSM4729766': [1], 'GSM4729767': [1], 'GSM4729768': [1], 'GSM4729769': [1], 'GSM4729770': [1], 'GSM4729771': [1], 'GSM4729772': [1], 'GSM4729773': [1], 'GSM4729774': [1], 'GSM4729775': [1], 'GSM4729776