In [1]:

import sys

# Add the project directory to the module search path.
# NOTE(review): this is a machine-specific absolute path; presumably it is the
# project root that provides the `utils` package used in later cells — confirm.
sys.path.extend(['/home/techt/Desktop/a4s'])


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Lower_Grade_Glioma/GSE28271'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to get the background information and the sample
#    characteristics data; the two prefix lists below are handed to the helper.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each heading followed by its payload on the next line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Isocitrate dehydrogenase 1 (IDH1) mutant gliomas demonstrate a distinct global CpG island methylation profile compared to IDH1 wildtype gliomas using MRSE"
!Series_summary	"In order to identify other molecular aberrations that may cooperate with IDH1R132MUT in gliomagenesis, we performed CpG-island methylation profiling analysis using MSRE (Tran et al. Front. Neurosci. 3:57. Doi: 10.3389/neuro.15.005.2009) on a subset of IDH1R132MUT and IDH1R132WT GBMs and found a distinct pattern of CpG island hypermethylation that was detected in all GBMs and lower grade gliomas with IDH1R132MUT. While absent from nearly all IDH1R132WT glioma, the methylation pattern in IDH1R132MUT GBMs shows similarity to the recently reported CpG island methylator phenotype (CIMP) found to be tightly associated with IDH1R132MUT gliomas(Noushmehr et al. Cancer Cell, Volume 17, Issue 5, 18 May 2010, Pages 510-522, ISSN 1535-6108, DOI: 10.1016/j.ccr.2010.03.017)."
!Series_overall

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: set to False because, based on the provided
# information, this dataset contains methylation data only.
is_gene_available = False

# Converter names start out as None; they are meant to be bound to functions
# when the corresponding variable is applicable.
convert_trait = convert_age = convert_gender = None

# Sample characteristics dictionary provided in STEP 1
sample_characteristics = {
    0: [
        'tumor type: Anaplastic',
        'tumor type: Glioblastoma',
        'tumor type: Lowgrade',
    ],
    1: [
        'restriction enzyme: HpaII',
    ],
    2: [
        'idh1 status: MUT',
        'idh1 status: WT',
    ],
}

# For the 'Lower_Grade_Glioma' variable, 'idh1 status' (key 2) is used as a proxy.
# NOTE(review): key 0 ('tumor type') states the grade category directly and may
# be a closer match for this trait than IDH1 status — confirm the intended choice.
trait_row = 2

# Age and gender are not mentioned in the sample characteristics dictionary.
age_row = gender_row = None

def convert_trait(value):
    """Map an 'idh1 status' characteristic string to a binary label.

    Args:
        value: Raw characteristic text such as 'idh1 status: MUT'. Only the
            text after the last ':' is examined, with surrounding whitespace
            removed.

    Returns:
        1 when the status is 'MUT', 0 when it is 'WT' (compared
        case-insensitively), and None when the value is not a string or the
        status is anything else.
    """
    # A non-string value (e.g. None or a float NaN from a missing cell) has no
    # .split method; treat it as unknown instead of raising AttributeError.
    if not isinstance(value, str):
        return None
    status = value.split(':')[-1].strip().upper()
    # Any status other than MUT/WT falls through to None via dict.get.
    return {'MUT': 1, 'WT': 0}.get(status)

def convert_age(value):
    """Placeholder age converter: age is not available, so any input yields None."""
    return None

def convert_gender(value):
    """Placeholder gender converter: gender is not available, so any input yields None."""
    return None

# Save Metadata: pass the cohort ID, the output JSON path, the gene-availability
# flag, and whether a trait row was identified.
save_cohort_info(
    'GSE28271',
    './preprocessed/Lower_Grade_Glioma/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# No clinical data is extracted in this cell.
# NOTE(review): the earlier note attributed this to age_row and gender_row being
# None, but trait_row is set; presumably extraction is skipped because
# is_gene_available is False — confirm.
