In [1]:

import sys

# Extend the module search path so the import of `utils.preprocess` in the next
# cell can be resolved (presumably the project root — confirm).
# The membership check keeps this cell idempotent: re-running it no longer
# piles up duplicate entries in sys.path.
# NOTE(review): this is a machine-specific absolute path; consider making it
# configurable before sharing the notebook.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- 1. Locate the cohort's input files --------------------------------------
# The helper is given the cohort directory and hands back two paths, bound here
# as the SOFT file and the matrix file (in that order).
cohort_dir = '/media/techt/DATA/GEO/Lower_Grade_Glioma/GSE107850'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- 2. Pull background text and sample characteristics from the matrix file --
# The two prefix lists are passed to the helper alongside the matrix file; they
# name the '!Series_*' lines wanted as background and the '!Sample_*' lines
# wanted as clinical data.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- 3. Build the sample characteristics dictionary from the clinical frame ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- 4. Print each heading followed by its content on the next line -----------
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Expression based Intrinsic Glioma Subtypes are prognostic in low grade gliomas of the EORTC22033-26033 clinical trial."
!Series_summary	"Introduction: The EORTC22033-26033 clinical trial investigated whether initial temozolomide (TMZ) chemotherapy confers survival advantage compared to radiotherapy (RT) in low grade glioma patients. In this study we performed gene expression profiling on tissues from this trial in order to identify markers associated with progression free survival and treatment response in this well-defined cohort of patients."
!Series_summary	"Methods: Gene expression profiling, performed on 195 samples, was used to assign tumors to one of six intrinsic glioma subtypes (IGS; molecularly similar tumors predefined by unsupervised gene expression analysis) and to extract the cellular composition of immune infiltrates. DNA copy number changes were determined on samples assigned to IGS-16."
!Series_summary	"Results: We confirm that IG

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# Judged from the series description printed above: expression data is present.
is_gene_available = True

# 2. Variable Availability
# 2.1 Row keys in the sample characteristics dictionary (None = not available).
# The trait (Lower_Grade_Glioma) has no dedicated row: per the original
# analysis it is only implied by the sample characteristic descriptions.
trait_row = None
# Age is recorded under key 1.
age_row = 1
# Gender is recorded under key 0.
gender_row = 0

# Converter placeholders. No trait converter is needed because trait_row is
# None; the age and gender converters are defined as functions further down.
convert_trait = None
convert_age = None
convert_gender = None

# 2.3 Data Type Conversion
def convert_age(value):
    """Parse a numeric age out of a ``"<label>: <number>"`` characteristics string.

    Args:
        value: Raw sample-characteristics text; the number is expected right
            after the first ``": "`` separator.

    Returns:
        The age as a ``float``, or ``None`` when ``value`` is not a string,
        has no ``": "`` separator, or the text after it is not numeric.
    """
    try:
        return float(value.split(': ')[1])
    except (AttributeError, TypeError, IndexError, ValueError):
        # Only parsing failures are mapped to None:
        #   AttributeError / TypeError -> value is not a str (e.g. None, NaN, bytes)
        #   IndexError                 -> no ': ' separator present
        #   ValueError                 -> text after the separator is not a number
        # A bare `except:` would also swallow KeyboardInterrupt / SystemExit.
        return None

def convert_gender(value):
    """Encode a ``"<label>: <gender>"`` characteristics string as a binary code.

    Args:
        value: Raw sample-characteristics text; the gender word is expected
            right after the first ``": "`` separator.

    Returns:
        ``0`` for female, ``1`` for male (matched case-insensitively, ignoring
        surrounding whitespace), or ``None`` when ``value`` is not a string,
        has no ``": "`` separator, or names neither gender.
    """
    # Missing cells may arrive as None/NaN; return None instead of raising,
    # matching how convert_age reports unparseable input.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    if len(parts) < 2:
        # No separator, so there is nothing to interpret.
        return None

    # Normalise so 'Female', 'female' and 'FEMALE ' are all recognised.
    gender_str = parts[1].strip().lower()
    return {'female': 0, 'male': 1}.get(gender_str)

# 3. Save Metadata
# Record this cohort's availability flags in the shared cohort-info JSON file.
# The last argument is the trait-availability flag, derived from whether a
# trait row was identified.
save_cohort_info(
    'GSE107850',
    './preprocessed/Lower_Grade_Glioma/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# 4. Clinical Feature Extraction
# Skipped: trait_row is None, so there is no trait column to extract.
