In [1]:

import sys

# Directory added to the import search path; presumably where the project's
# `utils` package (imported in the next cell) lives -- confirm on your machine.
PROJECT_ROOT = '/home/techt/Desktop/a4s'

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- 1. Resolve the soft file and the matrix file inside the cohort directory ---
cohort_dir = '/media/techt/DATA/GEO/Lower_Grade_Glioma/GSE4058'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- 2. Extract background lines and clinical lines from the matrix file ---
# The prefixes select which matrix-file lines feed each of the two results.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- 3. Build the sample characteristics dictionary from the clinical dataframe ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- 4. Print each result under its heading (heading and value on separate lines) ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profiling reveals molecularly and clinically distinct subtypes of glioblastoma multiforme"
!Series_summary	"Glioblastoma multiforme (GBM) is the most common form of malignant glioma and is characterized by marked genetic instability, extensive intra-tumoral histopathological variability, and unpredictable variation in its clinical behavior. We investigated global gene expression in surgical samples of primary brain tumors. Gene expression profiling revealed large differences between normal brain samples and tumor tissues and between GBMs and lower grade oligodendroglial tumors. Extensive differences in gene expression were found among GBMs, particularly in genes involved in angiogenesis, immune cell infiltration, and extracellular matrix re-modeling. Significantly, we found that the global gene expression patterns in paired specimens from the same GBM invariably were more closely related to each other than to any other tumor, even 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability, as judged by the author from the dataset information.
is_gene_available = True

# Row keys into the sample characteristics dictionary; None means the variable
# was not found for this cohort.
trait_row = age_row = gender_row = None  # set to different values when applicable

# Inspect the dictionary computed from this cohort's clinical data in Step 1
# (`sample_characteristics_dict`) instead of a hard-coded placeholder, so the
# decision below reflects the dataset that was actually loaded.
sample_characteristics = sample_characteristics_dict

# Key 0 is accepted as the trait row only when it takes more than one distinct
# value; a single constant value cannot separate samples.
# NOTE(review): this assumes key 0 describes the tumor type -- confirm against
# the dictionary printed in Step 1 before relying on trait_row.
if 0 in sample_characteristics and len(set(sample_characteristics[0])) > 1:
    trait_row = 0

# Convert functions for each variable
def convert_trait(value):
    """Parse the trait field of a 'key: value' characteristics entry as an int.

    Args:
        value: Raw characteristics entry. The text between the first and the
            second ':' (or the end of the string) is parsed.

    Returns:
        The parsed integer, or None when the entry is not a string, contains
        no ':' separator, or the field is not an integer literal.
    """
    try:
        return int(value.split(':')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parse failures map to None; KeyboardInterrupt and SystemExit
        # are no longer swallowed as they were by the bare `except:`.
        return None

def convert_age(value):
    """Parse the age field of a 'key: value' characteristics entry as a float.

    Args:
        value: Raw characteristics entry. The text between the first and the
            second ':' (or the end of the string) is parsed.

    Returns:
        The parsed float, or None when the entry is not a string, contains
        no ':' separator, or the field is not a valid number.
    """
    try:
        return float(value.split(':')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parse failures map to None; KeyboardInterrupt and SystemExit
        # are no longer swallowed as they were by the bare `except:`.
        return None

def convert_gender(value):
    """Map the gender field of a 'key: value' characteristics entry to 1 or 0.

    Args:
        value: Raw characteristics entry. The text between the first and the
            second ':' (or the end of the string) is stripped and lower-cased
            before matching.

    Returns:
        1 for 'male', 0 for 'female', and None for any other field value or
        for an entry that is not a string or contains no ':' separator.
    """
    try:
        gender = value.split(':')[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Malformed entries yield None, matching convert_trait/convert_age,
        # instead of raising and aborting the whole conversion.
        return None
    return {'male': 1, 'female': 0}.get(gender)

# Hand the cohort ID, the JSON path and the two availability flags to
# save_cohort_info; the trait flag is True only if a trait row was identified.
save_cohort_info(
    'GSE4058',
    './preprocessed/Lower_Grade_Glioma/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)
