In [1]:

import sys
# Extend the module search path; presumably this is the project root that provides
# the `utils` package imported by the next cell — TODO confirm.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): cohort_dir is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
cohort_dir = '/media/techt/DATA/GEO/Lung_Cancer/GSE248830'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefix lists are passed to the helper to select which metadata lines count as
# series-level background and which as per-sample clinical data
# (presumably matched against the start of each line — confirm against the helper).
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# The row keys of this dictionary are what the trait/age/gender row indices in the
# next step refer to.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
# Printed so the variable availability can be judged by inspection in the next step.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Unlocking Molecular mechanisms and identifying druggable targets in matched-paired brain metastasis of Breast and Lung cancers "
!Series_summary	"Introduction: The incidence of brain metastases in cancer patients is increasing, with lung and breast cancer being the most common sources. Despite advancements in targeted therapies, the prognosis remains poor, highlighting the importance to investigate the underlying mechanisms in brain metastases. The aim of this study was to investigate the differences in the molecular mechanisms involved in brain metastasis of breast and lung cancers. In addition, we aimed to identify cancer lineage-specific druggable targets in the brain metastasis. Methods: To that aim, a cohort of 44 FFPE tissue samples, including 22 breast cancer and 22 lung adenocarcinoma (LUAD) and their matched-paired brain metastases were collected. Targeted gene expression profiles of primary tumors were compared to their matched-paired br

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; the actual functions are defined below when applicable.
convert_trait = convert_age = convert_gender = None

# The series summary describes targeted gene expression profiles, so gene
# expression data is treated as likely available.
is_gene_available = True

# Keys into the sample characteristics dictionary for each variable
# (None means the variable was not identified there).
trait_row = None  # No direct mention of the Lung_Cancer trait in the sample characteristics.
age_row, gender_row = 0, 1  # Age identified under key 0, gender under key 1.

# Define conversion functions based on the data available.

# Convert trait: No conversion function needed as trait_row is None.
def convert_trait(trait_value):
    """Placeholder trait converter.

    No trait row was identified for this cohort, so every input maps to None.

    Args:
        trait_value: Ignored.

    Returns:
        Always None.
    """
    return None

# Convert age to continuous data type.
def convert_age(age_value):
    """Convert a 'key: value' age entry into a float.

    The previous implementation gated parsing on ``str.isnumeric()``, which
    rejects decimal ages such as ``'65.5'``, and raised ``AttributeError`` on
    non-string cells (e.g. NaN). Both cases are handled here.

    Args:
        age_value: Raw sample-characteristics entry, expected to look like
            ``'age: 65'``. Non-string inputs are treated as missing.

    Returns:
        The age as a float, or None when the entry has no ``':'`` separator,
        the value is not a number, or the number is negative or non-finite.
    """
    import math  # local import keeps the block self-contained in the notebook

    if not isinstance(age_value, str):
        return None

    # Split on the first colon only; tolerate a missing space after it.
    _, separator, raw = age_value.partition(':')
    if not separator:
        return None

    try:
        age = float(raw.strip())
    except ValueError:
        return None

    # float() accepts 'nan' and 'inf'; neither is a usable age, nor is a negative number.
    if not math.isfinite(age) or age < 0:
        return None
    return age

# Convert gender to binary data type: female to 0, male to 1.
def convert_gender(gender_value):
    """Convert a 'key: value' gender entry into a binary code.

    The previous implementation raised ``AttributeError`` on non-string cells
    (e.g. NaN) and returned None for valid values carrying surrounding
    whitespace or lacking a space after the colon.

    Args:
        gender_value: Raw sample-characteristics entry, expected to look like
            ``'gender: female'``. Non-string inputs are treated as missing.

    Returns:
        0 for female, 1 for male (case-insensitive), or None when the entry
        has no ``':'`` separator or the value is anything else.
    """
    if not isinstance(gender_value, str):
        return None

    # Split on the first colon only; tolerate a missing space after it.
    _, separator, raw = gender_value.partition(':')
    if not separator:
        return None

    # dict.get returns None for any label other than the two recognised ones.
    return {'female': 0, 'male': 1}.get(raw.strip().lower())

# Save cohort information: the cohort id, the target JSON path, the gene
# availability flag, and whether a trait row was identified.
save_cohort_info(
    'GSE248830',
    './preprocessed/Lung_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Since trait_row is None, we skip the clinical feature extraction.


A new JSON file was created at: ./preprocessed/Lung_Cancer/cohort_info.json
