In [1]:

import sys
# Extend the module search path with a local checkout directory.
# NOTE(review): presumably this is where the `utils` package imported in the next
# cell lives; the path is absolute and machine-specific, so confirm/adjust it
# before running on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Rectal_Cancer/GSE138092'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Extract the background lines and the per-sample characteristics from the matrix file.
#    The prefixes select which '!Series_*' / '!Sample_*' lines are handed to the helper.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe into a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the sample characteristics dictionary,
#    each preceded by its heading on a separate line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"microRNA expression Data from cerebrospinal fluid of cancer patients"
!Series_summary	"Purpose: Leptomeningeal metastasis (LM) is a dismal terminal stage disease of solid cancer without definitive treatment. Both the limitation of cerebrospinal fluid (CSF) sample volume and a paucity of floating cancer cells are difficulties to study the genomic profiling of LM. As the profiling of microRNAs reflect the strategy and behavior of cancer cells to survive, and CSF is carrying micro-molecules from central nervous system (CNS), we evaluated the extracellular microRNA profiles of CSF from different CNS tumor status including LM.  Materials and Methods: We prospectively collected CSF from 65 patients of five groups of cancer control (CC), healthy control (HC), LM, brain metastasis (BM); and brain tumor (BT). Extracellular RNA was extracted from 2 mL of CSF after proper cell down, and preceded to small RNA microarray with Affymetrix miRNA 4.0 microarray ch

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The conversion callbacks start out unset; the function definitions later in
# this cell replace them.
convert_trait = convert_age = convert_gender = None

# Hand-copied snapshot of the sample characteristics printed by STEP 1
# (row index -> unique values observed in that row).
Sample_Characteristics = {
    0: ['tissue: Cerebrospinal fluids'],
    1: [
        'age: 77', 'age: 61', 'age: 9', 'age: 3', 'age: 4', 'age: 78',
        'age: 6', 'age: 52', 'age: 46', 'age: 2.4', 'age: 49', 'age: 54',
        'age: 72', 'age: 63', 'age: 68', 'age: 50', 'age: 62', 'age: 55',
        'age: 66', 'age: 36', 'age: 67', 'age: 56', 'age: 71', 'age: 69',
        'age: 57', 'age: 33', 'age: 47', 'age: 44', 'age: 42', 'age: 58',
    ],
}

# 1. Gene Expression Data Availability
# The series is a microRNA profiling dataset, so it is not counted as gene
# expression data.
is_gene_available = False

# 2. Variable Availability and Data Type Conversion

# 2.1 Data availability: trait ('Rectal_Cancer')
# Neither the characteristics nor the series summary refer to rectal cancer.
trait_row = None

# 2.1 Data availability: age
# Row 1 carries the ages; it is only usable when it holds more than one
# distinct value.
age_row = 1 if len({entry.split(': ')[1] for entry in Sample_Characteristics[1]}) > 1 else None

# 2.1 Data availability: gender
# No characteristics row mentions gender.
gender_row = None

# 2.3 Data Type Conversion Functions

# Convert Trait Function (Not Available in this dataset)
def convert_trait(value):
    """Placeholder trait converter for a cohort without a trait row.

    `trait_row` is None for this dataset, so there is nothing to decode and
    every input maps to None.
    """
    return None

# Convert Age Function
def convert_age(value):
    """Parse the numeric age from a characteristics entry such as 'age: 77'.

    Args:
        value: A sample-characteristics cell, expected to be a string of the
            form '<label>: <number>'.

    Returns:
        The number after the first ': ' separator as a float, or None when the
        input is not a string, has no ': ' separator, or the text after the
        separator is not a valid number.
    """
    # Non-string cells (e.g. None or a float NaN for a missing value) have no
    # .split(); previously they escaped as an uncaught AttributeError.
    if not isinstance(value, str):
        return None
    try:
        return float(value.split(': ')[1])
    except (ValueError, IndexError):
        # ValueError: non-numeric text; IndexError: separator missing.
        return None

# Convert Gender Function (Not Available in this dataset)
def convert_gender(value):
    """Placeholder gender converter for a cohort without a gender row.

    `gender_row` is None for this dataset, so there is nothing to decode and
    every input maps to None.
    """
    return None

# 3. Save Metadata
# NOTE(review): the positional arguments look like (cohort id, output JSON path,
# gene-data flag, trait-data flag) -- confirm against utils.preprocess.
save_cohort_info(
    'GSE138092',
    './preprocessed/Rectal_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,  # False here, because trait_row is None
)

# 4. Clinical Feature Extraction
# Skipped: trait_row is None, so no further code is needed in this step.
