In [1]:

import sys
# Extend the module search path so later cells can import project packages
# (presumably where `utils.preprocess` lives — confirm).
# NOTE(review): this is an absolute, machine-specific path; it is unlikely to
# exist on other hosts.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the matrix file that belong to this cohort
cohort_dir = '/media/techt/DATA/GEO/Poor_Metabolizer_of_Drugs/GSE225292'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background information and the sample characteristics data
#    out of the matrix file, selecting lines by their prefixes
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each heading followed by its content on the next line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"miRNA transcriptome profiling in early-stage luminal breast cancer"
!Series_summary	"Oxysterols, oxidized derivatives of cholesterol, act in breast cancer (BC) as selective estrogen receptor modulators and affect cholesterol homeostasis, drug transport, nuclear and cell receptors, and other signaling proteins. Using overlapping data from patients with early-stage estrogen receptor-positive BC—high-coverage targeted DNA sequencing (99 patients, 113 genes), mRNA sequencing (67 patients), and full miRNome by microarrays (123 patients)—we describe complex mRNA-miRNA and miRNA-miRNA interaction (correlation) networks, with validation in two carefully curated public datasets (n=538 in total) and 11 databases. The ESR1-CH25H-INSIG1-ABCA9 axis was the most prominent, being interconnected through hsa-miR-125b-5p, but also hsa-miR-99a-5p, hsa-miR-100-5p, hsa miR 143 3p, hsa-199b-5p, hsa-miR-376a-3p, and hsa-miR-376c-3p. Mutations in SC5D, CYP46A1, and its f

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability.
# The dataset title includes "miRNA transcriptome profiling", so this cohort is
# treated as having no gene expression data: is_gene_available stays False.
is_gene_available = False

# Row keys in the sample characteristics dictionary (None = not available):
# - 'Poor_Metabolizer_of_Drugs' (trait): not available
# - 'age': not available
# - 'gender': listed under key 1 as ['Sex: female']
# NOTE(review): only one value ('female') is listed for key 1, so this field may
# be constant across samples — confirm it should count as available.
trait_row, age_row, gender_row = None, None, 1

# Converters start out undefined; a function is defined further down only for
# variables that are available.
convert_trait, convert_age, convert_gender = None, None, None

# Define convert_gender function for data type conversion (female to 0, male to 1)
def convert_gender(value):
    """Translate a raw sample-characteristics entry into a binary gender code.

    The text after the last ':' is trimmed and lower-cased before matching,
    so 'Sex: female' and 'FEMALE' are treated alike.

    Args:
        value: Raw cell content, e.g. 'Sex: female'.

    Returns:
        0 for 'female', 1 for 'male', and None for any other string or for
        a non-string input.
    """
    # Anything that is not text cannot carry a gender label.
    if not isinstance(value, str):
        return None

    # Keep only the part following the final colon (the whole string if none).
    label = value.rsplit(':', 1)[-1].strip().lower()
    codes = {'female': 0, 'male': 1}
    return codes.get(label)

# Record this cohort's metadata in the shared cohort-info JSON file.
# The trailing False is passed as a literal alongside is_gene_available;
# presumably it is the trait-availability flag — confirm against
# save_cohort_info's signature.
save_cohort_info(
    'GSE225292',
    './preprocessed/Poor_Metabolizer_of_Drugs/cohort_info.json',
    is_gene_available,
    False,
)
