In [1]:

import sys
# Extend the module search path with the project checkout, presumably so the
# `utils` package imported in the next cell can be found.
# NOTE(review): this is a hard-coded absolute path to one user's machine; the
# notebook will not run elsewhere unless the path is adjusted.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Esophageal_Cancer/GSE164174'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the series-level background lines and the per-sample characteristic
#    lines out of the matrix file, selected by their line prefixes
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the sample characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"A novel combination of serum microRNAs for the detection of early gastric cancer"
!Series_summary	"A serum miRNA combination could be a powerful classifier for the detection of patients with early stage gastric cancer."
!Series_overall_design	"Serum microRNA profiles of 2940 samples, which consist of 1423 gastric cancers, 1417 non-cancer controls, 50 esophageal cancers, and 50 colorectal cancers."
Sample Characteristics Dictionary:
{0: ['disease state: Gastric Cancer', 'disease state: Non-cancer B', 'disease state: Non-cancer A', 'disease state: Non-cancer C', 'disease state: Colorectal Cancer', 'disease state: Esophageal Cancer'], 1: ['Sex: Male', 'Sex: Female'], 2: ['age: 73', 'age: 59', 'age: 71', 'age: 60', 'age: 57', 'age: 47', 'age: 66', 'age: 74', 'age: 64', 'age: 52', 'age: 20', 'age: 70', 'age: 79', 'age: 61', 'age: 72', 'age: 75', 'age: 44', 'age: 62', 'age: 76', 'age: 78', 'age: 35', 'age: 49', 'age: 37', 'age: 55', 'age: 46', 'age: 56'

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import numpy as np

# Placeholders: row indices stay None when a variable is unavailable, and the
# converter names are rebound to real functions further down in this cell.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check gene expression data availability
# From the background information, we know the study is about serum microRNAs, not gene expression
is_gene_available = False

# Determine the availability of variables
# NOTE(review): this literal replaces the dictionary computed from the matrix
# file in the previous cell; confirm it still matches the data if the cohort
# files are refreshed.
sample_characteristics_dict = {
    0: ['disease state: Gastric Cancer', 'disease state: Non-cancer B', 'disease state: Non-cancer A', 'disease state: Non-cancer C', 'disease state: Colorectal Cancer', 'disease state: Esophageal Cancer'],
    1: ['Sex: Male', 'Sex: Female'], 
    2: ['age: 73', 'age: 59', 'age: 71', 'age: 60', 'age: 57', 'age: 47', 'age: 66', 'age: 74', 'age: 64', 'age: 52', 'age: 20', 'age: 70', 'age: 79', 'age: 61', 'age: 72', 'age: 75', 'age: 44', 'age: 62', 'age: 76', 'age: 78', 'age: 35', 'age: 49', 'age: 37', 'age: 55', 'age: 46', 'age: 56', 'age: 36', 'age: 68', 'age: 51', 'age: 50'], 
    3: ['Stage: IA', 'Stage: IB', 'Stage: II', 'Stage: IIA', 'Stage: IIB', np.nan, 'Stage: I'], 
    4: ['histological subtype: diff', 'histological subtype: undiff', 'histological subtype: special', np.nan]
}

# 'Esophageal_Cancer' (trait): available when row 0 lists the esophageal cancer label
if 'disease state: Esophageal Cancer' in sample_characteristics_dict[0]:
    trait_row = 0

# 'age': available when row 2 holds more than one distinct value
if len(set(sample_characteristics_dict[2])) > 1:
    age_row = 2

# 'gender': available when row 1 holds exactly two distinct values
if len(set(sample_characteristics_dict[1])) == 2:
    gender_row = 1

# Define conversion functions
def convert_trait(value):
    """Encode a ``'<field>: <label>'`` characteristic string as a binary trait.

    Args:
        value: A sample-characteristics cell, e.g.
            ``'disease state: Esophageal Cancer'``. Non-string cells (such as
            NaN or None) are accepted and treated as missing.

    Returns:
        1 if the label equals ``'esophageal cancer'`` (case-insensitive,
        surrounding whitespace ignored), 0 for any other label, and None when
        the cell is not a string or does not split into exactly two parts on
        ``': '``.
    """
    # Explicit validation instead of a bare `except:`, which would also hide
    # KeyboardInterrupt/SystemExit and unrelated programming errors.
    if not isinstance(value, str):
        return None
    parts = value.split(": ")
    if len(parts) != 2:
        return None
    return 1 if parts[1].strip().lower() == 'esophageal cancer' else 0

def convert_age(value):
    """Parse a ``'<field>: <number>'`` characteristic string into a float age.

    Args:
        value: A sample-characteristics cell, e.g. ``'age: 73'``. Non-string
            cells (such as NaN or None) are accepted and treated as missing.

    Returns:
        The numeric part as a float, or None when the cell is not a string,
        does not split into exactly two parts on ``': '``, or the second part
        is not parseable as a number.
    """
    if not isinstance(value, str):
        return None
    parts = value.split(": ")
    if len(parts) != 2:
        return None
    try:
        return float(parts[1].strip())
    except ValueError:
        # Only a non-numeric label is expected here; anything else should surface
        # rather than be swallowed by a bare `except:`.
        return None

def convert_gender(value):
    """Encode a ``'<field>: <label>'`` characteristic string as binary gender.

    Args:
        value: A sample-characteristics cell, e.g. ``'Sex: Male'``. Non-string
            cells (such as NaN or None) are accepted and treated as missing.

    Returns:
        1 for ``'male'``, 0 for ``'female'`` (both case-insensitive, surrounding
        whitespace ignored), and None for anything else: non-string cells,
        cells that do not split into exactly two parts on ``': '``, or
        unrecognised labels.
    """
    if not isinstance(value, str):
        return None
    parts = value.split(": ")
    if len(parts) != 2:
        return None
    label = parts[1].strip().lower()
    if label == 'male':
        return 1
    if label == 'female':
        return 0
    # Unrecognised labels are missing data; they must not be silently coded as
    # female (0), which is what a plain male/else split would do.
    return None

import os

# Save metadata: record the cohort ID together with the gene-data and
# trait-data availability flags in the cohort info JSON file
save_cohort_info('GSE164174', './preprocessed/Esophageal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only when the trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Esophageal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Esophageal_Cancer/trait_data/GSE164174.csv'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the output folder exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4998853': [0.0, 73.0, 1.0], 'GSM4998854': [0.0, 59.0, 1.0], 'GSM4998855': [0.0, 71.0, 0.0], 'GSM4998856': [0.0, 60.0, 1.0], 'GSM4998857': [0.0, 57.0, 1.0], 'GSM4998858': [0.0, 47.0, 0.0], 'GSM4998859': [0.0, 66.0, 0.0], 'GSM4998860': [0.0, 74.0, 0.0], 'GSM4998861': [0.0, 64.0, 1.0], 'GSM4998862': [0.0, 52.0, 0.0], 'GSM4998863': [0.0, 64.0, 0.0], 'GSM4998864': [0.0, 20.0, 1.0], 'GSM4998865': [0.0, 70.0, 1.0], 'GSM4998866': [0.0, 71.0, 0.0], 'GSM4998867': [0.0, 60.0, 1.0], 'GSM4998868': [0.0, 79.0, 0.0], 'GSM4998869': [0.0, 61.0, 0.0], 'GSM4998870': [0.0, 72.0, 1.0], 'GSM4998871': [0.0, 75.0, 1.0], 'GSM4998872': [0.0, 66.0, 0.0], 'GSM4998873': [0.0, 64.0, 1.0], 'GSM4998874': [0.0, 44.0, 1.0], 'GSM4998875': [0.0, 62.0, 1.0], 'GSM4998876': [0.0, 72.0, 1.0], 'GSM4998877': [0.0, 76.0, 1.0], 'GSM4998878': [0.0, 52.0, 1.0], 'GSM4998879': [0.0, 78.0, 0.0], 'GSM4998880': [0.0, 35.0, 1.0], 'GSM4998881': [0.0, 66.0, 1.0], 'GSM4998882': [0.0, 49.0, 1.0], 'GSM4998883': [0.0, 79.0, 0.0], 'GSM499