In [1]:

import math
import os
import sys
# Extend the module search path so the `utils.preprocess` import in the next cell can resolve.
# NOTE(review): presumably this directory holds the project's `utils` package — confirm.
# It is an absolute, machine-specific path and will need adjusting on other machines.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's SOFT file and series-matrix file ---
cohort_dir = '/media/techt/DATA/GEO/Esophageal_Cancer/GSE113740'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Read series-level background lines and per-sample characteristics from the matrix file ---
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Collect the distinct values found in each row of the clinical dataframe ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Print both for inspection (each header on its own line, followed by the value) ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"High-sensitive circulating microRNA panel for early detection of hepatocellular carcinoma"
!Series_summary	"A serum miRNA combination could be a powerful classifier for the detection of hepatocellular carcinoma."
!Series_summary	""
!Series_summary	"Keywords: Non-coding RNA profiling by array"
!Series_overall_design	"Serum microRNA profiles of 1817 samples, which consist of 345 with hepatocellular carcinoma,  46 patients with chronic hepatitis, 93 patients with liver cirrhosis, and 1033 non-cancer individuals."
Sample Characteristics Dictionary:
{0: ['tissue: Serum', 'tissue: serum'], 1: ['Sex: Male', 'Sex: Female', 'Sex: F', 'Sex: M', 'Sex: unknown'], 2: ['age: 76', 'age: 80', 'age: 56', 'age: 58', 'age: 85', 'age: 66', 'age: 71', 'age: 67', 'age: 87', 'age: 90', 'age: 73', 'age: 63', 'age: 65', 'age: 57', 'age: 70', 'age: 82', 'age: 68', 'age: 79', 'age: 64', 'age: 69', 'age: 59', 'age: 51', 'age: 61', 'age: 53', 'age: 77', 'age: 47', 'age: 44', 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults for this cohort: no gene expression data, no clinical rows, no converters.
# The statements further down in this cell overwrite the rows and converters.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# 1. Gene Expression Data Availability
# Based on the background information, this dataset is focused on serum microRNA profiles, not gene expression.
# (The printed series title and overall design both describe serum/circulating microRNA profiling.)
is_gene_available = False  # gene expression data is not available; same value as the default above

# 2. Variable Availability and Data Type Conversion

# For 'Esophageal_Cancer':
# NOTE(review): the sample-characteristics dictionary printed in this notebook is cut
# off before key 3, so the contents of this row cannot be checked from the output —
# confirm it really holds the disease-status field before trusting the trait column.
trait_row = 3  # Key with unique disease status values containing 'Esophageal Cancer'
def convert_trait(value):
    """Map a 'key: value' characteristics string to a binary Esophageal_Cancer label.

    Args:
        value: Raw sample-characteristics cell, e.g. 'disease: Esophageal Cancer'.

    Returns:
        1 if the second colon-separated field equals 'esophageal cancer'
        (case-insensitive, surrounding whitespace ignored), 0 for any other
        text in that field, and None if `value` is not a string or contains
        no colon.
    """
    # Explicit guards replace the former bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit and would hide genuine programming errors.
    if not isinstance(value, str):
        return None
    fields = value.split(':')
    if len(fields) < 2:
        return None
    return 1 if fields[1].strip().lower() == 'esophageal cancer' else 0

# For 'age':
age_row = 2  # Key with unique age-related values
def convert_age(value):
    """Parse a 'key: value' characteristics string into a numeric age.

    Args:
        value: Raw sample-characteristics cell, e.g. 'age: 76'.

    Returns:
        The second colon-separated field as a float, or None if `value` is
        not a string, contains no colon, is not numeric, or parses to a
        non-finite number.
    """
    if not isinstance(value, str):
        return None
    fields = value.split(':')
    if len(fields) < 2:
        return None
    # Only a failed numeric parse is expected here; anything else should surface
    # instead of being silently swallowed by a bare `except:`.
    try:
        age = float(fields[1].strip())
    except ValueError:
        return None
    # float() happily parses 'nan' / 'inf' / 'infinity'; such placeholders are
    # missing values, not ages, so report them as None like other unparsable text.
    return age if math.isfinite(age) else None

# For 'gender':
gender_row = 1  # Key with unique gender-related values
def convert_gender(value):
    """Map a 'key: value' characteristics string to a binary gender code.

    Args:
        value: Raw sample-characteristics cell, e.g. 'Sex: Male' or 'Sex: F'.

    Returns:
        1 for 'male'/'m', 0 for 'female'/'f' (case-insensitive, surrounding
        whitespace ignored), and None for any other label (such as
        'unknown'), for non-string input, or when no colon is present.
    """
    # Explicit guards replace the former bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit and would hide genuine programming errors.
    if not isinstance(value, str):
        return None
    fields = value.split(':')
    if len(fields) < 2:
        return None
    codes = {'male': 1, 'm': 1, 'female': 0, 'f': 0}
    # dict.get returns None for unrecognised labels.
    return codes.get(fields[1].strip().lower())

# 3. Save Metadata
cohort_info_path = './preprocessed/Esophageal_Cancer/cohort_info.json'
# How save_cohort_info handles a missing directory is not visible here, so make
# sure the target directory exists before calling it.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE113740', cohort_info_path, is_gene_available, trait_row is not None)

# 4. Clinical Feature Extraction
# Since trait_row is not None, this step is performed
selected_clinical_data = geo_select_clinical_features(
    clinical_data, 'Esophageal_Cancer',
    trait_row, convert_trait,
    age_row, convert_age,
    gender_row, convert_gender,
)
csv_path = './preprocessed/Esophageal_Cancer/trait_data/GSE113740.csv'
# DataFrame.to_csv does not create missing parent directories; without this the
# cell fails on a fresh checkout where ./preprocessed/... has not been created yet.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM3106850': [0.0, 76.0, 1.0], 'GSM3106917': [0.0, 80.0, 0.0], 'GSM3106951': [0.0, 56.0, 0.0], 'GSM3106955': [0.0, 58.0, 1.0], 'GSM3106966': [0.0, 85.0, 1.0], 'GSM3106969': [0.0, 66.0, 1.0], 'GSM3106977': [0.0, 71.0, 1.0], 'GSM3106981': [0.0, 67.0, 1.0], 'GSM3107048': [0.0, 87.0, 1.0], 'GSM3107072': [0.0, 90.0, 1.0], 'GSM3107083': [0.0, 73.0, 1.0], 'GSM3107094': [0.0, 63.0, 1.0], 'GSM3107104': [0.0, 76.0, 1.0], 'GSM3107120': [0.0, 65.0, 1.0], 'GSM3107138': [0.0, 65.0, 0.0], 'GSM3107140': [0.0, 57.0, 1.0], 'GSM3107154': [0.0, 70.0, 0.0], 'GSM3107163': [0.0, 82.0, 1.0], 'GSM3107172': [0.0, 68.0, 0.0], 'GSM3107175': [0.0, 56.0, 1.0], 'GSM3107187': [0.0, 79.0, 1.0], 'GSM3107198': [0.0, 65.0, 0.0], 'GSM3107213': [0.0, 64.0, 1.0], 'GSM3107228': [0.0, 71.0, 1.0], 'GSM3107232': [0.0, 65.0, 0.0], 'GSM3107248': [0.0, 69.0, 0.0], 'GSM3107255': [0.0, 68.0, 1.0], 'GSM3107257': [0.0, 59.0, 1.0], 'GSM3107277': [0.0, 51.0, 1.0], 'GSM3107326': [0.0, 73.0, 0.0], 'GSM3107336': [0.0, 68.0, 1.0], 'GSM310