In [1]:

import sys

# Put the project checkout on the import path (presumably where the `utils`
# package used by the following cells lives -- confirm). The membership check keeps
# re-running this cell from stacking duplicate entries onto sys.path.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *

# Resolve the soft file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Esophageal_Cancer/GSE122497'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the matrix-file reader: the first list selects the
# series-level background lines, the second the per-sample clinical lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as the unique values found in each row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print everything needed to decide which rows hold trait / age / gender.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Large-scale serum microRNA profiling for developing an esophageal squamous cell carcinoma detection model"
!Series_summary	"A serum miRNA combination could be a powerful classifier for the detection of esophageal squamous cell carcinoma."
!Series_overall_design	"Serum microRNA profiles of 5531 samples, which consist of 566 of esophageal squamous cell carcinoma and 4965 of non-cancer controls."
Sample Characteristics Dictionary:
{0: ['tissue: Serum'], 1: ['disease status: Esophageal cancer', 'disease status: Non-cancer 3', 'disease status: Non-cancer 2', 'disease status: Non-cancer 1'], 2: ['Sex: Female', 'Sex: Male'], 3: ['age: 57', 'age: 73', 'age: 72', 'age: 56', 'age: 60', 'age: 71', 'age: 65', 'age: 64', 'age: 77', 'age: 68', 'age: 52', 'age: 86', 'age: 76', 'age: 75', 'age: 69', 'age: 79', 'age: 61', 'age: 66', 'age: 59', 'age: 63', 'age: 78', 'age: 82', 'age: 37', 'age: 62', 'age: 67', 'age: 74', 'age: 81', 'age: 70', 'age: 49', 'age: 45'], 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Pessimistic defaults; later cells overwrite whichever of these apply.
is_gene_available = False

# Row indices into the sample characteristics (None = not available).
trait_row = None
age_row = None
gender_row = None

# Converter callables, defined once the corresponding rows are identified.
convert_trait = None
convert_age = None
convert_gender = None


### Step 2.1 Data Availability

In [4]:
is_gene_available = False  # The dataset is related to miRNA profiling, per the given series title and summary.


def _has_multiple_values(characteristics, row):
    """Return True when `row` of `characteristics` holds more than one distinct value.

    `characteristics` maps a row index to a list of 'field: value' strings.
    A missing row, or an entry without the ': ' separator, is ignored rather
    than raising KeyError / IndexError, so one malformed entry cannot abort
    the availability check.
    """
    distinct_values = {
        item.split(': ')[1]
        for item in characteristics.get(row, [])
        if isinstance(item, str) and ': ' in item
    }
    return len(distinct_values) > 1


# A row is only recorded when it actually varies across samples; a constant
# row is left at its previous value.

# Disease Status: Key 1
if _has_multiple_values(sample_characteristics_dict, 1):
    trait_row = 1

# Age: Key 3
if _has_multiple_values(sample_characteristics_dict, 3):
    age_row = 3

# Gender: Key 2
if _has_multiple_values(sample_characteristics_dict, 2):
    gender_row = 2


### Step 2.3 Data Type Conversion

In [5]:
def convert_trait(value):
    """Convert a 'disease status: <status>' string into a binary trait label.

    Args:
        value: Raw characteristic cell, expected to be a 'field: value' string.

    Returns:
        1 if the status is 'Esophageal cancer', 0 if the status contains
        'Non-cancer', otherwise None. Non-string input (e.g. None or NaN) and
        strings without the ': ' separator also yield None instead of raising.
    """
    # A non-string cell has no .split(); treat it as unknown rather than
    # letting AttributeError escape.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    if len(parts) < 2:
        return None

    # Strip so stray surrounding whitespace does not defeat the exact match.
    status = parts[1].strip()
    if status == 'Esophageal cancer':
        return 1
    if 'Non-cancer' in status:
        return 0
    return None

def convert_age(value):
    """Convert an 'age: <integer>' string into an int.

    Args:
        value: Raw characteristic cell, expected to be a 'field: value' string.

    Returns:
        The integer after the ': ' separator, or None when the input is not a
        string (e.g. None or NaN), has no separator, or the value is not an
        integer literal.
    """
    # A non-string cell has no .split(); treat it as unknown rather than
    # letting AttributeError escape.
    if not isinstance(value, str):
        return None
    try:
        return int(value.split(': ')[1])
    except (IndexError, ValueError):
        return None

def convert_gender(value):
    """Convert a 'Sex: <Female|Male>' string into a binary gender code.

    Args:
        value: Raw characteristic cell, expected to be a 'field: value' string.

    Returns:
        0 for 'Female', 1 for 'Male', otherwise None. Non-string input (e.g.
        None or NaN) and strings without the ': ' separator also yield None
        instead of raising.
    """
    # A non-string cell has no .split(); treat it as unknown rather than
    # letting AttributeError escape.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    if len(parts) < 2:
        return None

    # Strip so stray surrounding whitespace does not defeat the exact match.
    gender = parts[1].strip()
    if gender == 'Female':
        return 0
    if gender == 'Male':
        return 1
    return None


### Step 3 Save Metadata

In [6]:
# Record this cohort in the shared metadata file: whether gene data is
# available, and whether a trait row was identified.
save_cohort_info(
    'GSE122497',
    './preprocessed/Esophageal_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)


### Step 4 Clinical Feature Extraction

In [7]:
import os

# Clinical features are only extracted when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Esophageal_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Esophageal_Cancer/trait_data/GSE122497.csv'
    # Create the output directory first: writing the CSV fails if the parent
    # directory does not exist (e.g. on a fresh checkout).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3467413': [1, 57, 0], 'GSM3467414': [1, 73, 1], 'GSM3467415': [1, 72, 1], 'GSM3467416': [1, 73, 1], 'GSM3467417': [1, 56, 1], 'GSM3467418': [1, 60, 1], 'GSM3467419': [1, 71, 1], 'GSM3467420': [1, 65, 1], 'GSM3467421': [1, 64, 1], 'GSM3467422': [1, 72, 1], 'GSM3467423': [1, 64, 1], 'GSM3467424': [1, 71, 1], 'GSM3467425': [1, 77, 1], 'GSM3467426': [1, 68, 1], 'GSM3467427': [1, 52, 0], 'GSM3467428': [1, 86, 1], 'GSM3467429': [1, 76, 1], 'GSM3467430': [1, 75, 1], 'GSM3467431': [1, 72, 1], 'GSM3467432': [1, 69, 1], 'GSM3467433': [1, 79, 1], 'GSM3467434': [1, 75, 1], 'GSM3467435': [1, 69, 1], 'GSM3467436': [1, 64, 0], 'GSM3467437': [1, 61, 1], 'GSM3467438': [1, 66, 1], 'GSM3467439': [1, 59, 1], 'GSM3467440': [1, 68, 1], 'GSM3467441': [1, 69, 1], 'GSM3467442': [1, 59, 1], 'GSM3467443': [1, 63, 1], 'GSM3467444': [1, 72, 0], 'GSM3467445': [1, 71, 1], 'GSM3467446': [1, 78, 1], 'GSM3467447': [1, 75, 1], 'GSM3467448': [1, 75, 0], 'GSM3467449': [1, 82, 1], 'GSM3467450': [1, 57, 1], 'GSM3467451