In [1]:

import sys
# Extend the module search path so project-local packages can be imported by later cells
# (presumably this is where the `utils` package lives -- TODO confirm).
# NOTE(review): hard-coded absolute, user-specific path; the notebook will not run
# unchanged on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *

# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Asthma/GSE178399'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull the series-level background lines and the per-sample characteristics
# out of the matrix file, selected by their line prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information so the rows holding trait/age/gender can be picked by eye.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Th2/Th1 cytokine imbalance is associated with higher COVID-19 risk mortality"
!Series_summary	"In this study we profiled 288 new serum proteomics samples measured at admission from patients hospitalized within the Mount Sinai Health System with positive SARS-CoV-2 infection. We first computed Th1 and Th2 pathway enrichment scores by gene set variation analysis and then compared the differences in Th2 and Th1 pathway scores between patients that died compared to those that survived."
!Series_overall_design	"We evaluated 288 new serum samples from hospitalized patients with a positive SARS-CoV-2 polymerase chain reaction (PCR) test in the Mount Sinai Health System from 03/01/20 to 06/07/20. Serum cytokines were profiled by Proseek Multiplex OLINK Proteomics as previously decribed. We computed enrichment scores for Th1 and Th2 immune pathways by gene set variation analysis, and modeled the Th2/Th1 balance as the difference between Th2 and Th1 enrichm

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: the series summary/design describe serum proteomics
# (OLINK cytokine profiling), so no gene expression data is recorded for this cohort.
is_gene_available = False


def _row_has_variation(row_index):
    """Report whether a sample-characteristics row holds more than one distinct value.

    The row is looked up in `sample_characteristics_dict`, built in the data-loading
    cell (presumably keyed by row index with a list of 'key: value' strings --
    TODO confirm against the printed dictionary). Entries that are not strings or
    that lack the ': ' separator are skipped instead of raising.

    Parameters:
        row_index: key of the row to inspect.

    Returns:
        True when at least two distinct values are present, otherwise False
        (including when the row is absent).
    """
    entries = sample_characteristics_dict.get(row_index, [])
    distinct_values = {
        entry.split(': ')[1]
        for entry in entries
        if isinstance(entry, str) and ': ' in entry
    }
    return len(distinct_values) > 1


# Data Availability section
# Rows are read from the per-row dictionary rather than via `clinical_data.get(...)`:
# `DataFrame.get` resolves a *column* label, so asking it for 0 or 1 falls back to the
# default and would leave every row marked as unavailable.
# NOTE(review): row 0 = age and row 1 = trait are taken from the original cell; verify
# them against the printed sample characteristics dictionary.
age_row = 0 if _row_has_variation(0) else None
trait_row = 1 if _row_has_variation(1) else None
gender_row = None  # No available key for gender data

# Data Type Conversion section
def convert_trait(value):
    """Convert a 'key: value' sample-characteristics entry into a binary trait label.

    Parameters:
        value: raw entry, expected to look like '<field>: <0|1>'.

    Returns:
        1 when the value field is '1', 0 when it is '0', and None otherwise
        (non-string input, missing ': ' separator, or an unrecognised token such
        as 'NA'). Unrecognised tokens are reported as missing rather than 0 so
        that unknown samples are not silently counted as controls.
    """
    try:
        # Surrounding whitespace is not meaningful for a 0/1 flag.
        val = value.split(': ')[1].strip()
    except Exception:
        # Best-effort converter: anything that cannot be parsed is treated as missing.
        return None
    if val == '1':
        return 1
    if val == '0':
        return 0
    return None

def convert_age(value):
    """Parse the age out of a 'key: value' sample-characteristics entry.

    Parameters:
        value: raw entry, expected to look like '<field>: <number>'.

    Returns:
        The second ': '-separated field as a float, or None when the entry
        cannot be split that way or the field is not a valid number.
    """
    try:
        fields = value.split(': ')
        age = float(fields[1])
    except Exception:
        # Any parsing problem means the age is unavailable for this sample.
        return None
    return age

def convert_gender(value):
    """Encode the gender found in a 'key: value' sample-characteristics entry.

    Parameters:
        value: raw entry, expected to look like '<field>: <male|female>'.

    Returns:
        1 for 'male', 0 for 'female' (case-insensitive), and None for any other
        token or for input that cannot be parsed.
    """
    gender_codes = {'male': 1, 'female': 0}
    try:
        # dict.get yields None for labels outside the two recognised ones.
        return gender_codes.get(value.split(': ')[1].lower())
    except Exception:
        return None

import os  # needed only to create the output directory below

# Save cohort information: record whether gene data and trait data are available.
save_cohort_info('GSE178399', './preprocessed/Asthma/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Asthma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Asthma/trait_data/GSE178399.csv'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the trait_data folder exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Asthma/cohort_info.json
