In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/COVID-19/GSE207945'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Evaluating the utility of proteomics for the identification of circulating pharmacodynamic biomarkers of IFNβ-1 biologics"
!Series_summary	"In this study, we evaluated the utility of proteomics to identify plasma proteins in healthy participants from a phase I clinical trial with IFNβ-1a and pegIFNβ-1a biologics to identify potential pharmacodynamic (PD) biomarkers. Using a linear mixed-effects model with repeated measurement for product-time interaction, we found that 248 and 528 analytes detected by the SOMAscan® assay were differentially expressed (p-value < 6.86E-06) between therapeutic doses of IFNβ-1a or pegIFNβ-1a, and placebo, respectively. We further prioritized signals based on peak change, area under the effect curve over the study duration, and overlap in signals from the two products. Analysis of prioritized datasets indicated activation of IFNB1 signaling and an IFNB signaling node with IL-6 as upstream regulators of the plasma prote

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_age = None  # define the functions when applicable

# Check for gene expression data availability
is_gene_available = False  # proteomic data is not gene expression data

# Variable Availability and Data Type Conversion

# Define conversion functions
def convert_trait(value):
    try:
        _, val = value.split(': ')
        if val.lower() == 'covalescent plasma':
            return 1
        else:
            return 0
    except:
        return None

def convert_age(value):
    try:
        _, val = value.split(': ')
        return float(val)
    except:
        return None

def convert_gender(value):
    try:
        _, val = value.split(': ')
        if val.lower() == 'male':
            return 1
        elif val.lower() == 'female':
            return 0
        else:
            return None
    except:
        return None

# Analyzing Sample Characteristics Dictionary for Data Availability
# Inferred from output of step 1
sample_characteristics = {
    0: ['drug group: Covid-19 covalescent plasma', 'drug group: pegIFNBeta-1a', 'drug group: IFNBeta-1a', 'drug group: IFNBeta-1a_placebo', 'drug group: pegIFNBeta-1a_placebo'],
    1: ['delivery: .', 'delivery: SC', 'delivery: IM'],
    2: ['dose: .', 'dose: 125', 'dose: 30'],
    3: ['timepoint: .', 'timepoint: 0:00:00', 'timepoint: 3:00:00', 'timepoint: 8:00:00', 'timepoint: 16:00:00', 'timepoint: 32:00:00', 'timepoint: 48:00:00', 'timepoint: 72:00:00', 'timepoint: 96:00:00', 'timepoint: 120:00:00', 'timepoint: 144:00:00', 'timepoint: 216:00:00', 'timepoint: 312:00:00'],
}

# Extract keys from characteristics dictionary
trait_row = 0 if 'drug group: Covid-19 covalescent plasma' in sample_characteristics[0] else None
age_row = None  # Age data is not explicitly given in the provided sample characteristics
gender_row = None  # Gender data is not explicitly given in the provided sample characteristics

# Save cohort information
save_cohort_info('GSE207945', './preprocessed/COVID-19/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    # import the already available 'clinical_data'
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'COVID-19', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/COVID-19/trait_data/GSE207945.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6323672': [0], 'GSM6323673': [0], 'GSM6323674': [0], 'GSM6323675': [0], 'GSM6323676': [0], 'GSM6323677': [0], 'GSM6323678': [0], 'GSM6323679': [0], 'GSM6323680': [0], 'GSM6323681': [0], 'GSM6323682': [0], 'GSM6323683': [0], 'GSM6323684': [0], 'GSM6323685': [0], 'GSM6323686': [0], 'GSM6323687': [0], 'GSM6323688': [0], 'GSM6323689': [0], 'GSM6323690': [0], 'GSM6323691': [0], 'GSM6323692': [0], 'GSM6323693': [0], 'GSM6323694': [0], 'GSM6323695': [0], 'GSM6323696': [0], 'GSM6323697': [0], 'GSM6323698': [0], 'GSM6323699': [0], 'GSM6323700': [0], 'GSM6323701': [0], 'GSM6323702': [0], 'GSM6323703': [0], 'GSM6323704': [0], 'GSM6323705': [0], 'GSM6323706': [0], 'GSM6323707': [0], 'GSM6323708': [0], 'GSM6323709': [0], 'GSM6323710': [0], 'GSM6323711': [0], 'GSM6323712': [0], 'GSM6323713': [0], 'GSM6323714': [0], 'GSM6323715': [0], 'GSM6323716': [0], 'GSM6323717': [0], 'GSM6323718': [0], 'GSM6323719': [0], 'GSM6323720': [0], 'GSM6323721': [0], 'GSM6323722': [0], 'GSM6323723': [0], 'GSM6323724