In [1]:

import os
import sys

sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1 - resolve the cohort's soft file and matrix file paths.
cohort_dir = '/media/techt/DATA/GEO/Obstructive_sleep_apnea/GSE49800'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 2 - pull background information and sample characteristics out of
# the matrix file, selecting lines by their leading prefix.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 3 - collect the unique values found in each row of the clinical frame.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 4 - show both results, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Effects of CPAP Therapy on Leukocyte Gene Expression"
!Series_summary	"Rationale: Obstructive sleep apnea (OSA) has been associated with a number of chronic disorders that may improve with effective therapy. However, the molecular pathways affected by continuous positive airway pressure (CPAP) treatment are largely unknown.  We sought to assess the system-wide consequences of CPAP therapy by transcriptionally profiling peripheral blood leukocytes (PBLs).  Methods: Subjects diagnosed with severe OSA were treated with CPAP, and whole-genome expression measurement of PBLs was performed at baseline and following therapy. We used Gene Set Enrichment Analysis (GSEA) to identify gene sets that were differentially enriched. Network analysis was then applied to identify key drivers of pathways influenced by CPAP.  Results: 18 subjects with severe OSA (apnea hypopnea index ≥ 30 events/hour) underwent CPAP therapy and microarray analysis of their PBLs.  Trea

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Cohort-level flags. A *_row index stays None when that variable is not
# recorded in the sample characteristics.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability.
# The previous test looked for the literal key "!Series_title" inside a list
# holding only the title text, which can never match, so the flag was stuck at
# False. Inspect the title text itself instead.
series_title = "Effects of CPAP Therapy on Leukocyte Gene Expression"
is_gene_available = "gene expression" in series_title.lower()

# Unique values per sample-characteristics row, copied from the Step 1 output.
sample_dict = {0: ['subject: 1', 'subject: 2', 'subject: 3', 'subject: 4', 'subject: 5', 'subject: 6', 'subject: 7', 'subject: 8', 'subject: 9', 
                   'subject: 10', 'subject: 11', 'subject: 12', 'subject: 13', 'subject: 14', 'subject: 15', 'subject: 16', 'subject: 17', 
                   'subject: 18'], 
                1: ['treatment: none, baseline', 'treatment: CPAP']}

# Row 1 ('treatment') is used as the trait. It is only usable when it takes
# more than one distinct value across samples.
# NOTE(review): the series summary says every subject has severe OSA, so this
# row separates baseline from post-CPAP samples rather than OSA from controls
# - confirm this proxy is what downstream analysis expects.
if len(set(sample_dict.get(1, []))) > 1:
    trait_row = 1

# No row in the dictionary carries age or gender information, so age_row and
# gender_row remain None.

# Define conversion functions
def convert_trait(value):
    """Map a 'treatment' sample-characteristic string to a binary label.

    Args:
        value: Raw characteristic text, e.g. 'treatment: none, baseline'
            or 'treatment: CPAP'.

    Returns:
        0 if the text contains 'baseline', 1 if it contains 'CPAP', and
        None otherwise (including non-string input such as None or NaN).
    """
    # Missing cells can show up as None or float NaN; a substring test on
    # those would raise TypeError, so treat them as "unknown".
    if not isinstance(value, str):
        return None
    # 'baseline' is checked first, matching the original precedence.
    if 'baseline' in value:
        return 0
    if 'CPAP' in value:
        return 1
    return None

def convert_age(value):
    """Placeholder age converter.

    Age is not provided for this cohort, so every input maps to None.
    """
    return None

def convert_gender(value):
    """Placeholder gender converter.

    Gender is not provided for this cohort, so every input maps to None.
    """
    return None

# Save metadata: record whether gene data and trait data are available for
# this cohort.
cohort_info_path = './preprocessed/Obstructive_sleep_apnea/cohort_info.json'
# Make sure the output folder exists before anything is written into it.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE49800', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical features are only extracted when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Obstructive_sleep_apnea',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Obstructive_sleep_apnea/trait_data/GSE49800.csv'
    # DataFrame.to_csv does not create missing parent directories; without
    # this the write fails on a fresh checkout.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1207208': [0], 'GSM1207209': [1], 'GSM1207210': [0], 'GSM1207211': [1], 'GSM1207212': [0], 'GSM1207213': [1], 'GSM1207214': [0], 'GSM1207215': [1], 'GSM1207216': [0], 'GSM1207217': [1], 'GSM1207218': [0], 'GSM1207219': [1], 'GSM1207220': [0], 'GSM1207221': [1], 'GSM1207222': [0], 'GSM1207223': [1], 'GSM1207224': [0], 'GSM1207225': [1], 'GSM1207226': [0], 'GSM1207227': [1], 'GSM1207228': [0], 'GSM1207229': [1], 'GSM1207230': [0], 'GSM1207231': [1], 'GSM1207232': [0], 'GSM1207233': [1], 'GSM1207234': [0], 'GSM1207235': [1], 'GSM1207236': [0], 'GSM1207237': [1], 'GSM1207238': [0], 'GSM1207239': [1], 'GSM1207240': [0], 'GSM1207241': [1], 'GSM1207242': [0], 'GSM1207243': [1]}
