In [1]:

import sys

# Extend the module search path with the local project checkout
# (presumably where the `utils` package imported below lives — confirm).
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
project_root = '/home/techt/Desktop/a4s'
if project_root not in sys.path:
    sys.path.append(project_root)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file for this cohort
cohort_dir = '/media/techt/DATA/GEO/Obstructive_sleep_apnea/GSE135917'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the series-level background lines and the per-sample clinical lines
#    out of the matrix file, selected by their line prefixes
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the characteristics dictionary, each under
#    its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Subcutaneous fat transcriptome in obstructive sleep apnea and after treatment with CPAP"
!Series_summary	"Obstructive sleep apnea (OSA) has been linked to dysregulated metabolic states and treatment of sleep apnea may improve these conditions. Subcutaneous adipose tissue is a readily samplable fat depot that plays an important role in regulating metabolism. However, neither the pathophysiologic consequences of OSA nor the effects of continuous positive airway pressure (CPAP) in altering this compartment’s molecular pathways are understood. This study aimed to systematically identify subcutaneous adipose tissue transcriptional programs modulated in OSA and in response to its effective treatment with CPAP.  Two subject groups were investigated: Study Group 1 was comprised of 10 OSA and 8 controls; Study Group 2 included 24 individuals with OSA studied at baseline and following CPAP. For each subject, genome-wide gene expression measurement of subcut

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: assume nothing is available until the checks below say otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
# Placeholders for all three converters (the original repeated `convert_age`
# here and never initialised `convert_gender`).
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
# Per the original analysis note, the series background points to the
# GeneChip Human Gene 1.0 ST Affymetrix microarray, i.e. gene expression data.
is_gene_available = True

# Identify the availability and keys of required variables.
# Snapshot of the sample characteristics dictionary printed in the previous step:
# row 0 holds age, row 1 holds sex, row 2 holds BMI.
sample_characteristics = {0: ['age: 66', 'age: 53', 'age: 61', 'age: 30', 'age: 56', 'age: 47', 'age: 60', 'age: 63', 'age: 34', 'age: 52', 'age: 64', 'age: 55', 'age: 58', 'age: 70', 'age: 62', 'age: 50', 'age: 48', 'age: 68', 'age: 69.0', 'age: 33.9', 'age: 53.9', 'age: 51.4', 'age: 34.0', 'age: 67.2', 'age: 51.8', 'age: 57.5', 'age: 54.1', 'age: 47.9', 'age: 59.1', 'age: 56.6'], 1: ['Sex: F', 'Sex: M'], 2: ['bmi: 31.3', 'bmi: 35.2', 'bmi: 41.7', 'bmi: 40.3', 'bmi: 42.5', 'bmi: 26.7', 'bmi: 33.6', 'bmi: 30.4', 'bmi: 50.9', 'bmi: 28.8', 'bmi: 32.4', 'bmi: 32.7', 'bmi: 35.5', 'bmi: 39.1', 'bmi: 20.6', 'bmi: 50.3', 'bmi: 32', 'bmi: 32.2', 'bmi: 37.5', 'bmi: 37.7', 'bmi: 32.3', 'bmi: 43.1', 'bmi: 41.2', 'bmi: 59.4', 'bmi: 58.8', 'bmi: 39.8', 'bmi: 37.8', 'bmi: 36.4', 'bmi: 37.3', 'bmi: 61.0']}

# Check if the dataset provides OSA status info
trait_row = None  # No information available for 'Obstructive_sleep_apnea' in sample_characteristics

# Check if the dataset provides Age info
age_row = 0  # Age is available in row 0

# Check if the dataset provides Gender info
gender_row = 1  # Gender is available in row 1

# Define conversion functions
def convert_age(value):
    """Parse an age characteristic such as 'age: 53.9' into a float.

    Args:
        value: A raw sample-characteristics cell of the form '<key>: <number>'.

    Returns:
        The number after the ': ' separator as a float, or None when the
        value is not a string, has no ': ' separator, or the part after the
        separator is not numeric.
    """
    try:
        return float(value.split(': ')[1])
    # Only swallow the parsing failures we expect; a bare `except:` would also
    # hide KeyboardInterrupt/SystemExit and make the cell impossible to stop.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Map a sex characteristic such as 'Sex: F' to a binary code.

    Args:
        value: A raw sample-characteristics cell of the form '<key>: <label>'.

    Returns:
        0 for 'f', 1 for 'm' (case-insensitive, surrounding whitespace
        ignored), or None for any other label, a value without the ': '
        separator, or a non-string value.
    """
    try:
        val = value.split(': ')[1].strip().lower()
    # Malformed or missing cells yield None instead of raising, matching
    # how convert_age treats unparseable input.
    except (AttributeError, IndexError, TypeError):
        return None
    if val == 'f':
        return 0
    elif val == 'm':
        return 1
    return None

# Record this cohort's metadata: its GEO accession, the JSON file to write to,
# whether gene expression data is available, and whether a trait row was found
save_cohort_info(
    'GSE135917',
    './preprocessed/Obstructive_sleep_apnea/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# trait_row is None for this cohort, so clinical feature extraction is skipped
