In [1]:

import re
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the soft file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Telomere_Length/GSE52237'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pull the series-level background text and the
# per-sample characteristics out of the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a sample characteristics dictionary.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print each section under its heading so both can be inspected in the cell output.
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Smoking accelerated aging of the small airway epithelium"
!Series_summary	"Aging involves multiple biologically complex processes characterized by a decline in cellular homeostasis over time leading to a loss and impairment of physiological integrity and function. Specific cellular hallmarks of aging include abnormal gene expression patterns, shortened telomeres and associated biological dysfunction. Like all organs, the lung demonstrates both physiological and structural changes with age that result in a progressive decrease in lung function in healthy individuals. Cigarette smoking accelerates lung function decline over time, suggesting smoking accelerates aging of the lung. Based on this data, we hypothesized that cigarette smoking accelerates the aging of the small airway epithelium, the cells that take the initial brunt of inhaled toxins from the cigarette smoke and one of the primary sites of pathology associated with cigarette smoking. Usin

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Row indices and converters start out as None, meaning "not available".
# The converters are replaced by real functions further down in this cell.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Gene expression availability is asserted by hand; it is not derived from the files here.
is_gene_available = True  # For this example, we assume gene expression data is available.

# Unique values per sample-characteristics row, hard-coded in this cell.
# NOTE(review): presumably copied from the `sample_characteristics_dict` printed by the
# loading cell -- confirm the two stay in sync if the cohort is re-downloaded.
sample_characteristics = {
    0: ['smoking status: S', 'smoking status: nonsmoker', 'smoking status: smoker'], 
    1: [float('nan'), 'cilia length: 7.7322', 'cilia length: 8.07239', 'cilia length: 6.15053', 'cilia length: 7.19703', 
        'cilia length: 7.40795', 'cilia length: 6.85311', 'cilia length: 7.07429', 'cilia length: 7.27673', 
        'cilia length: 7.78135', 'cilia length: 6.78668', 'cilia length: 7.01968', 'cilia length: 5.85311', 
        'cilia length: 5.94677', 'cilia length: 6.45982', 'cilia length: 7.16176', 'cilia length: 6.02311', 
        'cilia length: 7.84439', 'cilia length: 7.32101', 'cilia length: 7.25948', 'cilia length: 7.08977', 
        'cilia length: 7.75275', 'cilia length: 8.57827', 'cilia length: 6.64001', 'cilia length: 6.47906', 
        'cilia length: 7.28888', 'cilia length: 6.29608', 'cilia length: 6.34788', 'cilia length: 6.59999', 
        'cilia length: 6.45331']
}


def find_characteristic_row(characteristics, keywords):
    """Return the first row key whose field name mentions one of `keywords`.

    Each value is treated as ``"<field name>: <value>"``. Only the field name
    (the text before the first colon) is inspected, and a keyword has to match
    as a whole word. ``"age"`` therefore matches ``"age: 45"`` and
    ``"age (years): 45"`` but not ``"stage: II"`` or ``"passage: 3"``, which a
    plain substring test would wrongly accept.

    Parameters
    ----------
    characteristics : dict
        Mapping of row key -> iterable of raw characteristic values. Values
        that are not strings (e.g. NaN) are stringified before matching.
    keywords : iterable of str
        Words or phrases to look for; matching is case-insensitive.

    Returns
    -------
    The key of the first matching row (in dict iteration order), or None when
    no row matches.
    """
    patterns = [
        re.compile(r'\b' + re.escape(keyword.lower()) + r'\b')
        for keyword in keywords
    ]
    for row_key, values in characteristics.items():
        for value in values:
            field_name = str(value).split(':', 1)[0].lower()
            if any(pattern.search(field_name) for pattern in patterns):
                return row_key
    return None


# Telomere_Length data availability
trait_row = find_characteristic_row(sample_characteristics, ['telomere length'])

# Age data availability (if present)
age_row = find_characteristic_row(sample_characteristics, ['age'])

# Gender data availability. When no gender field exists the row stays None and
# the notebook assumes the entire cohort is male.
gender_row = find_characteristic_row(sample_characteristics, ['gender'])

# Data type conversion functions
def convert_trait(value):
    """Parse the numeric trait value out of a ``"<field>: <number>"`` entry.

    Parameters
    ----------
    value : str
        Raw sample-characteristics entry, e.g. ``"telomere length: 7.5"``.

    Returns
    -------
    float or None
        The number found between the first and second colon, or None when
        `value` is not a string, contains no colon, or that text is not a
        valid float.
    """
    try:
        # Extract value after colon
        extracted_value = value.split(':')[1].strip()
        return float(extracted_value)
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as NaN or bytes;
        # IndexError: no colon present; ValueError: text is not numeric.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

def convert_age(value):
    """Parse the numeric age out of a ``"<field>: <number>"`` entry.

    Parameters
    ----------
    value : str
        Raw sample-characteristics entry, e.g. ``"age: 45"``.

    Returns
    -------
    float or None
        The number found between the first and second colon, or None when
        `value` is not a string, contains no colon, or that text is not a
        valid float.
    """
    try:
        # Extract value after colon
        extracted_value = value.split(':')[1].strip()
        return float(extracted_value)
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as NaN or bytes;
        # IndexError: no colon present; ValueError: text is not numeric.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

def convert_gender(value):
    """Return the gender code for a sample; always 1 in this notebook.

    The notebook assumes the whole cohort is male because gender information
    is not differentiated, so `value` is accepted for interface compatibility
    but never inspected.

    NOTE(review): 1 is presumably the code for "male" expected downstream --
    confirm. If a cohort with a real gender field is ever processed with this
    function, it must be replaced by one that actually parses `value`.

    Parameters
    ----------
    value : any
        Raw sample-characteristics entry (ignored).

    Returns
    -------
    int
        Always 1.
    """
    # Assuming all male if gender information is not differentiated
    return 1

# Save cohort information; the last flag says whether a trait row was found.
has_trait = trait_row is not None
save_cohort_info(
    'GSE52237',
    './preprocessed/Telomere_Length/cohort_info.json',
    is_gene_available,
    has_trait,
)

# Clinical features are only extracted and written out when a trait row exists.
if has_trait:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Telomere_Length',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Telomere_Length/trait_data/GSE52237.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Telomere_Length/cohort_info.json
