In [1]:

import sys

# Make the project checkout importable from this notebook (presumably this is
# where the `utils` package imported in the next cell lives — confirm).
# Guarded so that re-running the cell does not keep appending duplicate
# entries to sys.path.
_project_root = '/home/techt/Desktop/a4s'
if _project_root not in sys.path:
    sys.path.append(_project_root)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's input files -------------------------------------
# The helper is handed the cohort directory and its result is unpacked into
# the SOFT file path and the series-matrix file path.
cohort_dir = '/media/techt/DATA/GEO/Osteoporosis/GSE80614'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Pull background text and per-sample characteristics -----------------
# Line prefixes selecting which matrix-file rows count as series background
# and which count as clinical (sample-level) data.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the clinical rows -----------------------------------------
# Presumably maps each clinical row to its distinct values (inferred from the
# helper's name — confirm against utils.preprocess).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show everything needed to pick trait/age/gender rows in the next step
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression analyses of the differentiating hMSC into Osteoblasts and Adipocytes."
!Series_summary	"Age-related skeletal degeneration in patients with osteoporosis is characterized by decreased bone mass and occurs concomitant with an increase in bone marrow adipocytes. Using microarray expression profiling with high temporal resolution, we identified gene regulatory events in early stages of osteogenic and adipogenic lineage commitment of human mesenchymal stromal cells (hMSCs). Data analysis reveal three distinct phases when cells adopt a committed expression phenotype: initiation of differentiation (0-3h, Phase I), lineage-acquisition (6-24h, Phase II) and early lineage-progression (48-96h, Phase III). Upstream regulator analysis identifies 34 transcription factors (TFs) in Phase I with a role in hMSC differentiation. Interestingly, expression levels of identified TFs did not always change and indicate additional post-transcriptional regula

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converters start out unset; convert_age and convert_gender are replaced by
# real function definitions further down in this cell.
convert_trait = None
convert_age = None
convert_gender = None

# Step 1: Check Gene Expression Data Availability
# The series summary mentions microarray expression profiling, so this cohort
# is treated as carrying gene expression data.
is_gene_available = True

# Step 2: Variable Availability and Data Type Conversion
# Row keys below refer to the sample characteristics dictionary printed by the
# previous cell (its output is not fully visible here — verify the keys).

# 'Osteoporosis' is not explicitly recorded in the sample characteristics.
trait_row = None

# 'age' is recorded under key 1, with values like 'age: 19 years'.
age_row = 1

# 'gender' is recorded under key 0, with values like 'gender: Male'.
gender_row = 0

# Define the conversion functions
def convert_age(value):
    """Parse an age in whole years from a 'age: <n> years' characteristic string.

    Args:
        value: Raw sample-characteristic cell, expected to look like
            'age: 19 years'.

    Returns:
        The age as an int, or None when the value is not a string, has no
        ': ' separator, or the part after the separator is not an integer.
    """
    # Missing cells can arrive as None/NaN rather than text; treat as unknown.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    # Without the 'label: value' separator there is nothing to parse; the
    # previous implementation raised IndexError here.
    if len(parts) < 2:
        return None

    try:
        return int(parts[1].replace(' years', ''))
    except ValueError:
        return None

def convert_gender(value):
    """Map a 'gender: <label>' characteristic string to a binary code.

    Args:
        value: Raw sample-characteristic cell, expected to look like
            'gender: Male' or 'gender: Female' (case-insensitive label).

    Returns:
        1 for male, 0 for female, and None for any other label, for values
        without a ': ' separator, or for non-string input.
    """
    # Missing cells can arrive as None/NaN rather than text; treat as unknown.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    # Without the 'label: value' separator there is nothing to parse; the
    # previous implementation raised IndexError here.
    if len(parts) < 2:
        return None

    gender_str = parts[1].strip().lower()
    # Unrecognised labels fall through to None via dict.get.
    return {'male': 1, 'female': 0}.get(gender_str)

# No Osteoporosis row was identified (trait_row is None), so there is nothing
# for a trait converter to operate on.
convert_trait = None

# Step 3: Save Cohort Information
# The final argument is a boolean: True only when a trait row was identified.
save_cohort_info(
    'GSE80614',
    './preprocessed/Osteoporosis/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Step 4: Clinical Feature Extraction
# Skipped for this cohort: extraction is conditional on trait_row being set,
# and trait_row is None here.
