In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file of this cohort.
# NOTE(review): `cohort_dir` is an absolute, machine-specific path; the notebook
# only reruns on a host that has the same mount point.
cohort_dir = '/media/techt/DATA/GEO/Osteoporosis/GSE152073'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file: the first prefix list is handed over for the
#    background information, the second for the sample characteristics data.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# 3. Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each heading followed by its payload on the next line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression data from Brazilian SPAH study"
!Series_summary	"This study is part of previous epidemiologic project, including a population-based survey (Sao Paulo Ageing & Health study (SPAH Study). The data from this study was collected between 2015 to 2016 and involved elderly women (ages ≥65 yeas) living in the Butanta district, Sao Paulo. The purpose of the study was identification of association between transcriptome and the osteo metabolism diseases phenotype, like osteoporosis, vertebral fracture and coronary calcification."
!Series_summary	"Peripheral blood cells suffer alterations in the gene expression pattern in response to perturbations caused by calcium metabolism diseases.   The purpose of this study is to identify possible molecular markers associated with osteoporosis, vertebral fractures and coronary calcification in elderly women from community from Brazilian SPAH study. Vertebral fractures were the most common clinical manife

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Start with every clinical variable marked as unavailable; the sections below
# overwrite the row index and converter of each variable that is found.
trait_row = None
age_row = None
gender_row = None
convert_trait = None
convert_age = None
convert_gender = None

# Series summary text used to decide whether the cohort carries gene expression data.
# NOTE(review): this is a hand-written literal, not the `background_info` read from
# the matrix file, so the membership test below is evaluated against this copy only.
series_summary = (
    "This study is part of previous epidemiologic project, including a population-based survey "
    "(Sao Paulo Ageing & Health study (SPAH Study). The data from this study was collected between "
    "2015 to 2016 and involved elderly women (ages ≥65 yeas) living in the Butanta district, Sao Paulo. "
    "The purpose of the study was identification of association between transcriptome and the osteo metabolism "
    "diseases phenotype, like osteoporosis, vertebral fracture and coronary calcification.\nPeripheral blood "
    "cells suffer alterations in the gene expression pattern in response to perturbations caused by calcium metabolism diseases. "
    "The purpose of this study is to identify possible molecular markers associated with osteoporosis, vertebral fractures and coronary calcification "
    "in elderly women from the community from Brazilian SPAH study. Vertebral fractures were the most common clinical manifestation of osteoporosis and coronary "
    "calcifications were associated with high morbimortality.\nFasting blood samples were withdrawn from community elderly women with osteo metabolism diseases. "
    "RNA was extracted from peripheral total blood, and hybridized into Affymetrix microarrays."
)

# Gene expression data is considered available when the summary mentions
# hybridisation of the extracted RNA onto Affymetrix microarrays.
is_gene_available = (
    "RNA was extracted from peripheral total blood, and hybridized into Affymetrix microarrays"
    in series_summary
)

# Snapshot of the sample characteristics, keyed by characteristics row index.
sample_characteristics = {
    0: ['gender: female'],
    1: [
        f'age (years): {years}'
        for years in (76, 77, 75, 80, 82, 83, 78, 74, 81,
                      91, 79, 88, 87, 86, 70, 85, 73, 84)
    ],
    2: [None, 'height (cm): 153'],
}

# Gender
# Row 0 is accepted as the gender variable only when it lists more than one value.
gender_row = 0 if len(sample_characteristics.get(0) or []) > 1 else None

def convert_gender(value):
    """Map a raw ``'gender: <label>'`` characteristic to a binary code.

    Args:
        value: Raw characteristic such as ``'gender: female'``. Missing
            entries (``None`` or other non-string placeholders) are accepted.

    Returns:
        ``1`` for "male", ``0`` for "female" (case-insensitive, surrounding
        whitespace ignored), or ``None`` when the value cannot be split on
        ``':'`` or carries any other label.
    """
    try:
        label = value.split(":")[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # No ``.split`` (e.g. None), a non-str splitter, or no ':' separator:
        # treat as missing. Anything else (KeyboardInterrupt, ...) propagates.
        return None
    return {"male": 1, "female": 0}.get(label)


# `gender_row` is deliberately NOT reassigned here. The availability check above
# leaves it as None because row 0 holds the single value 'gender: female'; a
# constant variable must not be force-enabled with an unconditional override.


# Age
# Row 1 is accepted as the age variable only when it holds at least two distinct values.
age_row = 1 if len(set(sample_characteristics.get(1) or [])) > 1 else None

def convert_age(value):
    """Parse a raw ``'age (years): <number>'`` characteristic into a float.

    Args:
        value: Raw characteristic such as ``'age (years): 76'``. Missing
            entries (``None`` or other non-string placeholders) are accepted.

    Returns:
        The text after the first ``':'`` converted with ``float``, or ``None``
        when the value cannot be split on ``':'`` or is not numeric.
    """
    try:
        return float(value.split(":")[1].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # No ``.split`` (e.g. None), no ':' separator, or a non-numeric payload:
        # treat as missing. Anything else (KeyboardInterrupt, ...) propagates.
        return None


# `age_row` is not reassigned here: the distinct-value check above already
# selects row 1, and an unconditional override would turn that check into dead code.

# Osteoporosis (trait)
# The trait is not directly present, so it stays unavailable.
trait_row = None

# Save cohort info: gene availability plus whether a trait row was found.
save_cohort_info(
    'GSE152073',
    './preprocessed/Osteoporosis/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction runs only when a trait row exists; with
# `trait_row` set to None just above, this branch is skipped.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Osteoporosis',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Osteoporosis/trait_data/GSE152073.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
