In [1]:

import sys
# Extend the module search path so the project's `utils` package can be imported below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# import `utils` on another machine unless the same directory exists there.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the SOFT file and the series matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Osteoporosis/GSE51495'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes passed to the matrix-file reader: the first list selects the
# series-level background lines, the second the per-sample clinical lines.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print each label followed by its value on the next line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Peripheral blood mononuclear cell- and cortical bone-derived transcriptional profiles"
!Series_summary	"Large-scale transcriptional profiling has enormous potential for discovery of osteoporosis susceptibility genes and for identification of the molecular mechanisms by which these genes and associated pathways regulate bone maintenance and turnover. A potential challenge in the use of this method for the discovery of osteoporosis genes is the difficulty of obtaining bone tissue samples from large numbers of individuals. In this study, we tested the applicability of using peripheral blood mononuclear cell (PBMC)-derived transcriptional profiles as a surrogate to cortical bone transcriptional profiles to address questions of skeletal genetics. We used a well-established and genetically well-characterized nonhuman primate model for human bone maintenance and turnover. We determined that a high degree of overlap exists in gene expression of cortical b

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter names start out unbound (None); the `def` statements later in this
# cell rebind them to real functions.
convert_trait = None
convert_age = None
convert_gender = None

# The series description mentions transcriptional profiling, so gene expression
# data is treated as available.
is_gene_available = True

# Keys of the sample-characteristics rows that carry each variable.
# No row explicitly records an osteoporosis variable, so the trait stays unset.
trait_row = None
# Age values are listed in row 1, gender values in row 0.
age_row, gender_row = 1, 0

# Conversion functions
def convert_trait(value):
    """Placeholder trait converter.

    No osteoporosis trait data is available for this cohort, so every input
    maps to ``None`` regardless of its content.
    """
    return None

def convert_age(value):
    """Parse the numeric age out of a ``"<label>: <number>"`` characteristic string.

    Args:
        value: Raw cell text such as ``"age: 12.5"``.

    Returns:
        The text following the first ``': '`` separator converted to ``float``,
        or ``None`` when ``value`` is not a string, contains no separator, or
        the text after the separator is not numeric.
    """
    try:
        return float(value.split(': ')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only the expected parse failures count as "missing". A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return None

def convert_gender(value):
    """Map a ``"<label>: <gender>"`` characteristic string to a binary code.

    Args:
        value: Raw cell text such as ``"gender: Female"``.

    Returns:
        ``0`` for female and ``1`` for male (matched case-insensitively after
        stripping surrounding whitespace), or ``None`` when ``value`` is not a
        string, contains no ``': '`` separator, or names any other label.
    """
    gender_map = {"female": 0, "male": 1}
    try:
        label = value.split(': ')[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Only the expected parse failures count as "missing". A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return None
    # Unknown labels yield None, as the KeyError path did before.
    return gender_map.get(label)

# Record this cohort's metadata: its ID, the JSON file to update, whether gene
# data is available, and whether a trait row was identified.
save_cohort_info(
    'GSE51495',
    './preprocessed/Osteoporosis/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted and written out only when a trait row exists.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Osteoporosis',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Osteoporosis/trait_data/GSE51495.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
