In [1]:

import sys

# Make the project checkout importable (needed for the `utils` package used below).
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Osteoporosis/GSE35925'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file: the series-level prefixes select the background
#    information, the sample-level prefixes select the sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print each heading followed by its payload on the next line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Calcitriol supplementation effects on Ki67 expression and transcriptional profile of breast cancer specimens from post-menopausal patients"
!Series_summary	"Background: Breast cancer patients present lower 1,25(OH)2D3 or 25(OH)D3 serum levels than unaffected women. Although 1,25(OH)2D3 pharmacological concentrations of 1,25(OH)2D3 may exert antiproliferative effects in breast cancer cell lines, much uncertainty remains about the effects of calcitriol supplementation in tumor specimens in vivo. We have evaluated tumor dimension (ultrassonography), proliferative index (Ki67 expression), 25(OH)D3 serum concentration and gene expression profile, before and after a short term calcitriol supplementation (dose to prevent osteoporosis) to post-menopausal patients. Results: Thirty three patients with operable disease had tumor samples evaluated. Most of them (87.5%) presented 25(OH)D3 insufficiency (<30 ng/mL). Median period of calcitriol supplementation w

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data is treated as available because the series background
# mentions gene expression analysis.
is_gene_available = True

# Row indices for each clinical variable; None means no row was identified.
trait_row = None
age_row, gender_row = 1, 0

# Define conversion functions
def convert_trait(value):
    """Flag characteristic text that mentions calcitriol.

    The match is case-insensitive and may occur anywhere in ``value``.

    Args:
        value: Raw sample-characteristic string.

    Returns:
        1 when 'calcitriol' occurs in ``value``, otherwise None. This
        function never returns 0.

    Note:
        Carries the original author's assumption that a calcitriol mention
        indicates osteoporosis prevention.
    """
    mentions_calcitriol = value.lower().find('calcitriol') >= 0
    return 1 if mentions_calcitriol else None

def convert_age(value):
    """Parse the age from a 'key: value' sample-characteristic string.

    Args:
        value: Raw characteristic text such as 'age: 63'.

    Returns:
        The field following the first ':' converted to float, or None when
        the input is not a string, has no ':' separator, or that field is
        not numeric.
    """
    try:
        return float(value.split(':')[1].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input (e.g. None, NaN, bytes);
        # IndexError: no ':' separator; ValueError: non-numeric age text.
        # Deliberately narrower than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate.
        return None

def convert_gender(value):
    """Map a 'key: value' gender characteristic to a binary code.

    Args:
        value: Raw characteristic text such as 'gender: female'.

    Returns:
        0 for 'female', 1 for 'male' (case-insensitive, surrounding
        whitespace ignored). None for any other label, and also for
        malformed input (non-string, or no ':' separator), matching how
        convert_age treats unparseable values instead of raising.
    """
    try:
        raw_value = value.split(':')[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Non-string input or missing ':' separator: treat as unknown.
        return None
    return {'female': 0, 'male': 1}.get(raw_value)

# Record the cohort's availability flags: gene data, and whether a trait row was identified.
has_trait = trait_row is not None
save_cohort_info('GSE35925', './preprocessed/Osteoporosis/cohort_info.json', is_gene_available, has_trait)

# Clinical features are selected, written to CSV and previewed only when a trait row exists.
if has_trait:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Osteoporosis',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Osteoporosis/trait_data/GSE35925.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
