In [1]:

import sys

# Make the project checkout importable (presumably where the `utils` package
# imported by the next cell lives -- confirm against the repository layout).
# The membership guard keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): this is a machine-specific absolute path; consider moving it to
# a single configurable constant.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
# NOTE(review): the wildcard import hides the origin of the helper names used
# in this notebook (geo_get_relevant_filepaths, get_background_and_clinical_data,
# get_unique_values_by_row, ...); presumably they all come from utils.preprocess.
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): cohort_dir is a machine-specific absolute path.
cohort_dir = '/media/techt/DATA/GEO/Metabolic_Rate/GSE106800'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefix lists are passed to the helper together with the matrix file;
# the first names series-level fields, the second names per-sample fields.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# Presumably maps each characteristics row to its distinct values -- confirm
# against the helper; the row indices chosen in the next step rely on it.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Circadian misalignment induces fatty acid metabolism gene profiles and induces insulin resistance in human skeletal muscle."
!Series_summary	"Circadian misalignment, such as in shift work, has been associated with obesity and type 2 diabetes, however, direct effects of circadian misalignment on skeletal muscle insulin sensitivity and muscle molecular circadian clock have never been investigated in humans. Here we investigated insulin sensitivity and muscle metabolism in fourteen healthy young lean men (age 22.4 ± 2.8 years; BMI 22.3 ± 2.1 kg/m2 [mean ± SD]) after a 3-day control protocol and a 3.5-day misalignment protocol induced by a 12-h rapid shift of the behavioral cycle. We show that circadian misalignment results in a significant decrease in peripheral insulin sensitivity due to a reduced skeletal muscle non-oxidative glucose disposal (Rate of disappearance: 23.7 ± 2.4 vs. 18.4 ± 1.4 mg/kg/min; control vs. misalignment; p=0.024). Fasting gl

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability. The original annotation attributes this to
# "Microarray analysis" being mentioned in the series summary.
is_gene_available = True

# Row indices into the sample characteristics table for each variable;
# None marks a variable that is not available in this cohort.
trait_row = None  # 'Metabolic_Rate' data not explicitly mentioned
age_row = 2
gender_row = 0

# Converter placeholders; the function definitions that follow in this cell
# rebind these names.
convert_trait = None
convert_age = None
convert_gender = None

def convert_trait(value):
    """
    Converts a string value into a continuous variable for trait 'Metabolic_Rate'.

    Parameters
    ----------
    value : str
        Raw cell text in 'label: number' form; the number is taken from the
        segment that follows the first colon.

    Returns
    -------
    float or None
        The parsed number, or None when the input is not a string, has no
        colon, or the segment after the colon is not numeric.
    """
    try:
        return float(value.split(':')[1].strip())
    # Catch only the expected parsing failures. A bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and hide genuine bugs.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_age(value):
    """
    Converts a string value into a continuous variable for age.

    Parameters
    ----------
    value : str
        Raw cell text in 'label: number' form; the number is taken from the
        segment that follows the first colon.

    Returns
    -------
    int, float or None
        An int when the text is a whole number (unchanged from the previous
        behaviour), a float when it is a finite decimal such as '22.5', and
        None when the input is not a string, has no colon, or is not a finite
        number.
    """
    import math  # local import keeps this function self-contained in the notebook

    try:
        text = value.split(':')[1].strip()
    # Only the expected shape errors are handled; a bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit.
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        return int(text)
    except ValueError:
        pass  # not a whole number; fall through to the decimal parse

    try:
        age = float(text)
    except ValueError:
        return None
    # float() accepts 'nan' / 'inf'; those are not usable ages and were
    # rejected before this change, so keep rejecting them.
    return age if math.isfinite(age) else None

def convert_gender(value):
    """
    Converts a string value into a binary variable for gender (female: 0, male: 1).

    Parameters
    ----------
    value : str
        Raw cell text in 'label: gender' form; the gender is taken from the
        segment that follows the first colon and compared case-insensitively.

    Returns
    -------
    int or None
        1 for 'male', 0 for 'female', and None for any other text, for input
        without a colon, or for non-string input.
    """
    try:
        label = value.split(':')[1].strip().lower()
    # Only the expected shape errors are handled; a bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit.
    except (AttributeError, IndexError, TypeError):
        return None
    # Unrecognised labels fall through to None via dict.get.
    return {'male': 1, 'female': 0}.get(label)

# Save Metadata
# Passes the cohort id, the JSON path, the gene-availability flag and whether a
# trait row was identified to save_cohort_info. What the helper writes is not
# visible here -- presumably it records cohort usability; confirm in utils.preprocess.
save_cohort_info('GSE106800', './preprocessed/Metabolic_Rate/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
# trait_row is assigned None earlier in this cell, so as written this branch is
# skipped and no clinical CSV is produced for this cohort.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Metabolic_Rate', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    # NOTE(review): relative output path; presumably the trait_data directory
    # must already exist for to_csv to succeed -- confirm.
    csv_path = './preprocessed/Metabolic_Rate/trait_data/GSE106800.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
