In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/LDL_Cholesterol_Levels/GSE181339'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pull series-level background text and per-sample
# clinical annotations out of the matrix file.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information so the next step can inspect them.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Study of the usefulness of human peripheral blood mononuclear cells for the analysis of metabolic recovery after weight loss (METAHEALTH-TEST)"
!Series_summary	"The aim of this study is to design and validate a test, METAHEALTH-TEST, based on gene expression analysis in blood cells, to quickly and easily analyse metabolic health. This test will be used to analyse metabolic improvement in overweight/obese individuals and in metabolically obese normal-weight (MONW) individuals after undergoing a weight loss intervention and/or an intervention for improvement in eating habits and lifestyle. Obesity and its medical complications are a serious health problem today. Using peripheral blood mononuclear cells (PBMC) as an easily obtainable source of transcriptomic biomarkers would allow to deepen into the knowledge of adaptations in response to increased adiposity that occur in internal homeostatic tissues, without the need of using invasive biopsies. More

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: the series summary printed above describes a
# test "based on gene expression analysis in blood cells".
is_gene_available = True

# Converters start out as None; they are defined below only for the variables
# that are actually available in this cohort.
convert_trait = None
convert_age = None
convert_gender = None

# Row keys into the Sample Characteristics Dictionary.
# No key directly indicates LDL_Cholesterol_Levels, so the trait row stays
# None; keys for gender and age were identified.
trait_row = None
gender_row = 0
age_row = 2

# Step 2.3: Data Type Conversion

# Define conversion functions
def convert_age(value):
    """Parse an integer age out of a ``"<key>: <value>"`` characteristics cell.

    Args:
        value: Raw cell content, e.g. ``"age: 45"``.

    Returns:
        The text after the first colon converted to ``int``, or ``None`` when
        the cell is not a string (e.g. ``None`` or NaN for a missing entry),
        contains no colon, or the text after the colon is not an integer.
    """
    # Missing cells are not strings and have no .split(); treat them as unknown
    # instead of letting an AttributeError escape.
    if not isinstance(value, str):
        return None
    try:
        return int(value.split(":")[1].strip())
    except (IndexError, ValueError):
        # IndexError: no ":" separator; ValueError: non-integer payload.
        return None

def convert_gender(value):
    """Map a ``"<key>: <value>"`` gender cell to a binary code.

    Args:
        value: Raw cell content, e.g. ``"gender: Man"``.

    Returns:
        ``1`` when the text after the first colon is ``man``, ``0`` when it is
        ``woman`` (case-insensitive, surrounding whitespace ignored), and
        ``None`` for any other label, for a cell without a colon, or for a
        non-string cell such as ``None`` or NaN.
    """
    # Mirror convert_age: malformed or missing cells yield None rather than
    # raising AttributeError / IndexError.
    if not isinstance(value, str):
        return None
    parts = value.split(":")
    if len(parts) < 2:
        return None
    gender = parts[1].strip().lower()
    if gender == 'man':
        return 1
    if gender == 'woman':
        return 0
    return None

# No converter is defined for LDL_Cholesterol_Levels because the trait data is
# not available in this cohort.

# Record the cohort's usability: gene data availability and whether a trait
# row was found (here `trait_row is not None` evaluates the latter).
save_cohort_info(
    'GSE181339',
    './preprocessed/LDL_Cholesterol_Levels/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction is skipped because trait_row is None.


A new JSON file was created at: ./preprocessed/LDL_Cholesterol_Levels/cohort_info.json
