In [1]:

import sys
# Extend the module search path with the project checkout so that later imports can
# resolve from it (presumably this is where the `utils` package lives — confirm).
# NOTE(review): absolute, machine-specific path; adjust for your environment.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific data location; adjust for your environment.
cohort_dir = '/media/techt/DATA/GEO/Atherosclerosis/GSE154851'
# `soft_file` is not used again in this cell; only `matrix_file` is read below.
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The two prefix lists name the matrix-file line prefixes handed to the helper:
# series-level lines for the background text, per-sample lines for the clinical table
# (presumably matched against the start of each line — confirm in utils.preprocess).
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# NOTE(review): the keys of this dictionary are presumably the row indices that the
# `*_row` variables in the next step refer to — confirm against the printed output.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Investigation Of Genes Associated With Atherosclerosis In Patients With Systemic Lupus Erythematosus"
!Series_summary	"Systemic lupus erythematosus (SLE) is a chronic, autoimmune disease affecting multiple heterogeneous organs and systems. SLE is associated with increased risk of atherosclerosis and increased cardiovascular complications. In this study, we specifically aimed to identify patients with SLE who are genetically at risk for developing atherosclerosis. Sureprint G3 Human Gene Expression 8x60K Microarray kit (Agilent technologies, Santa Clara, CA, USA) was used in our study. Genes showing differences in expression between the groups were identified by using GeneSpring GX 10.0 program. A total of 155 genes showing expression level difference were detected between SLE patients and healthy controls. In molecular network analysis."
!Series_overall_design	"38 patients with systemic lupus erythematosus (36 females, 2 males) and 32 healthy cont

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: the series summary names a Sureprint G3 Human Gene
# Expression 8x60K Microarray kit, so expression data is treated as available.
is_gene_available = True

# Sample-characteristics row keys per clinical variable (None means "not available").
trait_row = None   # Atherosclerosis status cannot be inferred explicitly from the sample data
age_row = 2        # Age data is available under key 2
gender_row = 1     # Gender data is available under key 1

# Converter placeholders; the actual functions are defined further down in this cell.
convert_trait = None
convert_age = None
convert_gender = None

# Define conversion functions
def convert_trait(value):
    """Placeholder trait converter: ignores ``value`` and always returns None.

    It is not exercised in this step because ``trait_row`` is None.
    """
    return None

def convert_age(value):
    """Parse an age from a ``'<key>: <value>'`` sample-characteristics string.

    The text after the first colon is taken as the age. An optional trailing
    unit (``y``, ``yr``, ``yrs``, ``year``, ``years``; case-insensitive) is
    removed before conversion, so both ``'age: 45y'`` and ``'age: 45'`` give
    ``45.0``. The previous implementation always dropped the last character,
    which silently turned ``'age: 45'`` into ``4.0``.

    Args:
        value: Raw cell content; expected to be a string, anything else is
            treated as missing.

    Returns:
        The age as a float, or None when ``value`` is not a string, has no
        colon separator, or does not contain a finite number.
    """
    # Missing cells may arrive as None/NaN rather than text.
    if not isinstance(value, str):
        return None

    _, separator, age_text = value.partition(':')
    if not separator:
        return None

    age_text = age_text.strip().lower()
    # Longest units first so 'years' is not reduced to 'year' + leftover 's'.
    for unit in ('years', 'year', 'yrs', 'yr', 'y'):
        if age_text.endswith(unit):
            age_text = age_text[:-len(unit)].strip()
            break

    try:
        age = float(age_text)
    except ValueError:
        return None

    # float() accepts 'nan' and 'inf'; neither is a usable age (NaN != NaN).
    if age != age or age in (float('inf'), float('-inf')):
        return None
    return age

def convert_gender(value):
    """Map a ``'<key>: <value>'`` gender string to a binary code.

    The text after the first colon is stripped and lower-cased, then mapped:
    ``'female'`` -> 0, ``'male'`` -> 1. Malformed input now yields None
    instead of raising, matching how ``convert_age`` treats bad values.

    Args:
        value: Raw cell content; expected to be a string, anything else is
            treated as missing.

    Returns:
        0 for female, 1 for male, or None when ``value`` is not a string,
        has no colon separator, or holds any other label.
    """
    # Missing cells may arrive as None/NaN rather than text.
    if not isinstance(value, str):
        return None

    _, separator, label = value.partition(':')
    if not separator:
        return None

    codes = {'female': 0, 'male': 1}
    return codes.get(label.strip().lower())

# Save Metadata
# Passes the cohort id, the JSON path and the two availability flags; the last argument
# evaluates to False here because trait_row is None.
# NOTE(review): save_cohort_info comes from the star import; presumably it writes or
# updates the JSON file at the given path — confirm in utils.preprocess.
save_cohort_info('GSE154851', './preprocessed/Atherosclerosis/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
# Skipped for this cohort: trait_row is None, so nothing below runs and no trait CSV
# is written.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Atherosclerosis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Atherosclerosis/trait_data/GSE154851.csv'
    # NOTE(review): if this is a pandas DataFrame, to_csv will not create missing parent
    # directories — confirm the trait_data folder exists before enabling this branch.
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
