In [1]:

# Extend the module search path with a hard-coded local directory.
# NOTE(review): presumably this is what lets `utils.preprocess` (imported in the
# next cell) resolve - confirm. The absolute path is machine-specific.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Sickle_Cell_Anemia/GSE11524'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull background information and sample characteristics out of the matrix file,
#    selecting lines by the prefixes listed below
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the sample characteristics dictionary,
#    each preceded by its heading on a separate line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"The platelet transcriptome reveals changes in arginine metabolic pathways in patients with sickle cell disease"
!Series_summary	"In sickle cell disease, ischemia-reperfusion injury and intravascular hemolysis produce endothelial dysfunction and vasculopathy characterized by reduced nitric oxide (NO) and arginine bioavailability. Recent functional studies of platelets in patients with sickle cell disease reveal a basally activated state, suggesting that pathological platelet activation may contribute to sickle cell disease vasculopathy. Studies were therefore undertaken to examine transcriptional signaling pathways in platelets that may be dysregulated in sickle cell disease. We demonstrate and validate here the feasibility of comparative platelet transcriptome studies on clinical samples from single donors, by the application of RNA amplification followed by microarray-based analysis of 54,000 probe sets. Data mining an existing microarray databas

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Placeholders for the converters; the real definitions follow later in this cell.
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Define gene availability.
# The keywords must be searched for in the series metadata held in `background_info`
# (loaded and printed in Step 1). Testing them against the literal prefix strings
# '!Series_title' / '!Series_summary' can never match and left the flag at False.
# str() keeps the check valid even if `background_info` is not a plain string.
background_text = str(background_info)
is_gene_available = (
    "platelet transcriptome" in background_text
    or "microarray-based analysis" in background_text
)

# Sample characteristics as reported for this cohort (a single row, key 0)
sample_characteristics = {
    0: ['Tissue: peripheral blood, Subject:Sickle cell patient', 'Tissue: peripheral blood, Subject:Healthy Control']
}

# Row keys of the variables within the sample characteristics
trait_row = 0  # 'Sickle_Cell_Anemia' can be inferred from this key
age_row = None  # age data is not available in the sample characteristics
gender_row = None  # gender data is not available in the sample characteristics

# Define conversion functions
def convert_trait(value):
    """Convert a sample-characteristics entry into a binary trait label.

    Args:
        value: The raw cell content, e.g.
            'Tissue: peripheral blood, Subject:Sickle cell patient'.

    Returns:
        1 if the text contains 'Sickle cell patient', 0 if it contains
        'Healthy Control', and None otherwise. Non-string input (None, NaN, ...)
        also yields None instead of raising TypeError on the substring test.
    """
    # Missing cells may arrive as None or a float NaN; `in` is undefined for those.
    if not isinstance(value, str):
        return None
    if 'Sickle cell patient' in value:
        return 1
    if 'Healthy Control' in value:
        return 0
    return None

def convert_age(value):
    """Always return None: age data is not available for this cohort."""
    return None


def convert_gender(value):
    """Always return None: gender data is not available for this cohort."""
    return None


# Save cohort information (gene availability flag and whether a trait row exists)
save_cohort_info(
    'GSE11524',
    './preprocessed/Sickle_Cell_Anemia/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction runs only when a trait row was identified
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Sickle_Cell_Anemia',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Sickle_Cell_Anemia/trait_data/GSE11524.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM290396': [1], 'GSM290397': [1], 'GSM290398': [1], 'GSM290399': [1], 'GSM290400': [1], 'GSM290401': [1], 'GSM290402': [1], 'GSM290403': [1], 'GSM290404': [1], 'GSM290405': [1], 'GSM290406': [1], 'GSM290407': [1], 'GSM290408': [1], 'GSM290409': [1], 'GSM290410': [1], 'GSM290411': [1], 'GSM290412': [1], 'GSM290413': [1], 'GSM290414': [0], 'GSM290415': [0], 'GSM290416': [0], 'GSM290417': [0], 'GSM290418': [0], 'GSM290419': [0], 'GSM290420': [0], 'GSM290421': [0], 'GSM290422': [0], 'GSM290423': [0], 'GSM290424': [0], 'GSM290425': [0]}
