In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Essential_Thrombocythemia/GSE103238'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file: lines with the series prefixes give the background
#    information, lines with the sample prefixes give the clinical data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the characteristics dictionary,
#    each preceded by its heading on a separate line
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"Gene and miRNA expression profiles in Polycythemia Vera and Essential Thrombocythemia according to CALR and JAK2 mutations [miEP]"
!Series_summary	"Polycythemia vera (PV) and essential thrombocythemia (ET) are Philadelphia-negative myeloproliferative neoplasms (MPNs) characterized by erythrocytosis and thrombocytosis, respectively. Approximately 95% of PV and 50–70% of ET patients harbour the V617F mutation in the exon 14 of JAK2 gene, while about 20-30% of ET patients carry CALRins5 or CALRdel52 mutations. These ET CARL-mutated subjects show higher platelet count and lower thrombotic risk compared to JAK2-mutated patients. Here we showed that CALR-mutated and JAK2V617F-positive CD34+ cells have different gene and miRNA expression profiles. Indeed, we highlighted several pathways differentially activated between JAK2V617F- and CALR-mutated progenitors, i.e. mTOR, MAPK/PI3K and MYC pathways. Furthermore, we unveiled that the expression of several g

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import numpy as np

# Defaults for this cohort; the row indices are filled in below when a matching
# characteristics row is found, and the converters are defined further down.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Gene-expression availability is kept at False.
# A keyword search over the '!Series_*' prefix labels can never match, because the
# labels are not the series text; if this flag is ever derived automatically, the
# search has to run over `background_info` from Step 1 instead.
# NOTE(review): the series title is tagged "[miEP]" -- presumably miRNA expression
# profiling. Confirm against the platform annotation before setting this to True.
is_gene_available = False

# Sample Characteristics Dictionary (row index -> unique values seen in that row)
sample_characteristics = {0: ['supplier: Vannucchi', 'supplier: Cazzola'], 1: ['Sex: M', 'Sex: F', 'Sex: not provided'], 2: ['condition: myeloproliferative neoplasm (MPN)', 'condition: Control (CTR)'], 3: ['disease: ET', 'disease: PV', 'disease: healthy control'], 4: ['jak2v617f: neg', 'jak2v617f: pos'], 5: ['mpl-mutated: neg', 'mpl-mutated: ND', 'tissue: Bone marrow'], 6: ['calr-mutated: pos', 'calr-mutated: neg', 'calr-mutated: ND', 'cell marker: CD34+'], 7: ['calr mutation: L367FS52 (tipo I)', 'calr mutation: 385insTTGTC (tipo II)', 'calr mutation: E386del AGGA', 'calr mutation: K391fs51 (tipo II)', 'calr mutation: del52 (tipo I)', 'gene mutation: V617F', np.nan], 8: ['gene mutation: CALR', 'tissue: Bone marrow', np.nan], 9: ['tissue: Bone marrow', 'cell marker: CD34+', np.nan], 10: ['cell marker: CD34+', np.nan]}

# Assign appropriate rows: the first row (in dictionary order) that matches wins.
# Trait row: the row listing the exact entry 'disease: ET'.
trait_row = next(
    (row for row, entries in sample_characteristics.items() if 'disease: ET' in entries),
    None,
)
# Gender row: the row with any string entry containing 'Sex:' (NaN entries are skipped).
gender_row = next(
    (
        row
        for row, entries in sample_characteristics.items()
        if any(isinstance(entry, str) and 'Sex:' in entry for entry in entries)
    ),
    None,
)

# No row of the dictionary carries age information, so age_row remains None

# Define conversion functions

def convert_trait(value):
    """Convert a 'disease: <label>' characteristic into a binary ET indicator.

    The text after the last ':' is stripped of surrounding whitespace and
    compared with 'ET'.

    Args:
        value: A sample-characteristics cell, normally a string such as
            'disease: ET'; may be a non-string placeholder (e.g. NaN).

    Returns:
        1 if the label is exactly 'ET', 0 for any other string label,
        and None when `value` is not a string.
    """
    # Explicit type guard instead of a blanket `except:` so that unrelated
    # errors (and KeyboardInterrupt) are never silently swallowed.
    if not isinstance(value, str):
        return None
    label = value.rsplit(':', 1)[-1].strip()
    return 1 if label == 'ET' else 0

def convert_gender(value):
    """Convert a 'Sex: <label>' characteristic into a binary gender code.

    The text after the last ':' is stripped and matched case-insensitively.

    Args:
        value: A sample-characteristics cell, normally a string such as
            'Sex: M'; may be a non-string placeholder (e.g. NaN).

    Returns:
        1 for male ('M' / 'male'), 0 for female ('F' / 'female'), and None for
        anything else -- including non-strings and labels such as
        'not provided', which must not be counted as female.
    """
    if not isinstance(value, str):
        return None
    label = value.rsplit(':', 1)[-1].strip().upper()
    if label in ('M', 'MALE'):
        return 1
    if label in ('F', 'FEMALE'):
        return 0
    # Unknown / missing sex: report as missing rather than guessing a category.
    return None

# This cohort has no age field, so age_row and convert_age both stay None

import os

# Save cohort information: the last two arguments are the gene-availability flag
# and whether a trait row was identified
save_cohort_info('GSE103238', './preprocessed/Essential_Thrombocythemia/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Essential_Thrombocythemia', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Essential_Thrombocythemia/trait_data/GSE103238.csv'
    # Make sure the output directory exists; to_csv does not create missing
    # parent directories and would otherwise fail on a fresh checkout.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2758744': [1, 1], 'GSM2758745': [1, 0], 'GSM2758746': [1, 1], 'GSM2758747': [1, 1], 'GSM2758748': [1, 1], 'GSM2758749': [1, 1], 'GSM2758750': [1, 1], 'GSM2758751': [1, 0], 'GSM2758752': [1, 0], 'GSM2758753': [1, 0], 'GSM2758754': [1, 0], 'GSM2758755': [1, 1], 'GSM2758756': [1, 0], 'GSM2758757': [1, 1], 'GSM2758758': [1, 0], 'GSM2758759': [1, 1], 'GSM2758760': [1, 1], 'GSM2758761': [1, 0], 'GSM2758762': [1, 0], 'GSM2758763': [1, 0], 'GSM2758764': [1, 0], 'GSM2758765': [1, 0], 'GSM2758766': [1, 0], 'GSM2758767': [1, 1], 'GSM2758768': [0, 0], 'GSM2758769': [0, 0], 'GSM2758770': [0, 1], 'GSM2758771': [0, 1], 'GSM2758772': [0, 1], 'GSM2758773': [0, 1], 'GSM2758774': [0, 0], 'GSM2758775': [0, 1], 'GSM2758776': [0, 1], 'GSM2758777': [0, 1], 'GSM2758778': [0, 0], 'GSM2758779': [0, 1], 'GSM2758780': [0, 1], 'GSM2758781': [0, 1], 'GSM2758782': [0, 0], 'GSM2758783': [0, 1], 'GSM2758784': [0, 0], 'GSM2758785': [0, 0], 'GSM2758786': [0, 0], 'GSM2758787': [0, 0], 'GSM2758788': [0, 1], 'GSM27587