In [1]:

import sys

# Make the project's local packages importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
# Guarded so that re-running the cell does not append duplicate entries to sys.path.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Essential_Thrombocythemia/GSE103176'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the series-level background lines and the per-sample clinical lines
#    out of the matrix file, selected by their line prefixes
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as {row index: unique values in that row}
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the characteristics dictionary, each under its label
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene and miRNA expression profiles in Polycythemia Vera and Essential Thrombocythemia according to CALR and JAK2 mutations"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_summary	""
!Series_overall_design	"Refer to individual series"
!Series_overall_design	""
Sample Characteristics Dictionary:
{0: ['supplier: Vannucchi', 'supplier: Cazzola'], 1: ['Sex: M', 'Sex: F', 'Sex: not provided'], 2: ['condition: myeloproliferative neoplasm (MPN)', 'condition: Control (CTR)'], 3: ['disease: ET', 'disease: PV', 'disease: healthy control'], 4: ['jak2v617f: neg', 'jak2v617f: pos'], 5: ['mpl-mutated: neg', 'mpl-mutated: ND', 'tissue: Bone marrow'], 6: ['calr-mutated: pos', 'calr-mutated: neg', 'calr-mutated: ND', 'cell marker: CD34+'], 7: ['calr mutation: L367FS52 (tipo I)', 'calr mutation: 385insTTGTC (tipo II)', 'calr mutation: E386del AGGA', 'calr mutation: K391fs51 (tipo II)', 'calr mutation: del52 (tipo I)', 'gene mut

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

import os

# Check if gene expression data is available by inspecting the series title.
# The background information printed by the loading step appears as raw
# '!Series_...' text lines rather than a dict, so a dict-only check would never
# match and the flag would silently stay False. Both forms are handled here.
if isinstance(background_info, dict):
    is_gene_available = 'Gene and miRNA expression' in background_info.get('!Series_title', '')
else:
    is_gene_available = any(
        'Gene and miRNA expression' in line
        for line in str(background_info).splitlines()
        if line.startswith('!Series_title')
    )

# Rows of the sample characteristics dictionary that carry each variable:
# row 3 holds the 'disease: ...' values and row 1 holds the 'Sex: ...' values.
trait_row = 3
age_row = None  # assuming no age data available
gender_row = 1

# Define data type conversion functions
def convert_trait(value):
    """Convert a 'disease: <label>' cell into a binary trait value.

    Args:
        value: Raw cell content, expected in the form 'key: label'.

    Returns:
        1 if the label is 'et', 0 if it is 'pv' or 'healthy control'
        (compared case-insensitively), otherwise None. Cells that are not
        strings (e.g. NaN) or that contain no ':' also yield None instead
        of raising.
    """
    # Missing or malformed cells are treated as unknown rather than crashing.
    if not isinstance(value, str) or ':' not in value:
        return None
    label = value.split(':')[1].strip().lower()
    if label == 'et':
        return 1
    if label in ('pv', 'healthy control'):
        return 0
    return None

def convert_age(value):
    """Convert an 'age: <number>' cell into an integer age.

    Args:
        value: Raw cell content, expected in the form 'key: number'.

    Returns:
        The integer parsed from the text after the first ':', or None when
        the cell is not a string, has no ':', or the text is not an integer.
    """
    try:
        return int(value.split(':')[1].strip())
    # Only parse failures are swallowed; KeyboardInterrupt/SystemExit still propagate.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Convert a 'Sex: <code>' cell into a binary gender value.

    Args:
        value: Raw cell content, expected in the form 'key: code'.

    Returns:
        1 for 'M', 0 for 'F' (compared case-insensitively), otherwise None
        (e.g. 'Sex: not provided'). Cells that are not strings (e.g. NaN) or
        that contain no ':' also yield None instead of raising.
    """
    # Missing or malformed cells are treated as unknown rather than crashing.
    if not isinstance(value, str) or ':' not in value:
        return None
    code = value.split(':')[1].strip().upper()
    if code == 'M':
        return 1
    if code == 'F':
        return 0
    return None

# Record this cohort's metadata: whether gene data and trait data are available
save_cohort_info(
    'GSE103176',
    './preprocessed/Essential_Thrombocythemia/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are only extracted when a trait row was identified
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Essential_Thrombocythemia',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )

    # Write the selected features to <csv_dir>/GSE103176.csv, creating the folder if needed
    csv_dir = './preprocessed/Essential_Thrombocythemia/trait_data'
    os.makedirs(csv_dir, exist_ok=True)
    csv_path = os.path.join(csv_dir, 'GSE103176.csv')
    selected_clinical_data.to_csv(csv_path)

    # Show a preview of the extracted features
    print(preview_df(selected_clinical_data))


{'GSM2758679': [1, 1], 'GSM2758680': [1, 0], 'GSM2758681': [1, 1], 'GSM2758682': [1, 1], 'GSM2758683': [1, 1], 'GSM2758684': [1, 1], 'GSM2758685': [1, 1], 'GSM2758686': [1, 0], 'GSM2758687': [1, 0], 'GSM2758688': [1, 0], 'GSM2758689': [1, 0], 'GSM2758690': [1, 1], 'GSM2758691': [1, 0], 'GSM2758692': [1, 1], 'GSM2758693': [1, 0], 'GSM2758694': [1, 1], 'GSM2758695': [1, 1], 'GSM2758696': [1, 0], 'GSM2758697': [1, 0], 'GSM2758698': [1, 0], 'GSM2758699': [1, 0], 'GSM2758700': [1, 0], 'GSM2758701': [1, 0], 'GSM2758702': [1, 1], 'GSM2758703': [0, 0], 'GSM2758704': [0, 0], 'GSM2758705': [0, 1], 'GSM2758706': [0, 1], 'GSM2758707': [0, 1], 'GSM2758708': [0, 1], 'GSM2758709': [0, 0], 'GSM2758710': [0, 1], 'GSM2758711': [0, 1], 'GSM2758712': [0, 1], 'GSM2758713': [0, 0], 'GSM2758714': [0, 1], 'GSM2758715': [0, 1], 'GSM2758716': [0, 1], 'GSM2758717': [0, 0], 'GSM2758718': [0, 1], 'GSM2758719': [0, 0], 'GSM2758720': [0, 0], 'GSM2758721': [0, 0], 'GSM2758722': [0, 0], 'GSM2758723': [0, 1], 'GSM27587