In [1]:

# Extend the module search path before the `utils.preprocess` import in the next cell.
# NOTE(review): absolute, machine-specific path — presumably the project root that
# provides the `utils` package; confirm before running on another machine.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Essential_Thrombocythemia/GSE11003'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the matrix reader: series-level background fields
# first, then the per-sample clinical fields.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background information and the dictionary, one item per line,
# so the row keys for the next step can be chosen by inspection.
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"Gene expression profiling distinguishes JAK2V617F-negative from JAK2V617F-positive patients in essential thrombocythemia"
!Series_summary	"In order to explore the gene expression signature in essential thrombocythemia (ET) patients in relation to JAK2V617F mutational status, expression profiling in circulating granulocytes was performed. Twenty ET were studied by microarray analysis and the results were confirmed by real-time quantitative RT-PCR in 40 ET patients, not receiving cytoreductive treatment. A heterogeneous molecular signature characterized by two main gene expression patterns was found: one with an up-regulation of inflammatory genes related to neutrophil activation and thrombosis, and the other one with significantly lower expression of these genes. Supervised clustering analysis showed 30 genes differentially expressed between JAK2V617F-negative and JAK2V617F-positive ET patients. Among the JAK2V617F-negative, a set of 14 genes (CISH

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The dataset is marked as containing gene expression data.
is_gene_available = True

# Row keys into the sample characteristics data for each variable.
# The trait has no directly usable row, so its key stays None.
# NOTE(review): age and gender both point at row 0 — confirm against the
# printed sample characteristics dictionary that this is intended.
essential_thrombocythemia_key, age_key, gender_key = None, 0, 0
trait_row, age_row, gender_row = essential_thrombocythemia_key, age_key, gender_key

# Converter placeholders; the actual functions are defined further down.
convert_trait = convert_age = convert_gender = None

# Define conversion functions
def extract_value(cell):
    """Return the second ': '-delimited field of a characteristics cell.

    Args:
        cell: Raw cell content, expected to look like 'key: value'.

    Returns:
        The field that follows the first ': ' separator (up to the next
        separator, if any), or None when the cell is not a string or
        contains no ': ' separator.
    """
    # Non-string cells (e.g. None or a float NaN) have no .split(); report
    # them as "no value" instead of raising AttributeError.
    if not isinstance(cell, str):
        return None
    parts = cell.split(': ')
    return parts[1] if len(parts) > 1 else None

def convert_trait(value):
    """Placeholder trait converter that maps every input to None.

    No explicit mapping from cell content to a trait label is defined, so
    the argument is ignored.
    """
    return None

def convert_age(cell):
    """Parse the age from a characteristics cell.

    Args:
        cell: Raw cell content, expected to look like 'key: value'.

    Returns:
        The value part as a float, or None when the cell is not a string,
        has no value part, or the value part is not numeric.
    """
    # Guard here so a non-string cell (None, NaN, ...) never reaches the
    # string parsing below, where it would raise AttributeError.
    if not isinstance(cell, str):
        return None
    try:
        return float(extract_value(cell))
    except (ValueError, TypeError):
        # ValueError: the value text is not numeric.
        # TypeError: no value part was found (extract_value returned None).
        return None

def convert_gender(cell):
    """Encode the gender from a characteristics cell as a binary value.

    Args:
        cell: Raw cell content, expected to look like 'key: value'.

    Returns:
        0 for 'female', 1 for 'male' (case-insensitive, surrounding
        whitespace ignored), or None when the cell is not a string, has no
        value part, or holds any other value.
    """
    # Non-string cells (None, NaN, ...) carry no gender information.
    if not isinstance(cell, str):
        return None
    value = extract_value(cell)
    # extract_value yields None when the cell has no ': ' separator; calling
    # .lower() on that would raise AttributeError.
    if not isinstance(value, str):
        return None
    # dict.get returns None for anything that is neither 'female' nor 'male'.
    return {'female': 0, 'male': 1}.get(value.strip().lower())

# Record the cohort metadata; the last argument states whether a trait row
# was identified for this cohort.
save_cohort_info(
    'GSE11003',
    './preprocessed/Essential_Thrombocythemia/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are selected, written to CSV and previewed only when a
# trait row exists; otherwise this step is skipped entirely.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Essential_Thrombocythemia',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Essential_Thrombocythemia/trait_data/GSE11003.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
