In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort inputs ------------------------------------------------
# The helper's result is unpacked into the SOFT file path and the matrix file path.
cohort_dir = '/media/techt/DATA/GEO/Essential_Thrombocythemia/GSE42042'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Parse the matrix file ---------------------------------------------------
# Two groups of line prefixes are handed to the parser: series-level lines
# (background) and per-sample lines (clinical characteristics).
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the sample characteristics ------------------------------------
# Unique values per row of the clinical dataframe, used to pick the
# trait/age/gender rows by inspection.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show everything needed for the manual inspection -------------------------
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Aberrant DNA methylation profiling of classic Philadelphia Negative Myeloproliferative Neoplasms"
!Series_summary	"Since most DNA methylation studies in Classic Philadelphia-negative myeloproliferative neoplasms (MPNs) – polycythaemia vera (PV), essential thrombocythaemia (ET), and primary myelofibrosis (PMF) - have been performed on a gene-by-gene basis, a more comprehensive methylation profiling is needed to know the real implication of this alteration. In order to investigate the DNA methylation profile in chronic and transformed phase MPNs, we performed genome-wide DNA methylation arrays in 71 chronic (24 PV, 23 ET and 24 PMF) and 13 transformed MPNs. The three types of chronic MPNs showed the same aberrant DNA methylation pattern when compared to controls. Differentially methylated genes (DMG) were enriched in a gene network centered on the NF-κB pathway, indicating that they may be involved in the pathogenesis of these diseases. In the case 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data availability: the series title and summary describe a
# DNA methylation profiling study, so no gene expression data is expected.
is_gene_available = False

# Template placeholders; the convert_* names are rebound to real functions by
# the definitions that follow in this cell.
convert_trait = convert_age = convert_gender = None

# Row keys of the sample characteristics selected for each variable.
# The values recorded for these rows (each row has more than one distinct value):
#   row 8 - 'disease state': Polycythemia Vera, Essential Thrombocythemia,
#           Primary Myelofibrosis, and PV / ET / PMF 'Transformed to LMA'
#   row 3 - 'age': integers from 28 to 90, plus 'No information'
#   row 2 - 'gender': Female, Male
# Set a key to None if the corresponding variable is not usable.
trait_row = 8
age_row = 3
gender_row = 2

# Define conversion functions
def convert_trait(value):
    """Map a 'disease state' entry to a binary Essential Thrombocythemia label.

    Args:
        value: Raw cell value, normally a string such as
            'disease state: Essential Thrombocythemia'. A bare label without
            the 'field:' prefix is also accepted.

    Returns:
        1 for Essential Thrombocythemia (including 'ET Transformed to LMA'),
        0 for the other listed disease states, and None for missing,
        non-string, or unrecognised values.
    """
    # Non-strings (None, float NaN, lists, ...) can never match a label.
    if not isinstance(value, str) or 'nan' in value.lower():
        return None

    # Take the text after the 'field:' prefix; fall back to the whole string
    # when there is no colon instead of raising IndexError.
    parts = value.split(':')
    label = (parts[1] if len(parts) > 1 else parts[0]).strip()

    mapping = {
        'Essential Thrombocythemia': 1,
        'Polycythemia Vera': 0,
        'Primary Myelofibrosis': 0,
        'PV Transformed to LMA': 0,
        'ET Transformed to LMA': 1,
        'PMF Transformed to LMA': 0
    }
    return mapping.get(label)

def convert_age(value):
    """Convert an 'age' entry to a float.

    Args:
        value: Raw cell value, normally a string such as 'age: 67' or
            'age: No information'. Bare numbers and numeric strings without
            the 'field:' prefix are also accepted.

    Returns:
        The age as a float, or None when the value is missing, marked as
        'No information', NaN, or not convertible to a number.
    """
    import math

    if value is None:
        return None

    if isinstance(value, str):
        lowered = value.lower()
        if 'nan' in lowered or 'no information' in lowered:
            return None
        # Take the text after the 'field:' prefix; fall back to the whole
        # string when there is no colon instead of raising IndexError.
        parts = value.split(':')
        value = (parts[1] if len(parts) > 1 else parts[0]).strip()

    try:
        age = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers non-numeric objects such as lists.
        return None

    # A float NaN counts as missing, the same as the string 'nan'.
    return None if math.isnan(age) else age

def convert_gender(value):
    """Convert a 'gender' entry to a binary code.

    Args:
        value: Raw cell value, normally a string such as 'gender: Female'.
            A bare label without the 'field:' prefix is also accepted.

    Returns:
        0 for female, 1 for male (case-insensitive), and None for missing,
        non-string, or unrecognised values.
    """
    # Non-strings (None, float NaN, ...) can never match a label.
    if not isinstance(value, str) or 'nan' in value.lower():
        return None

    # Take the text after the 'field:' prefix; fall back to the whole string
    # when there is no colon instead of raising IndexError.
    parts = value.split(':')
    label = (parts[1] if len(parts) > 1 else parts[0]).strip().lower()

    return {'female': 0, 'male': 1}.get(label)

import os  # local to this cell so the edit does not depend on the wildcard import

# Save cohort information: cohort id, output JSON path, gene-data availability,
# and whether a trait row was identified.
save_cohort_info('GSE42042', './preprocessed/Essential_Thrombocythemia/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Essential_Thrombocythemia',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Essential_Thrombocythemia/trait_data/GSE42042.csv'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the output folder exists (matters on a fresh checkout).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM785012': [nan, nan, nan], 'GSM785013': [nan, nan, nan], 'GSM785014': [nan, nan, nan], 'GSM785015': [nan, nan, nan], 'GSM785016': [nan, nan, nan], 'GSM785017': [nan, nan, nan], 'GSM785018': [nan, nan, nan], 'GSM785019': [nan, nan, nan], 'GSM1031062': [0.0, 67.0, 0.0], 'GSM1031063': [0.0, 72.0, 0.0], 'GSM1031064': [0.0, 79.0, 0.0], 'GSM1031065': [0.0, 67.0, 1.0], 'GSM1031066': [0.0, 76.0, 1.0], 'GSM1031067': [0.0, 82.0, 1.0], 'GSM1031068': [0.0, 85.0, 1.0], 'GSM1031069': [0.0, 78.0, 1.0], 'GSM1031070': [0.0, 85.0, 1.0], 'GSM1031071': [0, None, 0], 'GSM1031072': [0.0, 62.0, 1.0], 'GSM1031073': [0.0, 76.0, 0.0], 'GSM1031074': [0.0, 54.0, 1.0], 'GSM1031075': [0.0, 46.0, 1.0], 'GSM1031076': [0.0, 63.0, 1.0], 'GSM1031077': [0, None, 0], 'GSM1031078': [0.0, 80.0, 1.0], 'GSM1031079': [0.0, 68.0, 0.0], 'GSM1031080': [0.0, 55.0, 1.0], 'GSM1031081': [0, None, 1], 'GSM1031082': [0.0, 82.0, 0.0], 'GSM1031083': [0.0, 74.0, 0.0], 'GSM1031084': [0.0, 87.0, 0.0], 'GSM1031085': [0.0, 64.0, 1.0], 'GS