In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Eczema/GSE123086'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"A validated single-cell-based strategy to identify diagnostic and therapeutic targets in complex diseases [study of 13 diseases]"
!Series_summary	"We conducted prospective clinical studies to validate the importance of CD4+ T cells in 13 diseases from the following ICD-10-CM chapters: Neoplasms (breast cancer, chronic lymphocytic leukemia); endocrine, nutritional and metabolic diseases (type I diabetes, obesity); diseases of the circulatory system (atherosclerosis); diseases of the respiratory system (acute tonsillitis, influenza, seasonal allergic rhinitis, asthma); diseases of the digestive system (Crohn’s disease [CD], ulcerative colitis [UC]); and diseases of the skin and subcutaneous tissue (atopic eczema, psoriatic diseases)."
!Series_summary	"Study participants were recruited by clinical specialists based on diagnostic criteria defined by organizations representing each specialist’s discipline. Age and gender matched healthy controls (n = 1

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Determine if the dataset contains gene expression data
is_gene_available = True  # Based on the description of microarray-based gene expression analysis

# Identify the keys for the variables 'Eczema', 'age', and 'gender'
trait_row = 1  # 'primary diagnosis: ATOPIC_ECZEMA' is present in key 1
age_row = 3  # 'age' is present in key 3
# Use given Sample Characteristics Dictionary to determine if gender_row should be key 3 or 2
sample_dict = {
    0: ['cell type: CD4+ T cells'], 
    1: ['primary diagnosis: ATOPIC_ECZEMA'], 
    2: ['Sex: Male', 'Sex: Female'], 
    3: ['age: 56', 'Sex: Male']
}
gender_row = 3 if any('Sex' in s for s in sample_dict[3]) else 2

# Define the conversion functions
def convert_trait(value):
    """Convert eczema trait to binary."""
    if 'ATOPIC_ECZEMA' in value:
        return 1
    elif 'HEALTHY_CONTROL' in value:  # Assuming healthy control represents absence of disease
        return 0
    return None

def convert_age(value):
    """Convert age values to continuous, handle unknown values."""
    try:
        age = int(value.split(':')[1].strip())
        return age
    except Exception:
        return None

def convert_gender(value):
    """Convert gender to binary (female as 0, male as 1)."""
    gender_mapping = {'Male': 1, 'Female': 0}
    try:
        gender = value.split(':')[1].strip()
        return gender_mapping.get(gender, None)
    except Exception:
        return None

# Save metadata
save_cohort_info('GSE123086', './preprocessed/Eczema/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Eczema', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Eczema/trait_data/GSE123086.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3494884': [None, 56, None], 'GSM3494885': [None, None, 1], 'GSM3494886': [None, 20, None], 'GSM3494887': [None, 51, None], 'GSM3494888': [None, 37, None], 'GSM3494889': [None, 61, None], 'GSM3494890': [None, None, 1], 'GSM3494891': [None, 31, None], 'GSM3494892': [None, 56, None], 'GSM3494893': [None, 41, None], 'GSM3494894': [None, 61, None], 'GSM3494895': [None, None, 1], 'GSM3494896': [None, 80, None], 'GSM3494897': [None, 53, None], 'GSM3494898': [None, 61, None], 'GSM3494899': [None, 73, None], 'GSM3494900': [None, 60, None], 'GSM3494901': [None, 76, None], 'GSM3494902': [None, 77, None], 'GSM3494903': [None, 74, None], 'GSM3494904': [None, 69, None], 'GSM3494905': [None, 77, None], 'GSM3494906': [None, 81, None], 'GSM3494907': [None, 70, None], 'GSM3494908': [None, 82, None], 'GSM3494909': [None, 69, None], 'GSM3494910': [None, 82, None], 'GSM3494911': [None, 67, None], 'GSM3494912': [None, 67, None], 'GSM3494913': [None, 78, None], 'GSM3494914': [None, 67, None], 'GSM349491

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])


Index(['1', '2', '3', '9', '10', '12', '13', '14', '15', '16', '18', '19',
       '20', '21', '22', '23', '24', '25', '26', '27'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1', '2', '3', '9', '10'], 'ENTREZ_GENE_ID': ['1', '2', '3', '9', '10'], 'SPOT_ID': [1.0, 2.0, 3.0, 9.0, 10.0]}


### Step 6: Gene Identifier Mapping

In [7]:
if requires_gene_mapping:
    # 1. Define the keys based on gene annotation preview
    identifier_key = 'ID'
    gene_symbol_key = 'ENTREZ_GENE_ID'
    
    # 2. Get the dataframe storing the mapping between probe IDs and genes
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    
    # 3. Apply the mapping with the 'apply_gene_mapping' function from the library
    gene_data = apply_gene_mapping(gene_data, gene_mapping)

# Print the first 5 rows of the gene_data to preview the result
print(preview_df(gene_data, n=5))


{'GSM3494884': [3.34425400475, 3.0494266268, 3.182710842125, 3.207423375333333, 3.364632287642857], 'GSM3494885': [2.86592954175, 2.7821835701, 2.979813854, 3.3491639536666664, 3.2763967985714286], 'GSM3494886': [3.49059042925, 3.4020460866, 3.209781053125, 3.2482814225, 3.218083065285714], 'GSM3494887': [3.642837544, 3.2814452273, 3.6585812349999998, 4.037506882, 3.6497102672142856], 'GSM3494888': [3.39326150125, 3.4774232253, 3.0704803415, 3.435934630166667, 3.3210652113571433], 'GSM3494889': [3.6071324117500003, 2.8223040736000002, 3.017511599, 2.8975733945, 3.1559889900714286], 'GSM3494890': [3.421824487, 3.3754585474, 3.2680864455, 3.482354789166667, 3.4329731635714285], 'GSM3494891': [2.84669971075, 3.1706691374, 2.791645936125, 3.131017525166667, 3.0412276465714285], 'GSM3494892': [3.258939833, 3.7046911623, 3.28179425175, 3.6921536151666667, 3.454007694285714], 'GSM3494893': [2.97233709625, 3.0865112714, 3.100345110875, 2.968897399, 3.0093484583571426], 'GSM3494894': [3.4576681