In [1]:

import sys

# Directory added to the module search path; presumably the project checkout
# that provides `utils.preprocess` -- confirm against the local setup.
PROJECT_ROOT = '/home/techt/Desktop/a4s'

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Allergies/GSE162926'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull background information and sample characteristics out of the matrix file.
#    The prefixes name the matrix-file fields handed to the helper for each category.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the sample characteristics dictionary
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"MicroRNA expression data from serum samples of adult atopic dermatitis and psoriasis patients and healthy controls"
!Series_summary	"Atopic dermatitis (AD) and psoriasis vulgaris (PV) affect up to 3-5% of adults in developed countries and severely impair their quality of life thorough adulthood. MicroRNAs contribute to either the development or regulation of several diseases and are present in body fluids, such as serum or plasma, where their identification is of remarkable value as minimally invasive circulating markers of disease."
!Series_summary	"Although several miRNAs have been associated with AD, to our knowledge, no serum miRNA profiling of adult European AD patients has been published to date and no comparison of AD and PV has been performed."
!Series_overall_design	"We conducted a miRNA profiling analysis of serum samples from adult AD and PV patients and control individuals."
Sample Characteristics Dictionary:
{0: ['subject status: atop

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene-expression availability flag: False, since the dataset focuses on miRNA data.
is_gene_available = False

# Converter placeholders; a function is defined later only for fields that are available.
convert_trait = None
convert_age = None
convert_gender = None

# Sample Characteristics Dictionary, copied from the STEP 1 output.
sample_characteristics_dict = {
    0: [
        'subject status: atopic dermatitis adult patient',
        'subject status: healthy control adult',
        'subject status: psoriasis adult patient',
    ],
    1: ['tissue: serum'],
}

# Keys of the rows that carry each clinical variable (None when the variable is absent).
trait_row = 0      # subject status: atopic dermatitis / healthy control / psoriasis
age_row = None     # no age data available
gender_row = None  # no gender data available

# Function to convert trait values
def convert_trait(value):
    """Convert a raw 'subject status' characteristic into a binary trait label.

    Parameters
    ----------
    value : str
        Characteristic cell of the form ``'<field>: <status>'``, for example
        ``'subject status: healthy control adult'``.

    Returns
    -------
    int or None
        1 for an atopic dermatitis patient, 0 for a healthy control, and None
        for every other input: psoriasis patients (only allergy vs. healthy
        control is of interest), unrecognised statuses, non-string cells such
        as None/NaN, and strings that contain no ``':'`` separator.
    """
    # Missing or malformed cells cannot be parsed; report them as unknown
    # instead of raising AttributeError / IndexError.
    if not isinstance(value, str) or ':' not in value:
        return None

    status = value.split(':')[1].strip().lower()

    # Psoriasis is deliberately absent from the table, so it maps to None.
    label_by_status = {
        'atopic dermatitis adult patient': 1,
        'healthy control adult': 0,
    }
    return label_by_status.get(status)

# Since age and gender are not available, no need to define convert_age and convert_gender functions

# Save the cohort information: gene-data flag plus whether a trait row was identified
is_trait_available = trait_row is not None
save_cohort_info(
    'GSE162926',
    './preprocessed/Allergies/cohort_info.json',
    is_gene_available,
    is_trait_available,
)

# Clinical Feature Extraction (only when a trait row exists)
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Allergies',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Write the selected features to CSV, then print a preview of them
    csv_path = './preprocessed/Allergies/trait_data/GSE162926.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4966311': [1], 'GSM4966312': [1], 'GSM4966313': [1], 'GSM4966314': [1], 'GSM4966315': [1], 'GSM4966316': [1], 'GSM4966317': [1], 'GSM4966318': [1], 'GSM4966319': [1], 'GSM4966320': [1], 'GSM4966321': [1], 'GSM4966322': [1], 'GSM4966323': [1], 'GSM4966324': [1], 'GSM4966325': [1], 'GSM4966326': [1], 'GSM4966327': [0], 'GSM4966328': [0], 'GSM4966329': [0], 'GSM4966330': [0], 'GSM4966331': [0], 'GSM4966332': [0], 'GSM4966333': [0], 'GSM4966334': [0], 'GSM4966335': [0], 'GSM4966336': [0], 'GSM4966337': [0], 'GSM4966338': [0], 'GSM4966339': [0], 'GSM4966340': [0], 'GSM4966341': [0], 'GSM4966342': [0], 'GSM4966343': [0], 'GSM4966344': [None], 'GSM4966345': [None], 'GSM4966346': [None], 'GSM4966347': [None], 'GSM4966348': [None], 'GSM4966349': [None], 'GSM4966350': [None], 'GSM4966351': [None], 'GSM4966352': [None], 'GSM4966353': [None], 'GSM4966354': [None], 'GSM4966355': [None], 'GSM4966356': [None], 'GSM4966357': [None], 'GSM4966358': [None], 'GSM4966359': [None]}
