In [1]:

# `sys` is needed only to extend the module search path below.
import sys
# Add the project checkout to the import path.
# NOTE(review): this is an absolute, machine-specific location; presumably it is
# the directory that holds the `utils` package imported in the next cell —
# confirm and adjust before running on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
# NOTE(review): the star import supplies every helper called in this notebook
# (geo_get_relevant_filepaths, get_background_and_clinical_data, ...); it is
# kept as-is because later cells may rely on names that are not visible here.
from utils.preprocess import *

# --- Locate the SOFT file and the series-matrix file of this cohort ---------
cohort_dir = '/media/techt/DATA/GEO/Underweight/GSE84954'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Pull series-level background lines and per-sample clinical lines -------
# Lines of the matrix file are selected by these prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the distinct values found in each characteristics row --------
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show both pieces so the next step can be decided by inspection ---------
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Expression data from liver, muscle and fat tissue of children with end stage liver disease"
!Series_summary	"Cachexia, described as a syndrome of weight loss, muscle wasting, fat loss and insulin resistance has been described in patients with chronic liver disease. Whereas extensive work is being done to delineate these molecular pathways in adult patients with chronic liver or other disease, very little is known about these pathways in children with chronic liver disease."
!Series_summary	"We used microarrays to detail the global programme of gene expression underlying the metabolic processes of cachexia in children with end stage liver disease udergoing liver transplantion. We included tissue from patients with Crigler-najjar syndrome as controls. We were able to identify distinct classes of differentially regulated genes related to these processes."
!Series_overall_design	"9 liver,  11 muscle (rectus abdominis) and 11 subcutaneous fat tissue sa

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data is treated as available: the series summary states that
# microarrays were used to profile gene expression.
is_gene_available = True

# Converter placeholders; the function definitions that follow in this cell
# replace them with real callables.
convert_trait = None
convert_age = None
convert_gender = None

# Unique values per sample-characteristics row, written out literally so the
# row choice below can be read directly.
# NOTE(review): presumably transcribed from the dictionary printed in Step 1 —
# confirm it still matches the loaded data.
sample_characteristics = {
    0: ['subjectid: 6', 'subjectid: 8', 'subjectid: 9', 'subjectid: 10', 'subjectid: 11', 'subjectid: 12', 'subjectid: 15', 'subjectid: 1', 'subjectid: 17', 'subjectid: CN1', 'subjectid: CN2', 'subjectid: 3', 'subjectid: 4'],
    1: ['disease: Alagille', 'disease: chronic liver disease-BA', 'disease: chronic liver disease-a1AT', 'disease: chronic liver disease-BC', 'disease: chronic liver disease-NSC', 'disease: Crigler-Najjar'],
    2: ['tissue: liver', 'tissue: muscle (rectus abdominis)', 'tissue: subcutaneous fat'],
}

# Rows 0-2 hold subject id, disease and tissue only. None of them records
# underweight status, so the trait is marked as unavailable.
trait_row = None

# Likewise, no row carries age information.
age_row = None

# And no row carries gender information.
gender_row = None

def convert_trait(value):
    """Turn a ``'<field>: <value>'`` characteristic string into a binary underweight flag.

    Args:
        value: Raw sample-characteristics cell, e.g. ``'trait: underweight'``.

    Returns:
        1 when the text after the first ``': '`` separator is ``underweight``
        (case-insensitive, surrounding whitespace ignored), 0 for any other
        text, and None when ``value`` is not a string or contains no ``': '``
        separator.
    """
    # Missing cells arrive as None/NaN rather than text; treat them as unknown.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    if len(parts) < 2:
        return None
    # Normalise case and whitespace so 'Underweight ' is not silently scored as 0.
    # The literal 'underweight' is a placeholder indicator; adjust it if the
    # dataset encodes the trait differently.
    label = parts[1].strip().lower()
    return 1 if label == 'underweight' else 0

def convert_age(value):
    """Parse the numeric age out of a ``'<field>: <value>'`` characteristic string.

    Args:
        value: Raw sample-characteristics cell, e.g. ``'age: 12'``.

    Returns:
        The age as a float, or None when ``value`` is not a string, has no
        ``': '`` separator, or the text after the separator is not a finite
        number.
    """
    # Missing cells arrive as None/NaN rather than text; treat them as unknown.
    if not isinstance(value, str):
        return None
    try:
        age = float(value.split(': ')[1])
    except (IndexError, ValueError):
        # IndexError: no ': ' separator; ValueError: non-numeric text.
        return None
    # float() accepts 'nan' and 'inf'; neither is a usable age, so report
    # them as missing instead of letting them through as numbers.
    if age != age or abs(age) == float('inf'):
        return None
    return age

def convert_gender(value):
    """Encode the gender in a ``'<field>: <value>'`` characteristic string.

    Args:
        value: Raw sample-characteristics cell, e.g. ``'gender: Male'``.

    Returns:
        1 for ``male``, 0 for ``female`` (case-insensitive, surrounding
        whitespace ignored), and None for any other text, for a string with no
        ``': '`` separator, or for a non-string input.
    """
    # Missing cells arrive as None/NaN rather than text; treat them as unknown.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    if len(parts) < 2:
        return None
    # A lookup table keeps the encoding in one place; unknown labels map to None.
    encoding = {'male': 1, 'female': 0}
    return encoding.get(parts[1].strip().lower())

# Record this cohort's status in the shared JSON file. The last argument tells
# the helper whether a trait row was identified.
save_cohort_info(
    'GSE84954',
    './preprocessed/Underweight/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted and written only when a trait row exists;
# with trait_row left as None this branch is skipped.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Underweight',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Underweight/trait_data/GSE84954.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Underweight/cohort_info.json
