In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): cohort_dir is an absolute, machine-specific path; this cell
# only runs on a host where that directory exists.
cohort_dir = '/media/techt/DATA/GEO/Brugada_Syndrome/GSE136992'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefixes name the series-matrix header lines to pull: series-level
# title/summary/design for the background text, and per-sample accession plus
# characteristics lines for the clinical table.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# presumably maps each characteristics row key to its unique values; the row
# keys chosen in Step 2 index into this dictionary — verify against the helper.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
# Printed (rather than displayed) so the raw text can be inspected when
# choosing trait/age/gender rows in the next step.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"mRNA expression in SIDS"
!Series_summary	"Genetic predispositions in cases suffering sudden unexpected infant death have been a research focus worldwide the last decade. Despite large efforts there is still uncertainty concerning the molecular pathogenesis of these deaths. With genetic technology in constant development the possibility of an alternative approach into this research field have become available, like mRNA expression studies.  Methods: In this study we investigated mRNA gene expression in 14 cases that died suddenly and unexpectedly from infection without a history of severe illness prior to death. The control group included eight accidents, two cases of natural death, one undetermined, one case of medical malpractice and two homicides. The study included tissue from liver, heart and brain. The mRNA expression was determined using Illumina whole genome gene expression DASL HT assay.  Results: From the array, 19 genes showed altered ex

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data is available: the series summary describes mRNA
# expression measured with an Illumina whole-genome DASL HT assay.
is_gene_available = True

# Row keys into the sample characteristics dictionary for each variable;
# None marks a variable that this cohort does not provide.
# NOTE(review): confirm keys 2 and 3 against the sample characteristics
# dictionary printed in Step 1.
trait_row = None  # The dataset does not directly mention Brugada Syndrome.
age_row = 2  # Age data is available in key 2.
gender_row = 3  # Gender data is available in key 3.

# Define data type conversion functions
def convert_trait(value):
    """Placeholder trait converter that maps every input to None.

    No trait row exists for this cohort (trait_row is None), so this
    function is not expected to be called; it only keeps the converter
    name defined.
    """
    return None

def convert_age(value):
    """Parse a numeric age out of a 'key: value' characteristics string.

    The text after the first colon is stripped and its first
    whitespace-separated token is converted to float, so a trailing unit
    word is ignored (e.g. 'age: 12 weeks' -> 12.0). The unit itself is not
    interpreted.

    Args:
        value: Raw cell value, expected to be a string such as 'age: 0.5'.

    Returns:
        The age as a float, or None when the value is not a string, has no
        colon, has nothing after the colon, or the token is not numeric.
    """
    # Missing cells can arrive as None or NaN rather than str; treat them as
    # unknown instead of letting .split raise AttributeError.
    if not isinstance(value, str):
        return None
    try:
        return float(value.split(':')[1].strip().split()[0])
    except (ValueError, IndexError):
        # IndexError: no colon, or nothing after it. ValueError: non-numeric token.
        return None

def convert_gender(value):
    """Encode a 'key: value' gender string as a binary indicator.

    The text after the first colon is stripped and lower-cased before
    matching, so 'gender: Male' and 'gender: male' are treated alike.

    Args:
        value: Raw cell value, expected to be a string such as 'gender: male'.

    Returns:
        1 for 'male', 0 for 'female', and None for anything else, including
        values that are not strings or that contain no colon.
    """
    # Return None on malformed input, as convert_age does, rather than
    # raising IndexError (no colon) or AttributeError (non-string).
    if not isinstance(value, str) or ':' not in value:
        return None
    gender_str = value.split(':')[1].strip().lower()
    # dict.get yields None for any label other than the two recognised ones.
    return {'male': 1, 'female': 0}.get(gender_str)

# Save cohort information
# Passes the cohort id, the JSON path, the gene-availability flag, and
# whether a trait row was identified (False here, since trait_row is None).
save_cohort_info('GSE136992', './preprocessed/Brugada_Syndrome/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
# Skipped entirely when no trait row was identified for this cohort.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Brugada_Syndrome',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender
    )
    csv_path = './preprocessed/Brugada_Syndrome/trait_data/GSE136992.csv'
    # to_csv does not create missing parent directories; make sure the
    # trait_data folder exists so the write cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Brugada_Syndrome/cohort_info.json
