In [1]:

import sys
# Extend the module search path; presumably this is the project root that
# provides the `utils` package imported in the next cell — TODO confirm.
# NOTE(review): this is a machine-specific absolute path, so the notebook will
# not run unchanged on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file for this cohort.
cohort_dir = '/media/techt/DATA/GEO/Arrhythmia/GSE55231'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes passed to the matrix-file reader: the first list selects the
# series-level background text, the second the per-sample clinical rows.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces so the following step can be decided by inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Genome-wide identification of expression quantitative trait loci (eQTLs) in human heart: gene expression"
!Series_summary	"In recent years genome-wide association studies (GWAS) have uncovered numerous chromosomal loci associated with various electrocardiographic traits and cardiac arrhythmia predisposition. A considerable fraction of these loci lie within inter-genic regions. Trait-associated SNPs located in putative regulatory regions likely exert their effect by modulating gene expression. Hence, the key to unraveling the molecular mechanisms underlying cardiac traits is to interrogate variants for association with differential transcript abundance by expression quantitative trait locus (eQTL) analysis. In this study we conducted an eQTL analysis of human heart. To this end, left ventricular mycardium samples from non-diseased human donor hearts were hybridized to Illumina HumanOmniExpress BeadChips for genotyping (n = 129) and Illumina Human H

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; the actual functions are defined further down in
# this cell and replace these values.
convert_trait = None
convert_age = None
convert_gender = None

# Gene expression data is available: Illumina Human HT12 Version 4 BeadChips
# were used for transcription profiling.
is_gene_available = True

# Row keys in the Sample Characteristics Dictionary for each variable.
# 'Arrhythmia' is not explicitly listed there, so no trait row exists.
trait_row = None
# Age is recorded under key 2.
age_row = 2
# Gender is recorded under key 0.
gender_row = 0

# Functions for data type conversion
def extract_value(cell):
    """Return the text that follows the first ': ' separator in a cell.

    Args:
        cell: A raw sample-characteristics entry such as ``"age: 45"``.
            Non-string inputs (e.g. ``None`` or a float NaN standing in for a
            missing entry) are accepted and treated as having no value.

    Returns:
        The substring after the first ``': '``, or ``None`` when ``cell`` is
        not a string or does not contain the separator.
    """
    # Guard first: the membership test `': ' in cell` raises TypeError for
    # non-string inputs, which the converters' `except ValueError` would not
    # catch.
    if not isinstance(cell, str):
        return None
    # partition() splits on the first occurrence only, so any later ': '
    # stays inside the returned value.
    _, separator, value = cell.partition(': ')
    return value if separator else None

def convert_trait(cell):
    """Placeholder trait converter for a cohort without trait data.

    Args:
        cell: Ignored.

    Returns:
        Always ``None``.
    """
    # This dataset has no trait row, so there is nothing to convert.
    return None

def convert_age(cell):
    """Convert an age characteristics cell to a float.

    Args:
        cell: Raw cell text, passed to ``extract_value`` to obtain the part
            after the ': ' separator.

    Returns:
        The extracted value as ``float``, or ``None`` when nothing was
        extracted, the extracted text is empty, or it cannot be parsed as a
        float.
    """
    raw_age = extract_value(cell)
    # Covers both "no separator" (None) and an empty value ("").
    if not raw_age:
        return None
    try:
        return float(raw_age)
    except ValueError:
        return None

def convert_gender(cell):
    """Convert a gender characteristics cell to a binary code.

    Args:
        cell: Raw cell text, passed to ``extract_value`` to obtain the part
            after the ': ' separator.

    Returns:
        ``1`` for ``"male"``, ``0`` for ``"female"``, and ``None`` for any
        other extracted value (including when nothing was extracted).
    """
    gender_codes = {"male": 1, "female": 0}
    # Exact, case-sensitive match; anything else falls through to None.
    return gender_codes.get(extract_value(cell))

# Record cohort metadata; the last argument states whether a trait row was found.
save_cohort_info(
    'GSE55231',
    './preprocessed/Arrhythmia/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are selected, written to CSV and previewed only when a
# trait row exists.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Arrhythmia',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Arrhythmia/trait_data/GSE55231.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
