In [1]:

import sys
# Extend the module search path before the project imports in the next cell.
# Presumably this directory contains the `utils` package — TODO confirm.
# NOTE(review): absolute, machine-specific path; it may need editing on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's input files -------------------------------------
# The helper returns the soft file path and the matrix file path for the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Arrhythmia/GSE34788'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Read background information and sample characteristics --------------
# Lines of the matrix file are selected by prefix: series-level fields form the
# background text, sample-level fields form the clinical dataframe.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the clinical dataframe ------------------------------------
# Sample characteristics dictionary derived from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show everything needed to choose trait/age/gender rows --------------
# Each heading is followed by its value on the next line (sep="\n").
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Genomic signatures of a global fitness index in a multi-ethnic cohort of women"
!Series_summary	"The rates of obesity and sedentary lifestyle are on a dramatic incline, with associated detrimental health effects among women in particular. Although exercise prescriptions are useful for overcoming these problems, success can be hampered by differential responsiveness among individuals in cardiovascular fitness indices (i.e., improvements in strength, lipids, VO2max). Genetic factors appear to play an important role in determining this inter-individual variation in responsiveness.  We performed microarray analyses on mRNA in whole blood from 60 sedentary women from a multi-ethnic cohort who underwent 12 weeks of exercise, to identify gene subsets that were differentially expressed between individuals who experienced the greatest and least improvements in fitness based upon a composite fitness score index. We identified 43 transcripts in 39 unique gen

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is treated as available until the cohort metadata is inspected.
is_gene_available = False
trait_row, age_row, gender_row = None, None, None  # overwritten below when applicable
convert_trait, convert_age, convert_gender = None, None, None  # replaced by functions when applicable

# Gene expression availability: the series summary reports microarray analyses
# on mRNA, so this cohort is treated as gene expression data.
is_gene_available = True

# Row indices of the required variables in the sample characteristics dictionary.
# Trait ('Arrhythmia'): no field refers to arrhythmia directly; the closest ones
# are 'heart rate' and 'composite score', and neither mentions it.
trait_row = None

# Age: not present in the listed dictionary.
age_row = None

# Gender: found under key 1, where 'female' is noted.
# NOTE(review): the series summary describes a cohort of women only, so this
# field may be constant across samples — confirm it is worth keeping.
gender_row = 1

# Define conversion functions
import re 

def extract_value(cell):
    """Return the first non-space token that follows a colon in `cell`.

    Parameters
    ----------
    cell : object
        A sample characteristics entry, expected to look like ``"key: value"``.

    Returns
    -------
    str or None
        The run of non-whitespace characters after ``':'`` and any optional
        whitespace, or None when `cell` is not a string or has no such token.
    """
    # Non-string cells (e.g. None or a float NaN for a missing entry) would make
    # re.search raise TypeError; report them as "no value" instead.
    if not isinstance(cell, str):
        return None
    match = re.search(r':\s*(\S+)', cell)
    return match.group(1) if match else None

def convert_trait(value):
    """Placeholder trait converter.

    The trait is not available in this cohort, so no conversion is applied and
    every input maps to None.
    """
    return None

def convert_age(value):
    """Placeholder age converter.

    Age data is not available in this cohort, so no conversion is applied and
    every input maps to None.
    """
    return None

def convert_gender(value):
    """Map a gender characteristics entry to a binary code.

    Parameters
    ----------
    value : object
        Entry such as ``"gender: female"``; the label is read with
        `extract_value` and compared case-insensitively.

    Returns
    -------
    int or None
        0 for 'female', 1 for 'male'. None for non-string input, for an entry
        from which no value can be extracted, and for any other label.
    """
    # Guard before calling the regex-based helper, which expects text.
    if not isinstance(value, str):
        return None

    true_value = extract_value(value)
    if true_value is None:
        # No "key: value" structure: calling .lower() on None would raise
        # AttributeError, so report the entry as unknown instead.
        return None

    gender_codes = {'female': 0, 'male': 1}
    return gender_codes.get(true_value.lower())

# Save cohort information: hand the cohort id, the output JSON path, the
# gene-availability flag and the trait-availability flag to save_cohort_info.
# `trait_row is not None` evaluates to False here because trait_row is None above.
# NOTE(review): the output path is relative, so it resolves against the
# current working directory — confirm the directory is the intended one.
save_cohort_info('GSE34788', './preprocessed/Arrhythmia/cohort_info.json', is_gene_available, trait_row is not None)
