In [1]:

import sys
# Extend the module search path with a local directory.
# NOTE(review): presumably this is the checkout that provides the `utils` package
# imported in the next cell — confirm. The absolute path is machine-specific.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- 1. Locate the SOFT file and the series-matrix file of this cohort ---
cohort_dir = '/media/techt/DATA/GEO/Heart_rate/GSE117070'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- 2. Pull background text and per-sample characteristics from the matrix file ---
# Background lines are selected by the series-level prefixes, clinical lines by
# the sample-level prefixes.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- 3. Summarise the clinical dataframe as {row: unique values} ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- 4. Show everything needed to decide which variables are available ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"The Heritage family study - skeletal muscle gene expression"
!Series_summary	"Gene expression profiles generated from skeletal muscle biopsies taken from participants of the HERITAGE family study. Participants completed an endurance training regime in which a skeletal muscle biopsy was taken prior to the start and after the final session of the program. Biopsies were used to generate Affymetrix gene expression microarrays."
!Series_overall_design	"The experimental design and exercise training protocol of the HERITAGE Family Study have been described previously (Bouchard et al., 1995). Participants were sedentary at baseline and normotensive. Each participant exercised three times per week for 20 weeks on cycle ergometers controlled by direct heart rate (HR) monitoring. Muscle biopsies of vastus lateralis were obtained at baseline and post-training."
Sample Characteristics Dictionary:
{0: ['status: pre-training', 'status: post-training']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability flags and row keys start out pessimistic; they are only switched
# on when the metadata printed by the loading step supports it.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
# Initialise all three converter names so that each one exists even before its
# definition below is executed.
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Checking if the dataset contains gene expression data based on the background information.
# NOTE(review): the text searched here is a hard-coded copy of the series summary,
# so this condition is always true; consider testing `background_info` from the
# loading step instead so the check reflects the file that was actually read.
if 'Affymetrix gene expression microarrays' in "!Series_summary\t\"Gene expression profiles generated from skeletal muscle biopsies taken from participants of the HERITAGE family study. Participants completed an endurance training regime in which a skeletal muscle biopsy was taken prior to the start and after the final session of the program. Biopsies were used to generate Affymetrix gene expression microarrays.\"":
    is_gene_available = True

# The sample characteristics dictionary has a single row (key 0) that only holds
# `status: pre-training` / `status: post-training`. No heart-rate, age, or gender
# field is present, so trait_row, age_row, and gender_row all stay None.

# Define conversion functions, though they won't be used since data rows are not identified
def convert_trait(value):
    """Parse a `key: value` characteristics string into a float trait value.

    Args:
        value: A string such as ``'heart rate: 72.5'``.

    Returns:
        The text after the first ':' converted to ``float``, or ``None`` when
        the input is not a string, has no ':' separator, or the remainder is
        not a valid number.
    """
    try:
        # Split on the first ':' only, so the whole remainder is treated as the value.
        return float(value.split(':', 1)[1].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing failures map to "missing"; KeyboardInterrupt, SystemExit
        # and unrelated errors are allowed to propagate.
        return None

def convert_age(value):
    """Parse a `key: value` characteristics string into an integer age.

    Args:
        value: A string such as ``'age: 45'``.

    Returns:
        The text after the first ':' converted to ``int``, or ``None`` when
        the input is not a string, has no ':' separator, or the remainder is
        not a valid integer literal (e.g. ``'45.5'`` or ``'45 years'``).
    """
    try:
        # Split on the first ':' only, so the whole remainder is treated as the value.
        return int(value.split(':', 1)[1].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing failures map to "missing"; KeyboardInterrupt, SystemExit
        # and unrelated errors are allowed to propagate.
        return None

def convert_gender(value):
    """Parse a `key: value` characteristics string into a binary gender code.

    Args:
        value: A string such as ``'gender: female'`` (matching is
            case-insensitive and ignores surrounding whitespace).

    Returns:
        ``0`` for female, ``1`` for male, and ``None`` for any other value,
        for a string without a ':' separator, or for non-string input.
    """
    try:
        gender_str = value.split(':', 1)[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Malformed or missing entries are reported as "unknown" instead of
        # raising, matching the other converters in this cell.
        return None
    return {'female': 0, 'male': 1}.get(gender_str)

# Persist the gene/trait availability of this cohort to the cohort-info JSON.
save_cohort_info(
    'GSE117070',
    './preprocessed/Heart_rate/cohort_info.json',
    is_gene_available,
    trait_row is not None,  # trait counts as available only if a trait row was identified
)


A new JSON file was created at: ./preprocessed/Heart_rate/cohort_info.json
