In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Heart_rate/GSE35661'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"A transcriptional map of the impact of endurance exercise training on skeletal muscle phenotype"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"Refer to individual Series. The P-odd number samples are baseline pre-training, while the P-even numbers are baseline post supervised endurance exercise training. The arrays were produced as part of a collaboration written and established by J Timmons (PI) between Pfizer UK LTD and Karolinska in 1998. Original array profiles were produced in 2001 (U95 arrays) on a subset of 'responders' and the informatics analysis carried out by Ola Larsson and J Timmons in 2004. The full cohort was re-profiled by J Timmons in 2006 using U133+2 arrays (as deposited here). "
Sample Characteristics Dictionary:
{0: ['protocol: Resting skeletal muscle sample after to endurance training'], 1: ['gender: male'], 2: ['heart rate (bpm): 151', 'heart rate (bpm): 123', 'heart ra

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

import numpy as np  # Import numpy to use np.nan

# Check if gene expression data is likely present
if '!Series_title' in ['"A transcriptional map of the impact of endurance exercise training on skeletal muscle phenotype"']:
    is_gene_available = True

# Identify rows for 'Heart_rate', 'age', and 'gender' and their conversion functions
try:
    sample_characteristics = {
        0: ['gender: male'],
        1: ['protocol: Resting skeletal muscle sample prior to endurance training'],
        2: ['heart rate (bpm): 173', 'heart rate (bpm): 155', 'heart rate (bpm): 183', 'heart rate (bpm): 149', 'heart rate (bpm): 146', 'heart rate (bpm): 157', 'heart rate (bpm): 162', 'heart rate (bpm): 170', 'heart rate (bpm): 165', 'heart rate (bpm): 144', 'heart rate (bpm): 167', 'heart rate (bpm): 191', 'heart rate (bpm): 160', 'heart rate (bpm): 177', 'heart rate (bpm): 174', 'heart rate (bpm): 190', 'heart rate (bpm): 169', np.nan],
        3: ['age: 25', 'age: 30', 'age: 28', 'age: 35'],  # Verify actual key-value pairs for age in the dataset
        # Additional fields...
    }

    if sample_characteristics.get(2) and len(sample_characteristics[2]) > 1:
        trait_row = 2

    if sample_characteristics.get(0) and len(sample_characteristics[0]) > 1:
        gender_row = 0

    if sample_characteristics.get(3) and len(sample_characteristics[3]) > 1:  # Verify key for age
        age_row = 3

    # Define conversion functions
    def convert_trait(value):
        try:
            if ':' in value:
                return float(value.split(':')[1].strip())
            return None
        except:
            return None

    def convert_gender(value):
        try:
            if ':' in value:
                gender_value = value.split(':')[1].strip().lower()
                return 1 if gender_value == 'male' else 0 if gender_value == 'female' else None
            return None
        except:
            return None

    def convert_age(value):
        try:
            if ':' in value:
                return float(value.split(':')[1].strip())
            return None
        except:
            return None

except KeyError:
    pass

# Save cohort info
save_cohort_info('GSE35661', './preprocessed/Heart_rate/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Heart_rate', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender
    )
    csv_path = './preprocessed/Heart_rate/trait_data/GSE35661.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM873144': [151.0, 2.68], 'GSM873145': [123.0, 1.82], 'GSM873146': [156.0, 2.88], 'GSM873147': [137.0, 2.93], 'GSM873148': [135.0, 2.99], 'GSM873149': [155.0, 2.6], 'GSM873150': [130.0, 3.17], 'GSM873151': [163.0, 2.63], 'GSM873152': [160.0, 1.9], 'GSM873153': [128.0, 2.38], 'GSM873154': [131.0, 2.35], 'GSM873155': [146.0, 2.88], 'GSM873156': [163.0, 2.42], 'GSM873157': [134.0, 2.56], 'GSM873158': [151.0, 2.96], 'GSM873159': [158.0, 2.21], 'GSM873160': [162.0, 2.407], 'GSM873161': [150.0, 1.99], 'GSM873162': [165.0, 2.46], 'GSM873163': [182.0, 3.2], 'GSM873164': [168.0, 2.2], 'GSM873165': [151.0, 3.22], 'GSM873166': [150.0, 2.71], 'GSM873167': [165.0, 2.05]}
