In [1]:

import os
import sys
# Extend the module search path with the project checkout; presumably this is
# what lets `utils.preprocess` (imported in the next cell) resolve.
# NOTE(review): hard-coded absolute, machine-specific path -- the notebook will
# not run elsewhere without editing it; consider a configurable project root.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Physical_Exercise_Response/GSE133910'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull two things out of the matrix file, selected by line prefix:
#   * series-level background text
#   * per-sample accession and characteristics rows (the clinical table)
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table as the unique values found in each row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces so the variable rows can be picked by inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Systematic assessment of blood-borne microRNAs highlights molecular profiles of endurance sport and carbohydrate uptake"
!Series_summary	"Multiple studies endorsed the positive effect of regular exercising on mental and physical health. However, the molecular mechanisms underlying training-induced fitness in combination with personal life-style remain largely unexplored. Circulating biomarkers such as microRNAs (miRNAs) offer themselves for studying systemic and cellular changes since they can be collected from the bloodstream in a low-invasive manner. In Homo sapiens miRNAs are known to regulate a substantial number of protein-coding genes in a post-transcriptional manner and hence are of great interest to understand differential gene expression profiles, offering a cost-effective mechanism to study molecular training adaption, and connecting the dots from genomics to observed phenotypes."
!Series_summary	"Here, we investigated molecular expressi

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Start from "not available" defaults. The row indices are filled in at the end
# of this block; the converter names are rebound to real functions further down
# in this cell.
trait_row = age_row = gender_row = None
convert_trait = convert_age = convert_gender = None

# Gene expression availability:
# the series background describes miRNA profiling, so this cohort is treated
# as containing no gene expression data.
is_gene_available = False

# Rows of the sample characteristics table used for each variable:
#   'Physical_Exercise_Response' -> row 4; not recorded directly, inferred from
#                                   'timepoint' (and/or 'cho-first-round(y/n)')
#   'age'                        -> row 2
#   'gender'                     -> row 1
trait_row, age_row, gender_row = 4, 2, 1

# Define conversion functions based on the data type chosen.

def convert_trait(value):
    """Convert a 'timepoint' characteristic string into a binary trait label.

    The input is expected to look like ``'<key>: <label>'``. Labels 'E1' and
    'E2' map to 0; labels 'A1' and 'A2' map to 1.
    NOTE(review): the meaning of the E*/A* timepoints is not visible here --
    confirm against the series description that this split is the intended
    'Physical_Exercise_Response' proxy.

    Args:
        value: Raw cell from the sample characteristics table.

    Returns:
        0 or 1 for a recognised label; None when ``value`` is not a string,
        has no ``':'`` separator, or carries an unrecognised label.
    """
    # Non-string cells (e.g. None / NaN) and strings without a key/value
    # separator carry no usable label.
    if not isinstance(value, str) or ':' not in value:
        return None

    # Split on the first colon only and strip, so 'key:label' and labels with
    # surrounding whitespace are accepted as well as the usual 'key: label'.
    label = value.split(':', 1)[1].strip()

    if label in ('E1', 'E2'):
        return 0
    if label in ('A1', 'A2'):
        return 1
    return None

def convert_age(value):
    """Convert an 'age' characteristic string into a float.

    The input is expected to look like ``'<key>: <number>'``.

    Args:
        value: Raw cell from the sample characteristics table.

    Returns:
        The numeric part as a float, or None when ``value`` is not a string,
        has no ``':'`` separator, or the part after it is not a valid number.
    """
    # Non-string cells (e.g. None / NaN) and strings without a key/value
    # separator cannot be parsed.
    if not isinstance(value, str) or ':' not in value:
        return None

    try:
        # float() ignores surrounding whitespace, so 'age: 47.6' and
        # 'age:47.6' both parse.
        return float(value.split(':', 1)[1])
    except ValueError:
        # Non-numeric payload such as 'n/a' or an empty string.
        return None

def convert_gender(value):
    """Convert a 'gender' characteristic string into a binary code.

    The input is expected to look like ``'<key>: <label>'``. 'female' maps to
    0 and 'male' maps to 1; matching ignores case and surrounding whitespace.

    Args:
        value: Raw cell from the sample characteristics table.

    Returns:
        0 for female, 1 for male; None when ``value`` is not a string, has no
        ``':'`` separator, or carries any other label.
    """
    # Non-string cells (e.g. None / NaN) and strings without a key/value
    # separator carry no usable label.
    if not isinstance(value, str) or ':' not in value:
        return None

    # Normalise so 'Male', 'FEMALE ' and 'gender:male' are all recognised.
    label = value.split(':', 1)[1].strip().lower()

    if label == 'female':
        return 0
    if label == 'male':
        return 1
    return None

# Save cohort information: record for this series whether gene expression data
# and a trait row were found.
save_cohort_info('GSE133910', './preprocessed/Physical_Exercise_Response/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction: build the trait/age/gender table from the chosen
# rows using the converter functions defined above.
selected_clinical_data = geo_select_clinical_features(clinical_data, 'Physical_Exercise_Response', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
csv_path = './preprocessed/Physical_Exercise_Response/trait_data/GSE133910.csv'
# Ensure the output directory exists before writing. `selected_clinical_data`
# is presumably a pandas DataFrame, whose to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM3929584': [0.0, 47.6, 0.0], 'GSM3929585': [1.0, 47.79, 0.0], 'GSM3929586': [0.0, 47.95, 0.0], 'GSM3929587': [1.0, 48.2, 0.0], 'GSM3929588': [0.0, 30.15, 1.0], 'GSM3929589': [1.0, 30.34, 1.0], 'GSM3929590': [0.0, 30.49, 1.0], 'GSM3929591': [1.0, 30.77, 1.0], 'GSM3929592': [0.0, 53.73, 0.0], 'GSM3929593': [1.0, 53.95, 0.0], 'GSM3929594': [0.0, 54.13, 0.0], 'GSM3929595': [1.0, 54.34, 0.0], 'GSM3929596': [0.0, 47.24, 0.0], 'GSM3929597': [1.0, 47.44, 0.0], 'GSM3929598': [0.0, 47.71, 0.0], 'GSM3929599': [1.0, 47.96, 0.0], 'GSM3929600': [0.0, 43.11, 1.0], 'GSM3929601': [1.0, 43.33, 1.0], 'GSM3929602': [0.0, 43.58, 1.0], 'GSM3929603': [1.0, 43.88, 1.0], 'GSM3929604': [0.0, 45.64, 1.0], 'GSM3929605': [1.0, 45.83, 1.0], 'GSM3929606': [0.0, 46.01, 1.0], 'GSM3929607': [0.0, 36.18, 1.0], 'GSM3929608': [1.0, 36.43, 1.0], 'GSM3929609': [0.0, 36.62, 1.0], 'GSM3929610': [1.0, 36.87, 1.0], 'GSM3929611': [0.0, 56.96, 1.0], 'GSM3929612': [1.0, 57.15, 1.0], 'GSM3929613': [0.0, 57.36, 1.0], 'GSM3929614