In [1]:

import sys

# Make the project checkout importable from this notebook (presumably where the
# `utils` package used by the next cell lives — confirm).
# NOTE(review): this is an absolute, machine-specific path; consider making it configurable.
PROJECT_ROOT = '/home/techt/Desktop/a4s'

# Guarded so that re-running this cell does not keep appending duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *

# Locate the SOFT file and the series matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Physical_Exercise_Response/GSE148152'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull the series-level background lines and the per-sample characteristic lines
# out of the matrix file, selected by their line prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information; each heading is followed by its payload on a new line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Muscle transcriptome response to 84-day bed rest with and without resistance exercise in men: the search for the residual signature of muscle atrophy"
!Series_summary	"The present study concerns the effects of 90 days of sustained recumbence (bedrest) on skeletal muscle gene expression with and without exercise countermeasures."
!Series_overall_design	"Twenty-one healthy men (age range 26-41 yr) were randomized to performed 90-d bed rest with (BRE; n=9) or without (BR; n=12) concurrent iso-inertial resistance exercise targeting the quadriceps muscle group (i.e. supine squat; 4 sets of 7 maximal concentric-eccentric repetitions every third day) employing flywheel technology. Muscle biopsies from m. vastus lateralis were obtained from all subjects before and after 84 days of bed rest."
Sample Characteristics Dictionary:
{0: ['tissue: Vastus lateralis'], 1: ['gender: Male'], 2: ['subject: A1', 'subject: B1', 'subject: C1', 'subject: D1', 'subject: E1

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: no usable clinical row and no converter until established below.
trait_row = age_row = gender_row = None
convert_trait = convert_age = convert_gender = None

# The series summary describes "skeletal muscle gene expression", so expression data is present.
is_gene_available = True

# Unique values of each sample-characteristics row, as printed in Step 1.
sample_characteristics = {
    0: ['tissue: Vastus lateralis'],
    1: ['gender: Male'],
    2: [
        'subject: A1', 'subject: B1', 'subject: C1', 'subject: D1', 'subject: E1',
        'subject: F1', 'subject: G1', 'subject: H1', 'subject: I1', 'subject: J1',
        'subject: A2', 'subject: B2', 'subject: C2', 'subject: D2', 'subject: E2',
        'subject: F2', 'subject: G2', 'subject: H2', 'subject: I2', 'subject: J2',
        'subject: K2',
    ],
    3: ['group: Bed rest + exercise', 'group: Bed rest only'],
    4: ['time: Pre', 'time: Post'],
}

# A row is only selected when it takes more than one distinct value;
# otherwise the corresponding *_row stays None. No age row exists, so age_row stays None.
gender_row = 1 if len(set(sample_characteristics[1])) > 1 else None
trait_row = 3 if len(set(sample_characteristics[3])) > 1 else None

# Define conversion functions
def convert_trait(value):
    """Convert a 'group: ...' sample-characteristics entry to a binary trait label.

    Args:
        value: Raw cell value, expected to look like 'group: Bed rest + exercise'.

    Returns:
        1 if the text after the first colon mentions "exercise",
        0 if it mentions "bed rest only",
        None for non-string input, input without a colon, or any other label.
    """
    if not isinstance(value, str) or ":" not in value:
        return None
    # Split on the first colon only and strip, so that missing/extra spaces around
    # the separator or a second colon inside the label do not lose the value.
    label = value.split(":", 1)[1].strip().lower()
    if "exercise" in label:
        return 1
    if "bed rest only" in label:
        return 0
    return None

def convert_age(value):
    """Placeholder age converter.

    The sample characteristics dictionary has no age row, so there is nothing
    to parse: every input yields None.
    """
    return None

def convert_gender(value):
    """Convert a 'gender: ...' sample-characteristics entry to a binary code.

    Args:
        value: Raw cell value, expected to look like 'gender: Male'.

    Returns:
        1 for male, 0 for female (case-insensitive, surrounding whitespace ignored),
        None for non-string input, input without a colon, or any other label.
    """
    if not isinstance(value, str) or ":" not in value:
        return None
    # Split on the first colon only and strip, so 'gender:Male' and 'gender: Male '
    # are recognised as well as the canonical 'gender: Male'.
    label = value.split(":", 1)[1].strip().lower()
    return {"male": 1, "female": 0}.get(label)

import os

# Save cohort information: cohort id, output JSON path, whether gene data is
# available, and whether a trait row was identified.
save_cohort_info('GSE148152', './preprocessed/Physical_Exercise_Response/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only possible when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Physical_Exercise_Response', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Physical_Exercise_Response/trait_data/GSE148152.csv'
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing (no-op when it already does).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4455096': [1], 'GSM4455097': [1], 'GSM4455098': [1], 'GSM4455099': [1], 'GSM4455100': [1], 'GSM4455101': [1], 'GSM4455102': [1], 'GSM4455103': [1], 'GSM4455104': [0], 'GSM4455105': [0], 'GSM4455106': [0], 'GSM4455107': [0], 'GSM4455108': [0], 'GSM4455109': [0], 'GSM4455110': [0], 'GSM4455111': [0], 'GSM4455112': [0], 'GSM4455113': [0], 'GSM4455114': [0], 'GSM4455115': [0], 'GSM4455116': [0], 'GSM4455117': [0], 'GSM4455118': [0], 'GSM4455119': [0], 'GSM4455120': [0], 'GSM4455121': [0], 'GSM4455122': [1], 'GSM4455123': [1], 'GSM4455124': [1], 'GSM4455125': [1], 'GSM4455126': [1], 'GSM4455127': [0], 'GSM4455128': [0], 'GSM4455129': [0], 'GSM4455130': [0], 'GSM4455131': [0], 'GSM4455132': [1], 'GSM4455133': [1], 'GSM4455134': [1], 'GSM4455135': [1], 'GSM4455136': [1]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
#    `matrix_file` is the path obtained from geo_get_relevant_filepaths in Step 1.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
#    These ids are what the next step inspects to decide whether gene mapping is required.
print(gene_data.index[:20])


Index(['2824546_st', '2824549_st', '2824551_st', '2824554_st', '2827992_st',
       '2827995_st', '2827996_st', '2828010_st', '2828012_st', '2835442_st',
       '2835447_st', '2835453_st', '2835456_st', '2835459_st', '2835461_st',
       '2839509_st', '2839511_st', '2839513_st', '2839515_st', '2839517_st'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 (e.g. '2824546_st') look like array probe-set
# identifiers rather than gene symbols, so a probe-to-gene mapping step is needed.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
#    `soft_file` is the path obtained from geo_get_relevant_filepaths in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
#    The preview is used to pick the identifier and gene-symbol columns for the mapping step.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'probeset_id': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['11869', '29554', '69091', '160446', '317811'], 'stop': ['14409', '31109', '70008', '161525', '328581'], 'total_probes': [49.0, 60.0, 30.0, 30.0, 191.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102', 'ENST00000408384 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000408384 // MIR1302-10 // microRNA 1302-10 // --- // 100422834 /// ENST0000040838

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the keys for identifiers and gene symbols
#    'ID' is the identifier column of the annotation preview; 'gene_assignment' is the
#    column whose entries embed gene symbols (e.g. 'NR_046018 // DDX11L1 // ...').
# NOTE(review): the annotation preview only shows 'TC...hg.1' ids while the expression
# index shows '..._st' ids — confirm the 'ID' column also covers the '_st' probes.
identifier_key = 'ID'
gene_symbol_key = 'gene_assignment'

# 2. Get the dataframe storing the mapping between probe IDs and genes using the 'get_gene_mapping' function from the library
gene_mapping_df = get_gene_mapping(gene_annotation, prob_col=identifier_key, gene_col=gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library, and name the resulting gene expression dataframe "gene_data"
# NOTE(review): this rebinds `gene_data`, so re-running this cell alone would feed the
# already-mapped frame back into apply_gene_mapping; re-run Step 3 first.
gene_data = apply_gene_mapping(gene_data, gene_mapping_df)

# Preview the mapped expression dataframe.
print(preview_df(gene_data))


{'GSM4455096': [6.063022703273496, 6.621034294053265, 3.2, 5.32, 3.92], 'GSM4455097': [6.06522439281943, 6.620189103733431, 3.03, 4.98, 3.66], 'GSM4455098': [6.075287750791975, 6.650562446795573, 3.05, 5.48, 3.74], 'GSM4455099': [6.06311510031679, 6.664753739511127, 3.17, 5.82, 3.58], 'GSM4455100': [6.043619324181626, 6.616556609509911, 3.24, 5.41, 3.31], 'GSM4455101': [6.059743928194298, 6.639858324212574, 2.99, 5.29, 4.19], 'GSM4455102': [6.06941393875396, 6.619971421622279, 3.27, 5.09, 3.87], 'GSM4455103': [6.068651003167899, 6.6219640034050835, 3.19, 5.37, 3.73], 'GSM4455104': [6.088423970432946, 6.649995743645872, 3.06, 5.03, 3.76], 'GSM4455105': [6.070681098204858, 6.640168430013378, 3.26, 4.74, 3.68], 'GSM4455106': [6.07797782470961, 6.632207831691597, 2.99, 5.45, 3.6], 'GSM4455107': [6.06596620908131, 6.651308524869269, 3.15, 5.21, 3.57], 'GSM4455108': [6.06363252375924, 6.60067067980056, 3.2, 4.87, 3.92], 'GSM4455109': [6.086346356916578, 6.617088653775994, 3.09, 4.83, 3.63], 