In [1]:

import os
import sys
# Extend the module search path with a machine-specific project directory.
# NOTE(review): presumably this is where the `utils` package imported in the
# next cell lives — confirm, and consider making the path configurable.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's input files (SOFT file + series matrix file) ---
cohort_dir = '/media/techt/DATA/GEO/Amyotrophic_Lateral_Sclerosis/GSE118336'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Pull series-level background lines and per-sample characteristics from the matrix file ---
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the clinical dataframe as a dictionary of unique values per row ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show both pieces of information so they can be inspected by eye ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"HTA2.0 (human transcriptome array) analysis of control iPSC-derived motor neurons (MN), FUS-H517D-hetero-iPSC-MN, and FUS-H517D-homo-iPSC-MNs"
!Series_summary	"To assess  RNA regulation in the MN possessing mutated FUS-H517D gene."
!Series_summary	"Fused in sarcoma/translated in liposarcoma (FUS) is a causative gene of familial amyotrophic lateral sclerosis (fALS). Mutated FUS causes accumulation of DNA damage stress and stress granule (SG) formation, etc., thereby motor neuron (MN) death. However, key molecular etiology of mutated FUS-dependent fALS (fALS-FUS) remains unclear. Here, Bayesian gene regulatory networks (GRN) calculated by Super-Computer with transcriptome data sets of induced pluripotent stem cell (iPSC)-derived MNs possessing mutated FUSH517D (FUSH517D MNs) and FUSWT identified TIMELESS, PRKDC and miR-125b-5p as ""hub genes"" which influence fALS-FUS GRNs. miR-125b-5p expression up-regulated in FUSH517D MNs, showed opposite correla

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is assumed available until the checks below say otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Determine the availability of gene expression data.
# The check must look at the background text loaded in the previous cell; comparing
# against the literal string "!Series_title" can never match. The title spells the
# platform as "HTA2.0 (human transcriptome array)", so compare case-insensitively.
# NOTE(review): background_info is assumed to be text (it is printed directly in the
# previous cell); str() keeps the check safe if it is some other printable object.
if "human transcriptome array" in str(background_info).lower():
    is_gene_available = True

# Sample Characteristics Dictionary for the current dataset
# (hand-copied from the dictionary printed by the previous cell).
sample_characteristics = {
    0: ['cell type: iPSC-MN'],
    1: ['genotype: FUSWT/WT', 'genotype: FUSWT/H517D', 'genotype: FUSH517D/H517D'],
    2: ['time (differentiation from motor neuron precursor): 2 weeks', 'time (differentiation from motor neuron precursor): 4 weeks']
}

# Amyotrophic_Lateral_Sclerosis (trait) variable: the genotype row (row 1) is only
# usable as a trait when it contains more than one distinct value.
if len(set(sample_characteristics[1])) > 1:
    trait_row = 1

# No info about age and gender in the sample characteristics, so age_row and
# gender_row keep the None default assigned above.

# Conversion functions
def convert_trait(value):
    """Convert a genotype characteristic string into a binary trait label.

    The text after the last ':' is stripped of surrounding whitespace and
    compared against the known genotypes.

    Args:
        value: A sample-characteristics cell such as 'genotype: FUSWT/WT'.

    Returns:
        0 for the wild-type genotype 'FUSWT/WT', 1 for the mutant genotypes
        'FUSWT/H517D' and 'FUSH517D/H517D', and None for anything else,
        including non-string input such as None or NaN.
    """
    # Missing cells can arrive as None/NaN rather than text; treat them as unknown
    # instead of failing on .split().
    if not isinstance(value, str):
        return None

    genotype = value.split(':')[-1].strip()
    if genotype == 'FUSWT/WT':
        return 0
    if genotype in ('FUSWT/H517D', 'FUSH517D/H517D'):
        return 1
    return None

def convert_age(value):
    """Placeholder age converter: always returns None, whatever `value` is."""
    return None

def convert_gender(value):
    """Placeholder gender converter: always returns None, whatever `value` is."""
    return None

# Save metadata: record whether this cohort has gene expression data and a usable trait row.
save_cohort_info('GSE118336', './preprocessed/Amyotrophic_Lateral_Sclerosis/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Amyotrophic_Lateral_Sclerosis',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Amyotrophic_Lateral_Sclerosis/trait_data/GSE118336.csv'
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it is already there).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3325490': [0], 'GSM3325491': [0], 'GSM3325492': [0], 'GSM3325493': [0], 'GSM3325494': [0], 'GSM3325495': [0], 'GSM3325496': [0], 'GSM3325497': [0], 'GSM3325498': [0], 'GSM3325499': [1], 'GSM3325500': [1], 'GSM3325501': [1], 'GSM3325502': [1], 'GSM3325503': [1], 'GSM3325504': [1], 'GSM3325505': [1], 'GSM3325506': [1], 'GSM3325507': [1], 'GSM3325508': [1], 'GSM3325509': [1], 'GSM3325510': [1], 'GSM3325511': [1], 'GSM3325512': [1], 'GSM3325513': [1], 'GSM3325514': [1], 'GSM3325515': [1], 'GSM3325516': [1], 'GSM3325517': [1], 'GSM3325518': [1], 'GSM3325519': [1], 'GSM3325520': [0], 'GSM3325521': [0], 'GSM3325522': [0], 'GSM3325523': [0], 'GSM3325524': [0], 'GSM3325525': [0], 'GSM3325526': [0], 'GSM3325527': [0], 'GSM3325528': [0], 'GSM3325529': [1], 'GSM3325530': [1], 'GSM3325531': [1], 'GSM3325532': [1], 'GSM3325533': [1], 'GSM3325534': [1], 'GSM3325535': [1], 'GSM3325536': [1], 'GSM3325537': [1], 'GSM3325538': [1], 'GSM3325539': [1], 'GSM3325540': [1], 'GSM3325541': [1], 'GSM3325542