In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Multiple_sclerosis/GSE215450'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background lines and the per-sample characteristics out of the matrix file.
#    The prefix lists name the matrix-file line prefixes handed to the helper for each group.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both artefacts so the row indices used in later cells can be checked by eye
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Circulating microRNAs in relapsing MS patients treated with dimethyl fumarate in the phase 4 TREMEND trial"
!Series_summary	"Dimethyl fumarate (DMF) is an oral drug approved for relapsing multiple sclerosis (MS) that leads to reduction of neurofilament light (NFL).  This may be related to dynamics and persistence of microRNA signatures in the peripheral blood of treatment-naïve MS patients before and after dimethyl fumarate (DMF) at different time points. 210 blood samples were collected from 51 treatment-naïve patients at baseline (BL) and after 1-3, 4-7, 9-15 and 21-27 months of DMF and from 22 controls from the phase IV TREMEND trial. Using microarray, 1,085 miRNAs were two-folds above the background and compared versus NFL. Altered miRNA profiles peaked after 4-7 months. MiR-16-5p and miR-4306, involved in the NF-kB-pathway, were upregulated in low NFL samples, while miR-940 and miR-4665-3p were upregulated in high NFL samples. NFL and miRNA c

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Step 1: Determine if gene expression data is available
# The series title describes circulating microRNA profiling, so this cohort is
# recorded as having no gene expression data.
is_gene_available = False

# Placeholders: row indices and converter functions stay None unless the
# corresponding variable is found in the sample characteristics below.
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Step 2: Variable Availability and Data Type Conversion

# 'Multiple_sclerosis' trait
# Disease status is not stated directly; it is inferred from the "timepoint" key,
# which distinguishes control samples from patient samples.
trait_row = 1  # row holding the timepoint key

def convert_trait(value):
    """
    Map a "key: value" timepoint entry to a binary Multiple_sclerosis label.

    The text between the first and second colon is stripped of surrounding
    whitespace and compared, case-sensitively, with "Control".

    Returns:
        0 for "Control", 1 for any other timepoint (Baseline, Early,
        Intermediate, etc.), or None when the entry cannot be split into
        a key and a value (for example, a non-string or a string without ':').
    """
    try:
        fields = value.split(':')
        timepoint = fields[1].strip()
    except Exception:
        return None
    return int(timepoint != "Control")

# 'age' variable
# Age information is explicitly available in the key indexed by 2
age_row = 2

def convert_age(value):
    """
    Convert a "key: value" age entry to an integer.

    The text between the first and second colon is stripped and parsed with int().

    Returns:
        The age as an int, or None when the entry cannot be converted:
        the value is not a string (e.g. None or NaN), it contains no ':',
        or the text after the colon is not an integer literal.
    """
    try:
        return int(value.split(':')[1].strip())
    # AttributeError/TypeError: value is not a str; IndexError: no ':' separator;
    # ValueError: non-integer text. All mean "age unavailable", matching how the
    # trait and gender converters treat malformed entries.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

# 'gender' variable
# Gender information is explicitly available in the key indexed by 3
gender_row = 3

def convert_gender(value):
    """
    Map a "key: value" gender entry to a binary code.

    The text between the first and second colon is stripped and lower-cased
    before lookup, so the match is case-insensitive.

    Returns:
        0 for female, 1 for male, or None for any other label or for an
        entry that cannot be split into a key and a value.
    """
    try:
        label = value.split(':')[1].strip().lower()
    except Exception:
        return None
    # Labels missing from the table fall through to None.
    return {"female": 0, "male": 1}.get(label)
        
# Step 3: Save Metadata
# Pass the cohort ID, the cohort-info JSON path, and the two availability flags
# (gene expression data, trait data) to the project helper.
save_cohort_info('GSE215450', './preprocessed/Multiple_sclerosis/cohort_info.json', is_gene_available, trait_row is not None)

# Step 4: Clinical Feature Extraction (only if trait_row is not None)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Multiple_sclerosis',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Multiple_sclerosis/trait_data/GSE215450.csv'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the output folder exists before writing (no-op when it is already there).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6668434': [0, 51, 0], 'GSM6668435': [0, 29, 0], 'GSM6668436': [0, 35, 0], 'GSM6668437': [0, 31, 1], 'GSM6668438': [0, 43, 0], 'GSM6668439': [0, 30, 1], 'GSM6668440': [0, 22, 0], 'GSM6668441': [0, 30, 0], 'GSM6668442': [0, 23, 0], 'GSM6668443': [0, 40, 0], 'GSM6668444': [0, 27, 0], 'GSM6668445': [0, 44, 0], 'GSM6668446': [0, 31, 0], 'GSM6668447': [0, 27, 1], 'GSM6668448': [0, 35, 0], 'GSM6668449': [0, 49, 0], 'GSM6668450': [0, 41, 0], 'GSM6668451': [0, 46, 1], 'GSM6668452': [0, 48, 0], 'GSM6668453': [0, 48, 1], 'GSM6668454': [0, 19, 1], 'GSM6668455': [0, 40, 1], 'GSM6668456': [1, 43, 1], 'GSM6668457': [1, 43, 1], 'GSM6668458': [1, 43, 1], 'GSM6668459': [1, 20, 0], 'GSM6668460': [1, 20, 0], 'GSM6668461': [1, 30, 0], 'GSM6668462': [1, 30, 0], 'GSM6668463': [1, 24, 0], 'GSM6668464': [1, 24, 0], 'GSM6668465': [1, 24, 0], 'GSM6668466': [1, 24, 0], 'GSM6668467': [1, 31, 0], 'GSM6668468': [1, 31, 0], 'GSM6668469': [1, 31, 0], 'GSM6668470': [1, 49, 0], 'GSM6668471': [1, 49, 0], 'GSM6668472