In [1]:

import sys

# Add the project checkout to the module search path (presumably where the `utils`
# package imported by the next cell lives -- confirm). The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate entries to sys.path.
PROJECT_ROOT = '/home/techt/Desktop/a4s'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Duchenne_Muscular_Dystrophy/GSE109178'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Split the matrix file into series-level background text and per-sample clinical
# rows; both are selected by the prefixes listed here.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the unique values found in each row of the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces so the rows holding trait / age / gender can be picked by eye.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Asynchronous remodeling is a driver of failed regeneration in Duchenne muscular dystrophy"
!Series_summary	"49 human patient mRNA profiles was generated using HG-U133 Plus 2.0 microarrays. Procesed in Affymetrix Expression console using Plier normalization method and later processed in Partek Genomics Suite. The clustering figure was generated using HCE clustering software."
!Series_summary	"We sought to determine the mechanisms underlying failure of muscle regeneration that is observed in dystrophic muscle through hypothesis generation using muscle profiling data (human dystrophy and murine regeneration). We found that transforming growth factor β-centered networks strongly associated with pathological fibrosis and failed regeneration were also induced during normal regeneration but at distinct time points. We hypothesized that asynchronously regenerating microenvironments are an underlying driver of fibrosis and failed regeneration. We validated

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: the series summary describes mRNA profiles generated on
# HG-U133 Plus 2.0 microarrays, so this cohort is treated as having gene expression data.
is_gene_available = True

# Row keys into the sample characteristics dictionary for each clinical variable.
# None means the variable is not recorded for this cohort.
trait_row = None  # No explicit field matches Duchenne_Muscular_Dystrophy
age_row = 0  # Age information is available in row 0
gender_row = 3  # Gender information is available in row 3

def convert_trait(value):
    """Placeholder trait converter for a cohort with no usable trait field.

    Args:
        value: Raw sample-characteristics entry; ignored.

    Returns:
        Always None.
    """
    return None

def convert_age(value):
    """Parse an age out of a ``"<label>: <number>"`` sample-characteristics entry.

    The text after the last ``:`` is stripped and converted to ``float``.

    Args:
        value: Raw entry, e.g. ``"age: 12"``. Non-string inputs are treated as
            missing.

    Returns:
        The age as a float, or None when the entry is not a string, is a
        missing-value marker (empty, ``NA``, ``N/A``, ``NaN`` in any letter case),
        is not numeric, or parses to a non-finite number.
    """
    import math  # local import so this cell does not depend on another cell's imports

    if not isinstance(value, str):
        return None

    age_str = value.split(':')[-1].strip()
    # Compare case-insensitively: the original only recognised the exact token 'NA'.
    if age_str.upper() in ('', 'NA', 'N/A', 'NAN'):
        return None

    try:
        age = float(age_str)
    except ValueError:
        return None

    # float() happily parses 'inf' / 'nan'; neither is a meaningful age.
    return age if math.isfinite(age) else None

def convert_gender(value):
    """Map a ``"<label>: <sex>"`` sample-characteristics entry to a binary code.

    The text after the last ``:`` is stripped and upper-cased before lookup.

    Args:
        value: Raw entry, e.g. ``"gender: M"``.

    Returns:
        1 for ``M`` / ``MALE``, 0 for ``F`` / ``FEMALE``, and None for any other
        label or when the entry cannot be processed at all.
    """
    codes = {'M': 1, 'MALE': 1, 'F': 0, 'FEMALE': 0}
    try:
        label = value.split(':')[-1].strip().upper()
        return codes.get(label)
    except Exception:
        # Best-effort by design: anything unparseable counts as missing.
        return None

# Record this cohort's availability flags: gene expression data, and whether a
# trait row was identified (trait_row is not None).
save_cohort_info(
    'GSE109178',
    './preprocessed/Duchenne_Muscular_Dystrophy/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)
