In [1]:

import sys

# Directory added to the module search path so that later imports in this
# notebook can resolve (presumably where the `utils` package lives — confirm).
PROJECT_ROOT = '/home/techt/Desktop/a4s'

# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Mitochondrial_Disorders/GSE22651'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the reader: the first list selects series-level
# background lines, the second selects per-sample accession/characteristics lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Derive the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces, each under its own heading line, for manual inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Friedreich’s Ataxia Induced Pluripotent Stem Cells Recapitulate GAA•TTC Triplet-Repeat Instability"
!Series_summary	"The inherited neurodegenerative disease Friedreich’s ataxia (FRDA) is caused by hyperexpansion of GAA•TTC trinucleotide repeats within the first intron of the FXN gene, encoding the mitochondrial protein frataxin. Long GAA•TTC repeats causes heterochromatin-mediated silencing and loss of frataxin in affected individuals. We report the derivation of induced pluripotent stem cells (iPSCs) from FRDA patient fibroblasts through retroviral transduction of transcription factors. FXN gene repression is maintained in the iPSCs, as are the mRNA and miRNA global expression signatures reflecting the human disease. GAA•TTC repeats uniquely in FXN in the iPSCs exhibit repeat instability similar to patient families, where they expand and/or contract with discrete changes in length between generations. The mismatch repair enzyme Msh2, implicated i

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
def first_row_with_field(characteristics, field):
    """Return the key of the first row holding a ``'<field>: ...'`` entry.

    Args:
        characteristics: Mapping of row key -> list of characteristic entries.
            Entries that are not strings (e.g. None) are skipped.
        field: Field name to look for, e.g. ``'age'``. Compared
            case-insensitively against the text before the first colon.

    Returns:
        The first row key (in mapping order) containing a matching entry, or
        None when no row matches.
    """
    wanted = field.strip().lower()
    for row_key, row_values in characteristics.items():
        for entry in row_values:
            if not isinstance(entry, str):
                continue
            # Match the whole field name before the colon; a plain substring
            # test would also hit unrelated fields such as 'passage' or 'stage'.
            name, colon, _ = entry.partition(':')
            if colon and name.strip().lower() == wanted:
                return row_key
    return None


# 1. Gene Expression Data Availability
# The phrase "global expression signatures" and mention of "Illumina HT12 v3 chips"
# suggest this dataset contains gene expression data.
is_gene_available = True

# Placeholders; the conversion functions are defined further down in this cell.
convert_trait = convert_age = convert_gender = None

# 2. Variable Availability and Data Type Conversion

# Hand-written copy of the sample characteristics.
# NOTE(review): this literal is not derived from `sample_characteristics_dict`
# computed in the previous cell — confirm it still matches the real data.
# NOTE(review): 'gender: female' is listed under key 1 while 'gender: male' is
# under key 0, so the single row picked below may not cover every sample.
sample_characteristics = {
    0: ['gender: male', 'age: 47 years', 'cell line: Human embryonic stem cell line BG01', 'cell line: Human embryonic stem cell line BG02', 'cell line: Human embryonic stem cell line BG03', 'cell line: Human induced pluripotent stem cell line ES4CL2', 'cell line: Human induced pluripotent stem cell line Gottesfeld_3816.5_1', 'cell line: Human induced pluripotent stem cell line Gottesfeld_3816.5_2', 'cell line: Human induced pluripotent stem cell line Gottesfeld_4078.1A2_1', 'cell line: Human induced pluripotent stem cell line Gottesfeld_4078.1A2_2', 'cell line: Human induced pluripotent stem cell line Gottesfeld_4078.1B3_1', 'cell line: Human induced pluripotent stem cell line Gottesfeld_4078.1B3_2', 'cell line: Human induced pluripotent stem cell line Gottesfeld_8.2A4R_1', 'cell line: Human induced pluripotent stem cell line Gottesfeld_8.2A4R_2', 'cell line: Human embryonic stem cell line H9', 'cell line: Human dermal fibroblast line HDF_A', 'cell line: Human dermal fibroblast line HDF_B', 'cell line: Human embryonic stem cell line HES-2_A', 'cell line: Human embryonic stem cell line HES-2_B', 'cell line: Human induced pluripotent stem cell line hFib2-Ips5_A', 'cell line: Human induced pluripotent stem cell line hFib2-Ips5_B', 'cell type: Human Mesenchymal_Stem_Cells_adipose HMSC-ad', 'cell type: Human Mesenchymal_Stem_Cells_bone_marrow HMSC-bm', 'cell line: Primary cell line (Human foreskin fibroblasts) HS27_A', 'cell line: Primary cell line (Human foreskin fibroblasts) HS27_B', 'cell line: Human embryonic stem cell line HSF6_A', 'cell line: Human embryonic stem cell line HSF6_B', 'cell line: Primary cell line human keratinocytes HumanKeratinocytes_A', 'cell line: Primary cell line human keratinocytes HumanKeratinocytes_B', 'cell line: Human Umbilical Vein Endothelial Cell Line  HUVEC-BF4'],
    1: ['tissue: Adipose tissue from patient 1', 'gender: female', 'tissue: Adrenal tissue from patient 1', None, 'tissue: Bladder tissue from patient 1', 'tissue: Lung tissue from Patient 1', 'tissue: Ureter tissue from Patient 1'],
    2: [None, 'tissue: Adipose tissue from patient 2', 'tissue: Adrenal tissue from patient 2', 'tissue: Bladder tissue from patient 2', 'tissue: Lung tissue from Patient 2', 'tissue: Ureter tissue from Patient 2']
}

# 'Mitochondrial_Disorders' variable assumed not explicitly mentioned.
trait_row = None

# First row carrying an 'age: ...' / 'gender: ...' entry (None when absent).
age_row = first_row_with_field(sample_characteristics, 'age')
gender_row = first_row_with_field(sample_characteristics, 'gender')

# 2.3 Define Data Type Conversion Functions

def convert_trait(value):
    """Placeholder trait converter: always returns None.

    No specific details are available for Mitochondrial_Disorders, so the
    input ``value`` is ignored.
    """
    return None

def convert_age(value):
    """Parse an ``'age: <number> years'`` characteristic into a float.

    Args:
        value: Characteristic entry, expected to look like ``'age: 47 years'``.

    Returns:
        The age as a float, or None when ``value`` is not a string, lacks the
        ``': '`` separator, or its payload is not numeric.
    """
    try:
        return float(value.split(': ')[1].replace(' years', ''))
    # Only failures caused by malformed input mean "unknown age"; anything
    # else (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Map a ``'gender: <label>'`` characteristic to a binary code.

    Args:
        value: Characteristic entry, expected to look like ``'gender: male'``.

    Returns:
        1 for male, 0 for female (case-insensitive, surrounding whitespace
        ignored), or None for any other label or malformed/non-string input.
    """
    try:
        label = value.split(': ')[1].strip().lower()
    # Only failures caused by malformed input mean "unknown gender"; anything
    # else (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    except (AttributeError, IndexError, TypeError):
        return None
    return {'male': 1, 'female': 0}.get(label)

# Hand the cohort id, the JSON path and the two availability flags
# (gene data present, trait row identified) to save_cohort_info.
save_cohort_info(
    'GSE22651',
    './preprocessed/Mitochondrial_Disorders/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# The clinical table is only selected, written and previewed when a trait
# row was identified above.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Mitochondrial_Disorders',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Mitochondrial_Disorders/trait_data/GSE22651.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
