In [1]:

import os
import sys
# Extra import location added to the module search path.
# NOTE(review): presumably the project root that holds the `utils` package
# imported in the next cell - confirm. The path is machine-specific.
PROJECT_ROOT = '/home/techt/Desktop/a4s'

# Guarded so that re-running this cell does not pile up duplicate entries
# in sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Pheochromocytoma_and_Paraganglioma/GSE64957'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background information and the sample characteristics out of
#    the matrix file, selecting lines by the prefixes listed below.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show everything that was collected: each heading on its own line,
#    followed by the corresponding value.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Microarray study of human adrenal zona glomerulosa (ZG), zona fasciculata (ZF) and aldosterone-producing adenomas (APA)"
!Series_summary	"Learn about the transcriptome profiling of zona glomerulosa (ZG), zona fasciculata (ZF) and aldosterone-producing adenomas (APA) in human adrenals"
!Series_overall_design	"21 pairs of zona fasciculata (ZF) and zona glomerulosa (ZG), and 14 paired aldosterone-producing adenomas (APAs) from 14 Conn’s syndrome patients and 7 phaeochromocytoma patients were assayed on the Affymetrix Human Genome U133 Plus 2.0 Array. Laser capture microdissection was used to acquire samples of ZF, ZG and APA as previously described (Azizan EA, et al. J Clin Endocrinol Metab. 2012;97:E819-E829). For differentiation of ZG from ZF, sections were stained with cresyl violet using the LCM Staining Kit (AM1935, Ambion, USA). Data processing and analysis was performed using AffymetrixGeneChip Command Console Software and PartekGenomicSuite 6

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: no clinical row identified and no converter defined yet.
# The converter names are rebound by the `def` statements later in this cell.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Step 1: Determine if the dataset contains gene expression data
# The description mentions the use of Affymetrix Human Genome U133 Plus 2.0
# Array for transcriptome profiling, so gene expression data is available.
is_gene_available = True

# Step 2: Check the availability of the variables and identify their respective keys

# 'Pheochromocytoma_and_Paraganglioma' availability
# NOTE(review): the disease values are hard-coded here; presumably they were
# copied from row 0 of the sample characteristics dictionary printed in
# Step 1 - verify against that output.
disease_row_values = ["disease: Conn's Syndrome", 'disease: Pheochromocytoma']
disease_labels = [v.split(': ', 1)[1] for v in disease_row_values]

# The labels above already have the "disease: " prefix stripped, so the
# membership test must use the bare label. Testing for the prefixed string
# (as before) can never match and would leave trait_row as None.
trait_row = 0 if 'Pheochromocytoma' in disease_labels else None

# 'age' and 'gender' data are not explicitly noted in the sample characteristics provided
age_row = None
gender_row = None

# Step 3: Define data conversion functions
def convert_trait(value):
    """Convert a raw 'disease: <label>' characteristic into a binary trait.

    Args:
        value: Raw sample-characteristics cell, e.g.
            'disease: Pheochromocytoma'. Non-string cells (None, NaN) are
            tolerated.

    Returns:
        1 for Pheochromocytoma, 0 for Conn's Syndrome, and None when the
        value is missing, has no 'key: value' separator, or carries any
        other label.
    """
    # Missing cells may arrive as None or float NaN; neither can be parsed.
    if not isinstance(value, str):
        return None

    # Split on the first colon only, so a malformed value cannot raise.
    _, separator, label = value.partition(':')
    if not separator:
        return None

    # Compare without regard to surrounding whitespace or letter case.
    label = label.strip().casefold()
    # The series description uses the British spelling "phaeochromocytoma",
    # so both spellings are accepted.
    if label in ("pheochromocytoma", "phaeochromocytoma"):
        return 1
    if label == "conn's syndrome":
        return 0
    return None

def convert_age(value):
    """Placeholder age converter.

    No conversion logic is implemented: whatever `value` is passed in, the
    result is None.
    """
    return None

def convert_gender(value):
    """Placeholder gender converter.

    No conversion logic is implemented: whatever `value` is passed in, the
    result is None.
    """
    return None

# Step 4: Save cohort information
save_cohort_info('GSE64957', './preprocessed/Pheochromocytoma_and_Paraganglioma/cohort_info.json', is_gene_available, trait_row is not None)

# Step 5: Clinical feature extraction if trait data is available
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Pheochromocytoma_and_Paraganglioma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Pheochromocytoma_and_Paraganglioma/trait_data/GSE64957.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
