In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series matrix file for this cohort
cohort_dir = '/media/techt/DATA/GEO/Pheochromocytoma_and_Paraganglioma/GSE39716'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Parse the matrix file using two groups of line prefixes: the '!Series_*' group
#    for the background text and the '!Sample_*' group for the clinical dataframe
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Collect the unique values seen in each row of the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the per-row unique values, each under its own header line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Expression data from pheochromocytoma (PHEO) and paraganglioma (PGL) tumor samples"
!Series_summary	"Genotype specific differences in expression profiles have been evaluated using human HuGene1.0-ST Gene Chips. In this dataset we include expression data obtained from 8 normal adrenal medulla and 45 PHEOs/PGLs patient samples."
!Series_summary	"Viable appearing tissue from the center of the lesions was collected and snap frozen for RNA extraction. Each of the 45 PHEO/PGL samples was examined by pathologist upon resection. Patients PKh_27 and PKh_28 with SDHB mutation were from the same patient with samples taken from two different locations at different times. Diagnosis of PHEO/PGL has been confired in all cases histopathologically. The tissues were grouped according to genetic/syndromic background and tumor location into SDHB (n = 18), SDHD-A/T (n = 6), SDHD-HN (n= 8), and VHL (n = 13)."
!Series_overall_design	"Microarray analysis was performed on

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Placeholders for the value converters; the `def` statements later in this cell rebind them.
convert_trait = convert_age = convert_gender = None

# Gene expression data is treated as available for this cohort
# (the series summary printed in Step 1 describes HuGene1.0-ST Gene Chip expression profiles).
is_gene_available = True

# Hand-copied snapshot of the unique values per sample-characteristics row.
# NOTE(review): no visible cell reads this dict; it looks like a pasted copy of
# `sample_characteristics_dict` kept for reference — confirm before removing.
sample_characteristics = {
    0: ['age(yrs): 61', 'age(yrs): --', 'age(yrs): 53', 'age(yrs): 72', 'age(yrs): 65', 'age(yrs): 56', 'age(yrs): 62', 'age(yrs): 28.5', 'age(yrs): 24', 'age(yrs): 46', 'age(yrs): 9', 'age(yrs): 22.8', 'age(yrs): 26.6', 'age(yrs): 38.2', 'age(yrs): 45.6', 'age(yrs): 36', 'age(yrs): 36.2', 'age(yrs): 52', 'age(yrs): 12', 'age(yrs): 31', 'age(yrs): 55', 'age(yrs): 35', 'age(yrs): 34.8', 'age(yrs): 17', 'age(yrs): 30', 'age(yrs): 16', 'age(yrs): 61.4', 'age(yrs): 31.8', 'age(yrs): 32.6', 'age(yrs): 49.3'],
    1: ['gender: male', 'gender: --', 'gender: female'],
    2: ['tissue type: Normal', 'tissue type: paraganglioma (PGL) tumor', 'tissue type: pheochromocytoma (PHEO) tumor'],
    3: ['mutation: none', 'mutation: SDHB', 'mutation: SDHD', 'mutation: VHL'],
    4: ['tissue type: adrenal_medulla', 'tissue type: malignant_metastases', 'tissue type: extraadrenal_primary_malignant', 'tissue type: adrenal_primary_benign', 'tissue type: extraadrenal_multiple_benign', 'tissue type: extraadrenal_primary_benign', 'tissue type: adrenal_multiple_benign', 'tissue type: adrenal', 'tissue type: extraadrenal', 'tissue type: Head and neck', 'tissue type: bilateral_adrenal_benign']
}

# Row keys of the characteristics used for each clinical variable:
# trait -> row 2 (normal vs. PGL/PHEO tumor), age -> row 0, gender -> row 1.
trait_row, age_row, gender_row = 2, 0, 1

# Define conversion functions
def convert_trait(value):
    """Convert a 'tissue type: <label>' characteristic string to a binary trait.

    Args:
        value: Raw cell value, expected to look like 'tissue type: Normal'.

    Returns:
        0 for 'Normal', 1 for 'paraganglioma (PGL) tumor' or
        'pheochromocytoma (PHEO) tumor', and None for any other label,
        for non-string input (e.g. a NaN cell), or for a string without
        the ': ' separator.
    """
    # Missing cells may arrive as NaN/None rather than str; treat them as unknown
    # instead of failing on .split().
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    # No 'key: value' separator -> malformed entry, treated as unknown.
    if len(parts) < 2:
        return None
    label = parts[1]
    if label == 'Normal':
        return 0
    if label in ('paraganglioma (PGL) tumor', 'pheochromocytoma (PHEO) tumor'):
        return 1
    return None

def convert_age(value):
    """Convert an 'age(yrs): <number>' characteristic string to a float age.

    Args:
        value: Raw cell value, expected to look like 'age(yrs): 28.5'.

    Returns:
        The age as a float, or None when the value is not a string
        (e.g. a NaN cell), lacks the ': ' separator, or the text after the
        separator is not numeric (such as the '--' placeholder).
    """
    # Missing cells may arrive as NaN/None rather than str; treat them as unknown
    # instead of failing on .split().
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    # No 'key: value' separator -> malformed entry, treated as unknown.
    if len(parts) < 2:
        return None
    try:
        return float(parts[1])
    except ValueError:
        # Non-numeric placeholder such as '--'.
        return None

def convert_gender(value):
    """Convert a 'gender: <label>' characteristic string to a binary code.

    Args:
        value: Raw cell value, expected to look like 'gender: female'.

    Returns:
        0 for 'female', 1 for 'male', and None for any other label
        (such as the '--' placeholder), for non-string input (e.g. a NaN
        cell), or for a string without the ': ' separator.
    """
    # Missing cells may arrive as NaN/None rather than str; treat them as unknown
    # instead of failing on .split().
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    # No 'key: value' separator -> malformed entry, treated as unknown.
    if len(parts) < 2:
        return None
    label = parts[1]
    if label == 'female':
        return 0
    if label == 'male':
        return 1
    return None

# Save metadata: cohort id, JSON path, gene-availability flag, and whether a trait row was found
save_cohort_info('GSE39716', './preprocessed/Pheochromocytoma_and_Paraganglioma/cohort_info.json', is_gene_available, trait_row is not None)

# Extract and save clinical data (only possible when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Pheochromocytoma_and_Paraganglioma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Pheochromocytoma_and_Paraganglioma/trait_data/GSE39716.csv'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the trait_data folder exists before writing (no-op when it already does).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM978080': [0.0, 61.0, 1.0], 'GSM978081': [0, None, None], 'GSM978082': [0.0, 53.0, 0.0], 'GSM978083': [0.0, 72.0, 0.0], 'GSM978084': [0.0, 72.0, 0.0], 'GSM978085': [0.0, 65.0, nan], 'GSM978086': [0.0, 56.0, 1.0], 'GSM978087': [0.0, 62.0, 1.0], 'GSM978088': [1.0, 28.5, 1.0], 'GSM978089': [1.0, 24.0, 1.0], 'GSM978090': [1.0, 46.0, 0.0], 'GSM978091': [1.0, 9.0, 0.0], 'GSM978092': [1.0, 22.8, 1.0], 'GSM978093': [1.0, 26.6, 1.0], 'GSM978094': [1.0, 38.2, 1.0], 'GSM978095': [1.0, 45.6, 0.0], 'GSM978096': [1.0, 36.0, 0.0], 'GSM978097': [1.0, 36.2, 0.0], 'GSM978098': [1.0, 52.0, 1.0], 'GSM978099': [1.0, 12.0, 1.0], 'GSM978100': [1.0, 31.0, 1.0], 'GSM978101': [1.0, 55.0, 1.0], 'GSM978102': [1.0, 35.0, 1.0], 'GSM978103': [1.0, 34.8, 0.0], 'GSM978104': [1.0, 17.0, 1.0], 'GSM978105': [1.0, 30.0, 0.0], 'GSM978106': [1.0, 16.0, 1.0], 'GSM978107': [1.0, 31.0, 0.0], 'GSM978108': [1.0, 26.6, 0.0], 'GSM978109': [1.0, 61.4, 1.0], 'GSM978110': [1.0, 31.8, 1.0], 'GSM978111': [1.0, 32.6, 1.0], 'GSM97811

### Step 3: Gene Data Extraction

In [4]:
# 1. Build `gene_data` from the series matrix file whose path was resolved in Step 1
#    (`get_genetic_data` comes from the project library imported via `utils.preprocess`).
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs; the next step inspects them to decide whether
#    they need to be mapped to gene symbols.
print(gene_data.index[:20])


Index(['7892501', '7892502', '7892503', '7892504', '7892505', '7892506',
       '7892507', '7892508', '7892509', '7892510', '7892511', '7892512',
       '7892513', '7892514', '7892515', '7892516', '7892517', '7892518',
       '7892519', '7892520'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 ('7892501', '7892502', ...) are numeric identifiers,
# not gene symbols, so a mapping to gene symbols is flagged as required.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Read the gene annotation table from the SOFT file located in Step 1
#    (`get_gene_annotation` comes from the project library).
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a labelled preview of the annotation so the identifier and gene-symbol
#    columns can be chosen in the mapping step.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Annotation columns to use: 'ID' for the identifiers, 'gene_assignment' for the gene symbols
identifier_key, gene_symbol_key = 'ID', 'gene_assignment'

# 2. Build the dataframe that maps probe IDs to genes from the annotation table
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to the expression data.
#    NOTE: `gene_data` is rebound here, so re-running this cell without first
#    re-running Step 3 passes the already-mapped data back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, mapping_df)
