In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Pheochromocytoma_and_Paraganglioma/GSE29742'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"MIRnome Profiling of Hereditary Pheochromocytoma and Paraganglioma Reveals Specific Signatures According to Primary Mutation: Possible Implications in the Differentiation Status of Tumor Cells"
!Series_summary	"Pheochromocytoma (PCC) and paraganglioma (PGL) are rare neuroendocrine neoplasias of neural crest origin. They can be part of several syndromes, and their mRNA profile is dependent on genetic background, but questions related to clinical behavior or even main location remain unanswered. MicroRNAs are key modulators of target genes through translational repression, mRNA degradation, or both, and therefore they could resolve some of these issues. To determine the role microRNAs play in tumorigenesis and progression of PCC/PGL, as well as to identify microRNA biomarkers specifically related to different PCC/PGL genetic classes known so far, we characterized microRNA profiles in a large series of frozen tumors with germline mutations in SDHD, S

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_age = None  # define the functions when applicable

# 1. Gene Expression Data Availability
# According to the background information, only miRNA expression data is available.
is_gene_available = False

# 2. Variable Availability and Data Type Conversion
# Check sample characteristics
sample_characteristics_dict = {
    0: ['tissue: adrenal pheochromocytoma', 'tissue: thoracic paraganglioma', 'tissue: abdominal paraganglioma', 'tissue: carotid paraganglioma', 'tissue: normal adrenal medulla'],
    1: ['primary mutation: SDHB', 'primary mutation: SDHD', 'primary mutation: RET', 'primary mutation: VHL', 'primary mutation: NF1', 'primary mutation: TMEM127', 'primary mutation: FPCC', 'primary mutation: none'],
    2: ['gender: male', 'gender: female', 'gender: na'],
    3: ['age: 22', 'age: 14', 'age: 32', 'age: na', 'age: 13', 'age: 52', 'age: 11', 'age: 10', 'age: 21', 'age: 20', 'age: 17', 'age: 31', 'age: 24', 'age: 29', 'age: 34', 'age: 18', 'age: 45', 'age: 36', 'age: 39', 'age: 61', 'age: 15', 'age: 25', 'age: 37', 'age: 30', 'age: 47', 'age: 48', 'age: 38', 'age: 58', 'age: 54', 'age: 46']
}

def get_key_from_value(value_list, target_list):
    for key, values in value_list.items():
        if any(item for item in values if item in target_list):
            return key
    return None

# Check availability of Pheochromocytoma_and_Paraganglioma
trait_row = get_key_from_value(sample_characteristics_dict, ['tissue: adrenal pheochromocytoma', 'tissue: thoracic paraganglioma', 'tissue: abdominal paraganglioma', 'tissue: carotid paraganglioma'])

# Check availability of age
age_row = get_key_from_value(sample_characteristics_dict, ['age: '])

# Check availability of gender
gender_row = get_key_from_value(sample_characteristics_dict, ['gender: male', 'gender: female', 'gender: na'])

# 2.3 Data Type Conversion
# For Pheochromocytoma_and_Paraganglioma (binary)
def convert_trait(value):
    value = value.split(': ')[-1]
    if 'pheochromocytoma' in value.lower() or 'paraganglioma' in value.lower():
        return 1
    elif 'normal adrenal medulla' in value.lower():
        return 0
    else:
        return None

# For age (continuous)
def convert_age(value):
    value = value.split(': ')[-1]
    try:
        return int(value)
    except ValueError:
        return None

# For gender (binary)
def convert_gender(value):
    value = value.split(': ')[-1]
    if value.lower() == 'male':
        return 1
    elif value.lower() == 'female':
        return 0
    else:
        return None

# 3. Save Metadata
save_cohort_info('GSE29742', './preprocessed/Pheochromocytoma_and_Paraganglioma/cohort_info.json', is_gene_available, trait_row is not None)

# 4. Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Pheochromocytoma_and_Paraganglioma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Pheochromocytoma_and_Paraganglioma/trait_data/GSE29742.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM737370': [1, 1], 'GSM737371': [1, 1], 'GSM737372': [1, 0], 'GSM737373': [1, 1], 'GSM737374': [1, 1], 'GSM737375': [1, 1], 'GSM737376': [1, 0], 'GSM737377': [1, 1], 'GSM737378': [1, 0], 'GSM737379': [1, 0], 'GSM737380': [1, 0], 'GSM737381': [1, 0], 'GSM737382': [1, 1], 'GSM737383': [1, 0], 'GSM737384': [1, 0], 'GSM737385': [1, 0], 'GSM737386': [1, 1], 'GSM737387': [1, 0], 'GSM737388': [1, 0], 'GSM737389': [1, 0], 'GSM737390': [1, 1], 'GSM737391': [1, 1], 'GSM737392': [1, 0], 'GSM737393': [1, 1], 'GSM737394': [1, 1], 'GSM737395': [1, 1], 'GSM737396': [1, 1], 'GSM737397': [1, 1], 'GSM737398': [1, 1], 'GSM737399': [1, 0], 'GSM737400': [1, 1], 'GSM737401': [1, 0], 'GSM737402': [1, 1], 'GSM737403': [1, 0], 'GSM737404': [1, 1], 'GSM737405': [1, 1], 'GSM737406': [1, 0], 'GSM737407': [1, 1], 'GSM737408': [1, 0], 'GSM737409': [1, 0], 'GSM737410': [1, 0], 'GSM737411': [1, None], 'GSM737412': [1, 0], 'GSM737413': [1, 0], 'GSM737414': [1, 0], 'GSM737415': [1, 1], 'GSM737416': [1, 0], 'GSM73741