In [1]:

import sys
# Add the project checkout to the module search path so that later
# `from utils.preprocess import *` can resolve (presumably `utils` lives under this
# directory — verify).
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Adrenocortical_Cancer/GSE169253'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Extract the series-level background text and the per-sample clinical table
#    from the matrix file, using the prefix lists below
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a {row index: unique values} dictionary
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text, then the characteristics summary, each under its header
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"microRNA expression profile of pediatric adrenocortical tumors"
!Series_summary	"Here, we used a microarray technique to provide miRNA expression data of a set of 37 adrenocortical tumors (ACT) and 9 non-neoplastic adrenal controls from Brazilian patients assisted in two treatment centers in the state of São Paulo (HC-FMRP-USP and Centro Infantil Boldrini of Campinas)."
!Series_overall_design	"We identified miRNA signatures associated with pediatric adrenocortical tumors and patients' outcome."
Sample Characteristics Dictionary:
{0: ['tissue: Tumor', 'tissue: Non-neoplastic Adrenal'], 1: ['gender: Female', 'gender: Male', nan], 2: ['age at diagnosis (months): 101', 'age at diagnosis (months): 13', 'age at diagnosis (months): 12', 'age at diagnosis (months): 29', 'age at diagnosis (months): 18', 'age at diagnosis (months): 137', 'age at diagnosis (months): 16', 'age at diagnosis (months): 95', 'age at diagnosis (months): 92', 'age at diagnosis (mon

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

import numpy as np
import pandas as pd

# Abbreviated snapshot of the Step 1 sample characteristics dictionary
# (row index -> unique "field: value" strings; NaN marks samples missing the field).
sample_characteristics = {
    0: ['tissue: Tumor', 'tissue: Non-neoplastic Adrenal'],
    1: ['gender: Female', 'gender: Male', np.nan],
    2: ['age at diagnosis (months): 101', 'age at diagnosis (months): 13', np.nan],
    3: ['sandrin stage: 2', 'sandrin stage: 1', 'sandrin stage: 4', np.nan],
    4: ['metastasis: Absent', 'metastasis: Present', np.nan],
    5: ['relapse: present', 'relapse: absent', np.nan],
    6: ['vital status: dead', 'vital status: alive', np.nan],
}

# Gene-expression data is considered unavailable when the series title/summary
# describe a miRNA-only dataset. The phrases must be searched for in the background
# text loaded in Step 1: testing them against the literal strings "!Series_title" /
# "!Series_summary" can never match and would always mark gene data as available.
background_text = str(background_info)
mirna_markers = ("microRNA expression profile", "miRNA expression data")
is_gene_available = not any(marker in background_text for marker in mirna_markers)

# Check data availability for each variable and assign appropriate row numbers.
# Each membership test is restricted to string entries because rows may contain
# NaN, and `'x' in nan` raises TypeError.

# 'Adrenocortical_Cancer' information is under the category 'tissue'
if any(isinstance(val, str) and 'Tumor' in val for val in sample_characteristics.get(0, [])):
    trait_row = 0

# 'age' at diagnosis information is available and continuous
if any(isinstance(val, str) and 'age at diagnosis' in val for val in sample_characteristics.get(2, [])):
    age_row = 2

# 'gender' information is available and binary
if any(isinstance(val, str) and 'gender' in val for val in sample_characteristics.get(1, [])):
    gender_row = 1

# Define conversion functions
def convert_trait(value):
    """Convert a 'tissue: <label>' characteristic string into a binary trait.

    Args:
        value: Raw cell value, expected to look like 'tissue: Tumor'.

    Returns:
        1 for 'Tumor', 0 for 'Non-neoplastic Adrenal', and None for missing or
        non-string values, strings without a ':' separator, or any other label.
    """
    # NaN / None / any non-string cell carries no label.
    if not isinstance(value, str):
        return None
    parts = value.split(":")
    # A string without ':' has no value part; return None instead of raising IndexError.
    if len(parts) < 2:
        return None
    label = parts[1].strip()
    if label == "Tumor":
        return 1
    if label == "Non-neoplastic Adrenal":
        return 0
    return None

def convert_age(value):
    """Convert an '<age field>: <number>' characteristic string into an integer age.

    In this cohort the field is 'age at diagnosis (months)', so the returned
    number is in months.

    Args:
        value: Raw cell value, expected to look like 'age at diagnosis (months): 101'.

    Returns:
        The age as an int, or None for missing or non-string values, strings
        without a ':' separator, or a value part that is not a whole
        non-negative number.
    """
    # NaN / None / any non-string cell carries no age.
    if not isinstance(value, str):
        return None
    parts = value.split(":")
    # A string without ':' has no value part; return None instead of raising IndexError.
    if len(parts) < 2:
        return None
    text = parts[1].strip()
    # isdecimal() only accepts characters int() can parse; isdigit() also accepts
    # characters such as superscript digits, for which int() raises ValueError.
    if text.isdecimal():
        return int(text)
    return None

def convert_gender(value):
    """Convert a 'gender: <label>' characteristic string into a binary code.

    Args:
        value: Raw cell value, expected to look like 'gender: Female'.

    Returns:
        1 for 'Male', 0 for 'Female', and None for missing or non-string
        values, strings without a ':' separator, or any other label.
    """
    # NaN / None / any non-string cell carries no label.
    if not isinstance(value, str):
        return None
    parts = value.split(":")
    # A string without ':' has no value part; return None instead of raising IndexError.
    if len(parts) < 2:
        return None
    label = parts[1].strip()
    if label == "Male":
        return 1
    if label == "Female":
        return 0
    return None

# Record the cohort's status: whether gene data is available and whether a
# trait row was identified.
save_cohort_info(
    'GSE169253',
    './preprocessed/Adrenocortical_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted and written out only when a trait row exists.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Adrenocortical_Cancer',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Adrenocortical_Cancer/trait_data/GSE169253.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM5191580': [1, 101, 0], 'GSM5191581': [1, 13, 0], 'GSM5191582': [1, 12, 1], 'GSM5191583': [1, 29, 0], 'GSM5191584': [1, 18, 0], 'GSM5191585': [1, 137, 0], 'GSM5191586': [1, 16, 0], 'GSM5191587': [1, 95, 0], 'GSM5191588': [1, 92, 0], 'GSM5191589': [1, 21, 1], 'GSM5191590': [1, 12, 0], 'GSM5191591': [1, 28, 0], 'GSM5191592': [1, 92, 0], 'GSM5191593': [1, 14, 1], 'GSM5191594': [1, 10, 0], 'GSM5191595': [1, 29, 0], 'GSM5191596': [1, 36, 0], 'GSM5191597': [1, 185, 1], 'GSM5191598': [1, 16, 0], 'GSM5191599': [1, 111, 0], 'GSM5191600': [1, 15, 0], 'GSM5191601': [1, 187, 0], 'GSM5191602': [1, 22, 0], 'GSM5191603': [1, 16, 0], 'GSM5191604': [1, 19, 0], 'GSM5191605': [1, 13, 0], 'GSM5191606': [1, 13, 0], 'GSM5191607': [1, 19, 0], 'GSM5191608': [1, 25, 0], 'GSM5191609': [1, 10, 0], 'GSM5191610': [1, 44, 0], 'GSM5191611': [1, 52, 1], 'GSM5191612': [1, 5, 1], 'GSM5191613': [1, 66, 0], 'GSM5191614': [1, 38, 0], 'GSM5191615': [1, 21, 0], 'GSM5191616': [1, 34, 0], 'GSM5191617': [0, None, None], 'G

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the expression matrix from the series-matrix file located in Step 1.
#    get_genetic_data is not defined in this notebook; presumably it comes from the
#    star import of utils.preprocess — check there for the exact return type.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs so the identifier type can be judged in the next step.
print(gene_data.index[:20])


Index(['(-)3xSLv1', 'A_25_P00010019', 'A_25_P00010020', 'A_25_P00010021',
       'A_25_P00010023', 'A_25_P00010037', 'A_25_P00010038', 'A_25_P00010039',
       'A_25_P00010040', 'A_25_P00010041', 'A_25_P00010042', 'A_25_P00010043',
       'A_25_P00010044', 'A_25_P00010047', 'A_25_P00010048', 'A_25_P00010053',
       'A_25_P00010054', 'A_25_P00010062', 'A_25_P00010063', 'A_25_P00010070'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. 'A_25_P00010019') do not look like gene
# symbols — presumably platform probe identifiers; confirm against the platform
# annotation — so flag that an identifier-to-gene mapping step is needed.
requires_gene_mapping = True
