In [1]:

# Extend the module search path before the project-local import in the next cell.
# NOTE(review): presumably this directory contains the `utils` package — confirm.
# NOTE(review): absolute, user-specific path; it must be edited to run on another machine.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
# Step 1: locate the files of GEO cohort GSE21660, read the background text and
# the per-sample characteristics, and print both for inspection.
# NOTE(review): the wildcard import is the only visible source of the helpers
# called below (and of helpers used by later cells) — check every cell before
# narrowing it to explicit names.
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific data location.
cohort_dir = '/media/techt/DATA/GEO/Adrenocortical_Cancer/GSE21660'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The two lists are passed to the helper as the line prefixes to collect;
# presumably the first selects series-level text and the second per-sample rows — TODO confirm.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# NOTE(review): the next cell rebinds `sample_characteristics_dict` to a hard-coded literal.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Advancing a Clinically Relevant Perspective of the Clonal Nature of Cancer"
!Series_summary	"We used DNA content-based flow cytometry to distinguish and isolate nuclei from clonal populations in primary tissues from three disparate cancers with variable clinical histories. We then developed a methodology to adapt flow cytometrically purified nuclei samples for use with whole genome technologies including aCGH and next generation sequencing (NGS). Our results demonstrate that selected aberrations in the genomes of distinct clonal populations in each patient create clinically relevant contexts at least with respect to the cancer types profiled in this study."
!Series_overall_design	"We applied DNA content based flow sorting to isolate the nuclei of clonal populations from tumor biopsies. Genomic DNA from each sorted population was amplified with phi29 polymerase. A 1ug aliquot of each amplified sample was digested with DNAse 1 then labeled with Cy5 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Determine if this dataset contains gene expression data
# Given that the dataset mentions aCGH arrays, it is not gene expression data
is_gene_available = False

# Row indices and converters start out as None and are only filled in when the
# corresponding field is found in the sample characteristics.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for the availability of data for 'Adrenocortical_Cancer', 'age', and 'gender'
# NOTE(review): Step 1 binds `sample_characteristics_dict` from the matrix file;
# this literal replaces that value, so the membership test below inspects the
# hard-coded copy rather than the loaded data. Kept as-is pending confirmation
# that the two agree.
sample_characteristics_dict = {0: ['tissue: Pancreatic Ductal Adenocarcinoma', 'tissue: Adrenal Cortical Carcinoma', 'tissue: Prostate Carcinoma']}

# Adrenocortical_Cancer is available, set the respective row and conversion function
if 'tissue: Adrenal Cortical Carcinoma' in sample_characteristics_dict[0]:
    trait_row = 0

    def convert_trait(value):
        """Map one sample-characteristics cell to a binary trait label.

        Args:
            value: Raw cell content, e.g. 'tissue: Adrenal Cortical Carcinoma'.

        Returns:
            1 when the text mentions 'Adrenal Cortical Carcinoma', 0 for any
            other text, and None when the cell is not a string (missing
            values such as NaN or None).
        """
        # A substring test on a non-string (e.g. float NaN) raises TypeError,
        # so missing cells are reported as unknown instead.
        if not isinstance(value, str):
            return None
        return 1 if 'Adrenal Cortical Carcinoma' in value else 0

# Neither age nor gender is listed in the given sample characteristics
# dictionary, so neither covariate gets a row index.
age_row = gender_row = None


# Placeholder converters for the two unavailable covariates
def convert_age(value):
    """Placeholder for the missing age field: returns None for any input."""
    return None


def convert_gender(value):
    """Placeholder for the missing gender field: returns None for any input."""
    return None

# Persist the cohort-level record and, when a trait row was identified, the
# per-sample clinical table for GSE21660.
import os  # cell-local import: only needed to prepare the output folders

# Save cohort information
cohort_info_path = './preprocessed/Adrenocortical_Cancer/cohort_info.json'
# Create the target folder first so a run from a clean working directory does
# not fail on a missing path.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE21660', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical feature extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Adrenocortical_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Adrenocortical_Cancer/trait_data/GSE21660.csv'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM540550': [0], 'GSM540551': [0], 'GSM540552': [0], 'GSM540553': [0], 'GSM540554': [0], 'GSM540555': [0], 'GSM540556': [0], 'GSM540557': [0], 'GSM540558': [0], 'GSM540559': [0], 'GSM540560': [0], 'GSM540561': [0], 'GSM540562': [0], 'GSM540563': [0], 'GSM540564': [0], 'GSM540565': [0], 'GSM540566': [0], 'GSM540567': [0], 'GSM540568': [0], 'GSM540569': [0], 'GSM540570': [0], 'GSM540571': [0], 'GSM540572': [0], 'GSM540573': [0], 'GSM540574': [0], 'GSM540575': [0], 'GSM540576': [0], 'GSM540577': [0], 'GSM540578': [0], 'GSM540579': [1], 'GSM540580': [1], 'GSM540581': [1], 'GSM540582': [1], 'GSM540583': [1], 'GSM540584': [1], 'GSM540585': [1], 'GSM540586': [1], 'GSM540587': [1], 'GSM540588': [1], 'GSM540589': [1], 'GSM540590': [1], 'GSM540591': [1], 'GSM540592': [1], 'GSM540593': [1], 'GSM540594': [1], 'GSM540595': [1], 'GSM540596': [1], 'GSM540597': [1], 'GSM540598': [1], 'GSM540599': [1], 'GSM540600': [1], 'GSM540601': [1], 'GSM540602': [1], 'GSM540603': [1], 'GSM540604': [0], 'GSM54060