In [1]:

import os
import sys
# Extend the module search path with a local checkout directory, presumably so
# that the project-local `utils` package imported by a later cell resolves.
# NOTE(review): this absolute path is machine-specific — confirm or replace it
# before running the notebook on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- 1. Locate the cohort's input files --------------------------------------
# The helper returns a (soft file, matrix file) pair for the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Testicular_Cancer/GSE12630'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- 2. Read background lines and per-sample clinical lines ------------------
# Lines of the matrix file are selected by the prefixes listed below.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- 3. Summarise the distinct values found in each clinical row -------------
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- 4. Show both results so the next step can be written against them -------
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profiles of poorly differentiated, undifferentiated and metastatic cancers"
!Series_summary	"The tissue of origin form metastatic tumors is sometimes difficult to identify from clinical and histologic information.  Gene expression signatures are one potential method for identifying the tissue of origin. In the development of algorithms to identify tissue of origin, a collection of human tumor metastatic specimens with known primary sites or primary tumors with poor differentiation are very useful in identifying gene expressions signatures that can classify unknown specimens as to the tissue of origin.  Here we describe a series of 276 such tumor specimens used for this purpose. The specimens are poorly differentiated, undifferentiated and metastatic specimens from tumors of the following types/tissues of origin: breast, liver, non-Hodgkin's lymphoma, non-small cell lung cancer, ovary, testicular germ cell, thyroid, kidney, pancreas

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is assumed available until the checks below say otherwise.
is_gene_available = False
trait_row, age_row, gender_row = None, None, None  # row keys, filled in below when found
convert_trait, convert_age, convert_gender = None, None, None  # converters are defined further down

# Part 1: Gene Expression Data Availability
# Judged from the GEO dataset description and platform (Affymetrix GeneChip).
is_gene_available = True

# Hand-written copy of the sample characteristics dictionary.
# NOTE(review): this literal is not derived from the dictionary computed in
# Step 1 — confirm the two agree, or switch to the computed one.
sample_char_dict = {
    0: ['Anatomical sites: Ovary', 'Anatomical sites: Breast', 'Anatomical sites: Lung', 'Anatomical sites: Lymph node', 'Anatomical sites: Colon', 'Anatomical sites: Thigh mass', 'Anatomical sites: Adrenal gland', 'Anatomical sites: Thyroid', 'Anatomical sites: Stomach', 'Anatomical sites: Liver', 'Anatomical sites: Pancreas', 'Anatomical sites: Omentum', 'Anatomical sites: Pancreas/Duod.', 'Anatomical sites: Spleen/soft tissue', 'Anatomical sites: Thyroid gland', 'Anatomical sites: Skin', 'Anatomical sites: Small intestine', 'Anatomical sites: Small bowel', 'Anatomical sites: Small Bowel', 'Anatomical sites: Spleen', 'Anatomical sites: Stomach/Duod/Pancreas', 'Anatomical sites: Adominal sarcoma', 'Anatomical sites: Kidney', 'Anatomical sites: Soft Tissue', 'Anatomical sites: Testicle', 'Anatomical sites: Adrenal Gland', 'Anatomical sites: Lymph Node', 'Anatomical sites: Umbilical nodule', 'Anatomical sites: Jejunum', 'Anatomical sites: Neck'],
    1: ['gender: F', 'gender: M', 'gender: -'],
    2: ['age: 50-59', 'age: 60-69', 'age: 40-49', 'age: 70-79', 'age: 80-89', 'age: 20-29', 'age: 30-39', 'age: 10-19', 'age: 80-86', 'age: 50-57'],
    3: ['TNM: 3 to 4', 'TNM: 4', 'TNM: -', 'TNM: 3', 'TNM: G2', 'TNM: G3', 'TNM: WHO grade 1', 'TNM: Fuhrman G3', 'TNM: 2 to 3', 'TNM: Fuhrman G4', 'TNM: WHO grade II', 'TNM: Gleason = 7/10', 'TNM: G4', 'TNM: G3-4', 'TNM: Gleason = 6/10'],
}

# Part 2: Variable Availability
# Each variable is recognised by a substring marker; the first row (in
# dictionary order) holding any value with that marker is taken as its row.
_row_markers = (
    ("trait", "Testicle"),   # 'Testicular_Cancer'
    ("age", "age:"),
    ("gender", "gender:"),
)
_first_hit = {}
for row_idx, entries in sample_char_dict.items():
    for field, marker in _row_markers:
        if field in _first_hit:
            continue  # already located in an earlier row
        if any(marker in entry for entry in entries):
            _first_hit[field] = row_idx

trait_row = _first_hit.get("trait")
age_row = _first_hit.get("age")
gender_row = _first_hit.get("gender")

# 2.3 Data Type Conversion Functions
def convert_trait(value):
    """Encode the trait as an integer flag.

    Returns 1 when ``value`` contains the substring "Testicle" and 0
    otherwise. The check is case-sensitive.
    """
    return int("Testicle" in value)

def convert_age(value):
    """Convert a raw ``'age: <lo>-<hi>'`` (or ``'age: <n>'``) entry to an int.

    The text after the first colon is taken, and the part before the first
    ``-`` is parsed as an integer, so an age bracket such as ``'age: 50-59'``
    yields its lower bound (50).

    Args:
        value: Raw sample-characteristics entry, normally a string.

    Returns:
        The parsed integer age, or ``None`` when the entry has no colon, the
        age part is not an integer (e.g. ``'age: -'``), or ``value`` is not a
        string.
    """
    try:
        age_text = value.split(':')[1].strip()
        return int(age_text.split('-')[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing failures map to "unknown"; anything else (notably
        # KeyboardInterrupt / SystemExit) must propagate instead of being
        # silently swallowed by a bare ``except``.
        return None

def convert_gender(value):
    """Convert a raw ``'gender: <x>'`` entry to a binary code.

    Only the text after the first colon is inspected (the whole string when
    there is no colon), compared case-insensitively against exact tokens.
    Matching exact tokens rather than searching the whole string for the
    letters "F"/"M" keeps labels such as ``'Sex (M/F): M'`` and values such as
    ``'gender: Missing'`` from being misclassified.

    Args:
        value: Raw sample-characteristics entry, normally a string.

    Returns:
        0 for female (``F`` / ``Female``), 1 for male (``M`` / ``Male``), and
        ``None`` for anything else, including placeholders like ``'-'`` and
        non-string input.
    """
    if not isinstance(value, str):
        # Missing cells (e.g. None / NaN) carry no gender information.
        return None

    token = value.split(':', 1)[-1].strip().upper()
    if token in ('F', 'FEMALE'):
        return 0
    if token in ('M', 'MALE'):
        return 1
    return None

# Save Metadata
# The trait counts as available exactly when a trait row was identified above.
cohort_info_path = './preprocessed/Testicular_Cancer/cohort_info.json'
is_trait_available = trait_row is not None
# Create the output directory up front so a fresh checkout does not fail on a
# missing folder. NOTE(review): save_cohort_info may already do this itself;
# the call is harmless either way.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE12630', cohort_info_path, is_gene_available, is_trait_available)

# Clinical Feature Extraction
# Only performed when a trait row exists; the selected features are written to
# CSV and a preview is printed.
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Testicular_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Testicular_Cancer/trait_data/GSE12630.csv'
    # to_csv does not create missing parent directories (assuming a pandas
    # DataFrame here), so make sure the target folder exists first.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM319520': [0, 60, 0], 'GSM319521': [0, 60, 1], 'GSM319522': [0, 60, 1], 'GSM319523': [0, 50, 1], 'GSM319524': [0, 50, 0], 'GSM319525': [0, 50, 1], 'GSM319526': [0, 40, 1], 'GSM319527': [0, 60, 1], 'GSM319528': [0, 60, 1], 'GSM319529': [0, 50, 1], 'GSM319530': [0, 50, 1], 'GSM319531': [0, 50, 1], 'GSM319532': [0, 20, 1], 'GSM319533': [0, 40, 1], 'GSM319534': [0, 20, 1], 'GSM319535': [0, 10, 0], 'GSM319536': [0, 60, 1], 'GSM319537': [0, 30, 0], 'GSM319538': [0, 50, 1], 'GSM319539': [0, 40, 0], 'GSM319540': [0, 60, 1], 'GSM319541': [0, 20, 0], 'GSM319542': [0, 50, 1], 'GSM319543': [0, 40, 0], 'GSM319544': [0, 50, 0], 'GSM319545': [0, 20, 0], 'GSM319546': [0, 70, 1], 'GSM319547': [0, 70, 1], 'GSM319548': [0, 10, 0], 'GSM319549': [0, 50, 1], 'GSM319550': [0, 60, 0], 'GSM319551': [0, 60, 0], 'GSM319552': [0, 50, 0], 'GSM319553': [0, 60, 1], 'GSM319554': [0, 60, 1], 'GSM319555': [0, 40, 1], 'GSM319556': [0, 70, 1], 'GSM319557': [0, 70, 1], 'GSM319558': [0, 30, 1], 'GSM319559': [0, 50, 1],

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene data from the matrix file located in Step 1, using the
#    library's get_genetic_data helper.
gene_data = get_genetic_data(matrix_file)

# 2. Display the first 20 row ids; the following step works from these.
leading_row_ids = gene_data.index[:20]
print(leading_row_ids)


Index(['AFFX-BioB-3_at', 'AFFX-BioB-5_at', 'AFFX-BioB-M_at', 'AFFX-BioC-3_at',
       'AFFX-BioC-5_at', 'AFFX-BioDn-3_at', 'AFFX-BioDn-5_at',
       'AFFX-CreX-3_at', 'AFFX-CreX-5_at', 'AFFX-DapX-3_at', 'AFFX-DapX-5_at',
       'AFFX-DapX-M_at', 'AFFX-HSAC07/X00351_3_at', 'AFFX-HSAC07/X00351_5_at',
       'AFFX-HSAC07/X00351_M_at', 'AFFX-hum_alu_at',
       'AFFX-HUMGAPDH/M33197_3_at', 'AFFX-HUMGAPDH/M33197_5_at',
       'AFFX-HUMGAPDH/M33197_M_at', 'AFFX-HUMISGF3A/M97935_3_at'],
      dtype='object', name='ID')
