In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT and series-matrix files that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Colon_and_Rectal_Cancer/GSE46517'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes selecting the series-level background lines and the
# per-sample characteristics lines of the matrix file.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Collapse the clinical table into {row index: unique values} for inspection.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information, each under its own heading line.
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"Human melanoma samples comparing nevi and primary and metastatic melanoma"
!Series_summary	"We sought to identify genes and gene signatures which correlate with progression by sampling human melanomas from nevi, primary, and metastatic tumors. The large number of samples also permits analysis within groups."
!Series_overall_design	"Human melanoma samples were isolated from historical frozen patient specimens. RNA was extracted and run on the human Affymetrix U133A microarray chip."
Sample Characteristics Dictionary:
{0: ['tissue type: Metastatic Melanoma', 'tissue type: Primary Melanoma', 'tissue type: Nevus', 'tissue type: Normal Skin', 'tissue type: Normal Epithelial Melanocytes'], 1: [nan, 'patient id: 35', 'patient id: 23', 'patient id: 13', 'patient id: 6', 'patient id: 40', 'patient id: 26', 'patient id: 1', 'patient id: 22', 'patient id: 4', 'patient id: 30', 'patient id: 5', 'patient id: 15', 'patient id: 20', 'patient id: 17', 'patient id

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability flags and row indices for this cohort. The converter names are
# initialised to None here and bound to real functions further down the cell.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

import numpy as np

# Step 1: Identify if gene expression data is available.
# The overall-design line is reproduced verbatim; the platform name in it is
# what marks the series as a gene-expression dataset.
series_design = "!Series_overall_design\t\"Human melanoma samples were isolated from historical frozen patient specimens. RNA was extracted and run on the human Affymetrix U133A microarray chip.\""
if "Affymetrix U133A microarray" in series_design:
    is_gene_available = True

# Step 2.1: Identify available data variables and respective rows.
# Hard-coded subset of sample-characteristics rows (presumably copied from the
# Step 1 printout - TODO confirm it still matches `sample_characteristics_dict`).
# A single row can mix several `field: value` kinds.
sample_characteristics = {
    0: ['tissue type: Metastatic Melanoma', 'tissue type: Primary Melanoma', 'tissue type: Nevus', 'tissue type: Normal Skin', 'tissue type: Normal Epithelial Melanocytes'],
    8: [np.nan, 'location: cutaneous', 'organ: skin', 'location: cutaneous/subcutaneous', 'location: subcutaneous', 'organ: Lymph Node', 'date of resection: Feb-93', 'organ: colon/peritoneal', 'location: Lymph Node', 'gender: male', 'gender: female'],
    19: [np.nan, 'subsequent metastasis: 7/00 cutis, spleen; 9/01 cutis, liver.', 'subsequent metastasis: 8/97 brain, progression of pulmonal mets, progression of LNN mets axillar and mediastinal, suspected adrenal mets, suspected liver mets, progression of cutaneous mets', 'prior ctx, rtx, itx: 2 cycles polychemotherapy (DTIC, Cisplatin, Vindesine)', 'subsequent metastasis: 9/99 brain, LNN', 'subsequent metastasis: 2/02 lung, soft tissue, GI', 'first instance [fi] or recurrance [r]: FI', 'prior ctx, rtx, itx: from 8/99 4 cycles of polychemotherapy (Vindesine + Cisplatin), radiation therapy (Lnn left axilla) 12/99', 'prior ctx, rtx, itx: no further clinical data', 'subsequent metastasis: 10/94: lungs', 'prior ctx, rtx, itx: 10 cycles of polychemotherapy (DTIC + Vindesine + Carboplatin), 3 cycles of polychemotherapy (Taxol+Tamoxifen+Cisplatin), radiation therapy brain (30 Gy)', 'subsequent metastasis: 7/98: lungs, intraabdominal, s.c., brain', 'subsequent metastasis: no further clinical data', 'subsequent metastasis: 7/00 cutis, spleen; 9/01 cutis, liver.', 'subsequent metastasis: 8/97 brain, progression of known mets (s.c., lungs, liver, peritoneum, mediastinum)', 'subsequent metastasis: 9/01 Lnn cervical right; 3/02 brain (single lesion) g-knife; 11/02 progression of brain met: chemotherapy Temodal'],
    9: [np.nan, 'organ: skin', 'date of resection: May-96', 'age at time of resection: 30y 6m', 'date of resection: Jan-95', 'date of resection: Sep-95', 'age at time of resection: 28y', 'age at time of resection: 59y 7m'],
    12: [np.nan, 'gender: female', 'gender: male', 'race: caucasian', 'clinical stage: 4 new AJCC', 'prior ctx, rtx, itx: no'],
}


def _last_row_with_field(characteristics, keyword):
    """Return the key of the last row that has an entry whose field name contains `keyword`.

    The field name is the text before the first ':' of a `field: value` entry.
    It is lowercased and split on non-alphanumeric characters, and `keyword`
    must equal one of the resulting words. Whole-word matching is what keeps
    'clinical stage' from being mistaken for an 'age' field. Non-string
    entries (e.g. NaN) are skipped. Returns None when no row matches.
    """
    found = None
    for row_key, row_values in characteristics.items():
        for entry in row_values:
            if not isinstance(entry, str):
                continue
            field = entry.split(':', 1)[0].lower()
            words = ''.join(ch if ch.isalnum() else ' ' for ch in field).split()
            if keyword in words:
                found = row_key  # later rows override earlier ones
                break
    return found


# Colon and Rectal Cancer trait availability check (last matching row wins).
# NOTE(review): in this dictionary the only match is 'organ: colon/peritoneal'
# in row 8, which reads like an organ site in a melanoma series rather than a
# colorectal-cancer diagnosis - confirm whether the trait is really available.
for key, values in sample_characteristics.items():
    if any(isinstance(value, str) and ('colon' in value.lower() or 'rectal' in value.lower()) for value in values):
        trait_row = key

# Age variable availability check: match the field name, not any substring,
# so that 'clinical stage: ...' no longer selects a row without ages.
age_row = _last_row_with_field(sample_characteristics, 'age')

# Gender variable availability check (field name must contain 'gender').
gender_row = _last_row_with_field(sample_characteristics, 'gender')

# Data Type Conversion Functions
def convert_trait(value):
    """Turn a sample-characteristics entry into a binary colon/rectal flag.

    Returns None for missing input (None or a float NaN), 1 when the
    lowercased entry contains 'colon' or 'rectal', and 0 for any other text.

    NOTE(review): the match is a substring test over the whole entry, so an
    entry such as 'organ: colon/peritoneal' also yields 1.
    """
    is_missing = value is None or (isinstance(value, float) and np.isnan(value))
    if is_missing:
        return None
    lowered = value.lower()
    return 1 if ('colon' in lowered or 'rectal' in lowered) else 0

def convert_age(value):
    """Convert a sample-characteristics entry into an age in whole years.

    Parameters
    ----------
    value : str, float or None
        A `field: value` entry such as 'age at time of resection: 30y 6m',
        or a missing marker (None / NaN).

    Returns
    -------
    float or None
        The number that precedes the first 'y' in the text after the last
        ':' (so '30y 6m' gives 30.0; the months part is ignored). None is
        returned for missing or non-text input, for entries whose field name
        does not contain the word 'age', for text that is not numeric, and
        for non-finite numbers.
    """
    if not isinstance(value, str):
        # None, NaN and any other non-text cell carry no age information.
        return None

    parts = value.split(':')
    if len(parts) > 1:
        # Rows can mix several fields, so only trust entries labelled as an
        # age. Whole-word matching keeps 'clinical stage: 4' from being
        # read as an age of 4.
        field = parts[0].lower()
        field_words = ''.join(ch if ch.isalnum() else ' ' for ch in field).split()
        if 'age' not in field_words:
            return None

    years_text = parts[-1].strip().split('y')[0]
    try:
        age = float(years_text)
    except ValueError:
        # Non-numeric payload, e.g. a date such as 'May-96'.
        return None
    if not np.isfinite(age):
        # float() accepts 'nan' and 'inf'; neither is a usable age.
        return None
    return age

def convert_gender(value):
    """Encode a sample-characteristics entry as a binary gender value.

    Returns None for missing input (None or a float NaN), 0 when the
    lowercased entry contains 'female', 1 when it contains 'male', and None
    when neither word is present.
    """
    missing = value is None or (isinstance(value, float) and np.isnan(value))
    if missing:
        return None
    text = value.lower()
    # 'female' contains 'male', so it has to be tested first.
    for label, code in (('female', 0), ('male', 1)):
        if label in text:
            return code
    return None

# Record gene/trait availability for this cohort via save_cohort_info.
save_cohort_info(
    'GSE46517',
    './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are only extracted when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Colon_and_Rectal_Cancer',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Colon_and_Rectal_Cancer/trait_data/GSE46517.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1131566': [None, None, None], 'GSM1131567': [None, None, None], 'GSM1131568': [None, None, None], 'GSM1131569': [None, None, None], 'GSM1131570': [None, None, None], 'GSM1131571': [None, None, None], 'GSM1131572': [None, None, None], 'GSM1131573': [None, None, None], 'GSM1131574': [None, None, None], 'GSM1131575': [None, None, None], 'GSM1131576': [None, None, None], 'GSM1131577': [None, None, None], 'GSM1131578': [None, None, None], 'GSM1131579': [None, None, None], 'GSM1131580': [None, None, None], 'GSM1131581': [None, None, None], 'GSM1131582': [None, None, None], 'GSM1131583': [None, None, None], 'GSM1131584': [None, None, None], 'GSM1131585': [None, None, None], 'GSM1131586': [None, None, None], 'GSM1131587': [0, None, 0], 'GSM1131588': [0, None, 1], 'GSM1131589': [0, None, 1], 'GSM1131590': [0, None, None], 'GSM1131591': [0, None, None], 'GSM1131592': [0, None, 1], 'GSM1131593': [0, None, 0], 'GSM1131594': [0, None, 1], 'GSM1131595': [0, None, None], 'GSM1131596': [0, None, 

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the expression table from the series-matrix file located in Step 1.
#    `get_genetic_data` comes from the star-import of `utils.preprocess`; the
#    printed result below shows the returned frame is indexed by 'ID'.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs so the next step can judge whether they are
#    gene symbols or need to be mapped.
print(gene_data.index[:20])


Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at', '1438_at', '1487_at',
       '1494_f_at', '1598_g_at', '160020_at', '1729_at', '1773_at', '177_at',
       '179_at', '1861_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '1007_s_at', '1053_at') are probe-style
# identifiers rather than gene symbols, so they must be mapped to genes first.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Read the probe annotation table from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation so the identifier column and the gene-symbol
#    column can be chosen for the mapping step.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X69699 /FEATURE= /DEF

### Step 6: Gene Identifier Mapping

In [7]:
# Substep 1: Annotation columns holding the probe identifier and the gene symbol.
# 'ID' is visible in the annotation preview; 'Gene Symbol' is not visible in the
# truncated preview output - TODO confirm the column exists under that name.
identifier_key = 'ID'
gene_symbol_key = 'Gene Symbol'

# Substep 2: Get the dataframe storing the mapping between probe IDs and genes
gene_mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# Substep 3: Apply the mapping to get the gene expression dataframe.
# NOTE(review): `gene_data` is overwritten with its own mapped version, so
# re-running this cell without re-running Step 3 would feed already-mapped
# data back into the mapping.
gene_data = apply_gene_mapping(gene_data, gene_mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index with 'normalize_gene_symbols_in_index'
#    and save the resulting gene-level table.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Colon_and_Rectal_Cancer/gene_data/GSE46517.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with 'geo_merge_clinical_genetic_data'.
#    `selected_clinical_data` is only created in Step 2 when a trait row was
#    found, so the merge is skipped (instead of raising NameError) when
#    `trait_row` is None.
merged_data = None
if trait_row is not None:
    merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# The trait is usable only if the merge happened, the trait column exists and
# it holds at least one non-null value.
has_trait_values = (
    merged_data is not None
    and 'Colon_and_Rectal_Cancer' in merged_data.columns
    and not merged_data['Colon_and_Rectal_Cancer'].isnull().all()
)

if has_trait_values:
    # 3. Determine whether the trait and some demographic attributes in the data
    #    are severely biased, and remove biased attributes.
    trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Colon_and_Rectal_Cancer')

    # 4. Save the cohort information.
    save_cohort_info('GSE46517', './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json', True, True, trait_biased, merged_data)

    if not trait_biased:
        # 5. If the trait is not severely biased, save the merged data to a csv file.
        csv_path = './preprocessed/Colon_and_Rectal_Cancer/GSE46517.csv'
        unbiased_merged_data.to_csv(csv_path)
else:
    # No usable trait values: skip the bias check and record that in the cohort info.
    print("No valid data available for 'Colon_and_Rectal_Cancer'. Skipping bias check and saving steps.")

    # Save cohort information with a note about the lack of valid data
    save_cohort_info('GSE46517', './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json', True, False, note='No valid data for Colon_and_Rectal_Cancer')


No valid data available for 'Colon_and_Rectal_Cancer'. Skipping bias check and saving steps.
A new JSON file was created at: ./preprocessed/Colon_and_Rectal_Cancer/cohort_info.json
