In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the soft file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Colon_and_Rectal_Cancer/GSE15781'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pull the series-level background text and the
# per-sample clinical rows out of the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print each label followed by its content on the next line, so the following
# step can inspect the background text and the available characteristics.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"New specific molecular targets for radiochemotherapy in colorectal cancer"
!Series_summary	"A promising treatment for patients with advanced colorectal cancer is preoperative radiochemotherapy. The early side effects of this treatment have been considered to be acceptable. The aim of this study was to identify the effects of preoperative radiochemotherapy (PRT) on gene expression in tumour and normal colon rectal tissue form the same patients, before and after PRT.  For that purpose, tissue samples from ten patients with operable rectal adenocarcinomas were collected for use in whole genome–microarray based gene expression analysis. A factorial experimental design allowed us to look solely at the radiation effect on tumours. This resulted in 4496 differentially expressed genes in tumour tissue with p<0.05. In addition to known markers for radiochemotherapy, a Gene Set Enrichment Analysis (GSEA) showed a significant enrichment in gene sets associat

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; the real converters are bound further down in this cell.
convert_trait = None
convert_age = None
convert_gender = None

# The series summary describes a whole-genome microarray gene expression
# analysis, so gene expression data is treated as present.
is_gene_available = True

# Row keys of the sample characteristics table for 'Colon_and_Rectal_Cancer',
# 'age' and 'gender' (None means the attribute is not available).
# Row 1 is taken to hold 'treatment: non-irradiated' / 'treatment: irradiated'
# and is used to infer the trait; no age or gender rows were identified.
# NOTE(review): irradiation status is a treatment label, not a cancer label --
# confirm that using it as the trait proxy is intended.
trait_row, age_row, gender_row = 1, None, None

# Define data conversion functions
def convert_trait(value):
    """Convert a raw 'treatment: ...' characteristics cell to a binary label.

    The text between the first and second colon is stripped and compared
    against the two known treatment labels.

    Args:
        value: Raw cell content, expected to look like 'treatment: irradiated'.

    Returns:
        0 for 'non-irradiated', 1 for 'irradiated', and None for anything
        else (no colon, an unknown label, or a non-string value).

    NOTE(review): the result is stored under the 'Colon_and_Rectal_Cancer'
    trait although it encodes irradiation status -- confirm this is intended.
    """
    # Only str has .split; a non-string cell (e.g. None or a float NaN) used to
    # raise AttributeError, which the former `except IndexError` did not catch.
    if not isinstance(value, str):
        return None

    parts = value.split(":")
    if len(parts) < 2:
        # No colon separator, so there is no label to read.
        return None

    label_to_code = {'non-irradiated': 0, 'irradiated': 1}
    return label_to_code.get(parts[1].strip())

# No age or gender rows were identified for this cohort (age_row and gender_row
# are None), so these converters are placeholders that always return None.
def convert_age(value):
    """Return None: no age data is available for this cohort."""
    return None


def convert_gender(value):
    """Return None: no gender data is available for this cohort."""
    return None


# Save Metadata
save_cohort_info('GSE15781', './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction if trait_row is not None
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Colon_and_Rectal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Colon_and_Rectal_Cancer/trait_data/GSE15781.csv'
    # DataFrame.to_csv does not create missing parent directories; make sure
    # the target folder exists so a fresh checkout does not fail with OSError.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM396309': [0], 'GSM396310': [0], 'GSM396311': [0], 'GSM396312': [0], 'GSM396313': [0], 'GSM396314': [0], 'GSM396315': [0], 'GSM396316': [0], 'GSM396317': [0], 'GSM396318': [0], 'GSM396319': [0], 'GSM396320': [0], 'GSM396321': [0], 'GSM396322': [0], 'GSM396323': [0], 'GSM396324': [0], 'GSM396325': [0], 'GSM396326': [0], 'GSM396327': [0], 'GSM396328': [0], 'GSM396329': [0], 'GSM396330': [0], 'GSM396331': [0], 'GSM396332': [1], 'GSM396333': [1], 'GSM396334': [1], 'GSM396335': [1], 'GSM396336': [1], 'GSM396337': [1], 'GSM396338': [1], 'GSM396339': [1], 'GSM396340': [1], 'GSM396341': [1], 'GSM396342': [1], 'GSM396343': [1], 'GSM396344': [1], 'GSM396345': [1], 'GSM396346': [1], 'GSM396347': [1], 'GSM396348': [1], 'GSM396349': [1], 'GSM396350': [1]}


### Step 3: Gene Data Extraction

In [4]:
# Load the gene data from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row ids; the next step uses them to decide whether the
# identifiers need to be mapped.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['100002', '100003', '100027', '100036', '100037', '100039', '100044',
       '100045', '100051', '100052', '100057', '100058', '100060', '100062',
       '100064', '100079', '100089', '100093', '100095', '100100'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 (e.g. '100002', '100003') are numeric
# identifiers, not gene symbols, so they are mapped to symbols in Step 6.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# Print a labelled preview of the annotation so the identifier and gene symbol
# columns can be chosen in the next step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['156427', '139282', '131316', '235751', '696770'], 'GeneID': ['hCG1811459.2', 'hCG2022854', 'hCG20768.4', 'hCG2024647', 'hCG2038184'], 'GENE': [nan, '2596', '10243', nan, nan], 'ORF_LIST': [nan, '2596', '10243', nan, nan], 'Gene Name': [nan, 'growth associated protein 43', 'gephyrin', nan, nan], 'Gene Symbol': [nan, 'GAP43', 'GPHN', nan, nan], 'Celera Transcript': ['hCT2328995.1,hCT1643564.3', 'hCT2319628,hCT2319630', 'hCT11848.4,hCT1955809.2', 'hCT2322598', 'hCT2342610'], 'GB_LIST': [nan, 'NM_002045.2', 'NM_020806.3', nan, nan], 'MGC': [nan, 'BC007936.2', 'BC030016.2', nan, nan], 'GenBank': [nan, 'CR612258.1,M25667.1,BT019771.1,CR614045.1,CR607941.1', 'AJ272033.1,AB037806.1,AF272663.1,AJ272343.1', nan, nan], 'Ensembl Transcript': [nan, 'ENST00000358762,ENST00000305124', 'ENST00000315266,ENST00000305960', nan, nan], 'ESTs': [nan, 'AL519551.3,BG184834.1,CD613833.1,BM720621.1,BP199413.1', 'AA232725.1,CD630404.1,CX784062.1,BX480248.1,CR736255.1', nan, nan]

### Step 6: Gene Identifier Mapping

In [7]:
# Annotation columns used for the mapping: 'ID' for the probe IDs and
# 'Gene Symbol' for the gene symbols.
identifier_key, gene_symbol_key = 'ID', 'Gene Symbol'

# Build the dataframe that stores the mapping between probe IDs and genes.
mapping_df = get_gene_mapping(
    gene_annotation,
    identifier_key,
    gene_symbol_key,
)

# Apply the mapping to the gene data. gene_data is overwritten here, so
# re-running this cell on its own would apply the mapping to the already
# mapped data; re-run Step 3 first.
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Colon_and_Rectal_Cancer/gene_data/GSE15781.csv'
# DataFrame.to_csv does not create missing parent directories; make sure the
# target folder exists so a fresh checkout does not fail with OSError.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Colon_and_Rectal_Cancer')

# 4. Save the cohort information.
save_cohort_info('GSE15781', './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Colon_and_Rectal_Cancer/GSE15781.csv'
    # Same guard as for the gene data: create the output folder if it is missing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Colon_and_Rectal_Cancer', the least common label is '1.0' with 19 occurrences. This represents 45.24% of the dataset.
The distribution of the feature 'Colon_and_Rectal_Cancer' in this dataset is fine.

