In [1]:

import os
import sys
# NOTE(review): appends a hard-coded, machine-specific absolute directory to the
# module search path — presumably so the project's `utils` package imported in
# the next cell resolves; confirm, and consider making the location configurable.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Colon_and_Rectal_Cancer/GSE46862'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull the series-level background lines and the per-sample characteristics
# lines out of the matrix file, selected by their line prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe row by row (unique values per row).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of context; each label is followed by its value on a new line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Predicting multi-class responses to preoperative chemoradiotherapy in rectal patients"
!Series_summary	"The treatment strategy of rectal cancer has substantially changed in recent decades. Historically postoperative chemoradiotherapy (CRT) was considered to be the first-line therapy for stage II and III rectal cancers. However, the preoperative CRT is now considered to be the optimal therapy regimen for locally advanced rectal ancer due to its improved local control, reduced toxicity, and increased rate of sphincter preservation. Our study established a clinically practical multi-class prediction model by adopting a novel strategy that applies two separate prediction models (MI and TO predictor) sequentially to a patient to predict the response. For promising clinical practice, we validated our model in a published dataset, which is completely independent dataset from ours. This study suggests a clinically plausible prediction model that correctly

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene Expression Data Availability
# This cohort is flagged as containing gene expression data.
is_gene_available = True

# Variable Availability
# Positions of the trait (Colon_and_Rectal_Cancer), age and gender entries;
# presumably row indices into the sample-characteristics table — confirm
# against the dictionary printed in Step 1.
trait_row, age_row, gender_row = 0, 1, 2

# Converter placeholders; the function definitions that follow rebind these names.
convert_trait = None
convert_age = None
convert_gender = None

# Data Type Conversion Functions
def convert_trait(value):
    """Convert a 'label: CODE' characteristics string to an integer class.

    The text after the first colon is stripped and looked up in a fixed table:
    'MO' -> 0, 'TO' -> 1, 'MI' -> 2, 'NT' -> 3.

    NOTE(review): the codes are presumably the chemoradiotherapy response
    classes mentioned in the series summary; whether the 0-3 ordering is
    meaningful downstream should be confirmed.

    Args:
        value: Raw cell value from the sample-characteristics table.

    Returns:
        The integer class, or None when the value is not a string, has no
        colon-separated field, or carries an unrecognised code.
    """
    # Missing cells (None / NaN) are not strings; treat them as unknown
    # instead of failing on `.split`.
    if not isinstance(value, str):
        return None

    fields = value.split(':')
    # No colon means there is no value field to read.
    if len(fields) < 2:
        return None

    code_to_class = {'MO': 0, 'TO': 1, 'MI': 2, 'NT': 3}
    return code_to_class.get(fields[1].strip())

def convert_age(value):
    """Convert a 'label: NN' characteristics string to an integer age.

    Args:
        value: Raw cell value from the sample-characteristics table.

    Returns:
        The integer parsed from the text after the first colon, or None when
        the value is not a string, has no colon-separated field, or the field
        is not a valid integer literal.
    """
    # Missing cells (None / NaN) are not strings and have no `.split`;
    # report them as unknown rather than raising AttributeError.
    if not isinstance(value, str):
        return None
    try:
        return int(value.split(':')[1].strip())
    except (ValueError, IndexError):
        # IndexError: no colon in the string; ValueError: field is not an integer.
        return None

def convert_gender(value):
    """Convert a 'label: female|male' characteristics string to a binary code.

    The text after the first colon is stripped and lower-cased before lookup,
    so the match is case-insensitive: 'female' -> 0, 'male' -> 1.

    Args:
        value: Raw cell value from the sample-characteristics table.

    Returns:
        0 for female, 1 for male, or None when the value is not a string, has
        no colon-separated field, or holds any other text.
    """
    # Missing cells (None / NaN) are not strings; treat them as unknown
    # instead of failing on `.split`.
    if not isinstance(value, str):
        return None

    fields = value.split(':')
    # No colon means there is no value field to read.
    if len(fields) < 2:
        return None

    return {'female': 0, 'male': 1}.get(fields[1].strip().lower())

# Save Metadata
# Save Metadata
# Passes the gene-availability flag and whether a trait row was identified;
# presumably these are recorded in the cohort info JSON — confirm in save_cohort_info.
save_cohort_info('GSE46862', './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
# Select the trait, age and gender rows and run each through its converter.
selected_clinical_data = geo_select_clinical_features(clinical_data, 'Colon_and_Rectal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
csv_path = './preprocessed/Colon_and_Rectal_Cancer/trait_data/GSE46862.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there). Without this a
# run on a fresh checkout fails at the write below.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM1139299': [0, 68, 1], 'GSM1139300': [1, 58, 1], 'GSM1139301': [0, 66, 1], 'GSM1139302': [1, 56, 1], 'GSM1139303': [0, 55, 0], 'GSM1139304': [0, 50, 0], 'GSM1139305': [0, 37, 1], 'GSM1139306': [1, 59, 0], 'GSM1139307': [2, 46, 0], 'GSM1139308': [2, 68, 1], 'GSM1139309': [0, 49, 0], 'GSM1139310': [3, 62, 1], 'GSM1139311': [0, 65, 0], 'GSM1139312': [0, 63, 1], 'GSM1139313': [0, 41, 0], 'GSM1139314': [1, 33, 1], 'GSM1139315': [0, 49, 1], 'GSM1139316': [3, 50, 0], 'GSM1139317': [0, 73, 1], 'GSM1139318': [0, 63, 1], 'GSM1139319': [0, 70, 1], 'GSM1139320': [0, 69, 1], 'GSM1139321': [1, 39, 0], 'GSM1139322': [0, 58, 1], 'GSM1139323': [3, 41, 0], 'GSM1139324': [0, 43, 1], 'GSM1139325': [0, 48, 1], 'GSM1139326': [3, 72, 1], 'GSM1139327': [2, 76, 1], 'GSM1139328': [3, 68, 0], 'GSM1139329': [3, 62, 1], 'GSM1139330': [0, 50, 0], 'GSM1139331': [3, 40, 0], 'GSM1139332': [1, 62, 1], 'GSM1139333': [0, 54, 0], 'GSM1139334': [1, 58, 1], 'GSM1139335': [0, 45, 1], 'GSM1139336': [0, 73, 1], 'GSM1139337

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
# NOTE: depends on `matrix_file` assigned in the Step 1 cell, so that cell must have run first.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
# Only the index labels are shown; the Step 4 review of identifier type is based on them.
print(gene_data.index[:20])


Index(['7892501', '7892502', '7892503', '7892504', '7892505', '7892506',
       '7892507', '7892508', '7892509', '7892510', '7892511', '7892512',
       '7892513', '7892514', '7892515', '7892516', '7892517', '7892518',
       '7892519', '7892520'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 are numeric strings (e.g. '7892501') rather than
# gene symbols, so the annotation and mapping steps that follow are needed.
# NOTE(review): presumably platform probe / transcript-cluster IDs — confirm against the platform annotation.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
# NOTE: depends on `soft_file` assigned in the Step 1 cell.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
# NOTE(review): the recorded output for this cell shows only the header line, so
# the annotation column names cannot be checked from the notebook as saved —
# re-run and inspect the preview before relying on specific column keys.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Define the keys for the probe identifiers and gene symbols based on the preview of gene annotation
# NOTE(review): the saved annotation preview output is empty, so verify that the
# annotation table really has 'ID' and 'gene_assignment' columns.
identifier_key = 'ID'
gene_symbol_key = 'gene_assignment'

# 2. Get the dataframe storing the mapping between probe IDs and genes
probe_to_gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to the gene expression data
# NOTE(review): this rebinds `gene_data`, so the cell is not idempotent —
# re-running it without first re-running Step 3 would pass already-mapped data
# back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, probe_to_gene_mapping)

# Show a small preview of the mapped expression table.
print("Gene data after mapping preview:")
print(preview_df(gene_data))


Gene data after mapping preview:
{'GSM1139299': [6.0523481225098985, 6.536194308100727, 4.005890377094505, 6.497122513124082, 7.448991839793297], 'GSM1139300': [6.094628254960819, 6.470537775023492, 4.007533607891052, 6.25312500570329, 7.615162525039106], 'GSM1139301': [5.998134685363899, 6.281897021594786, 4.1542849236784924, 6.121602704345316, 6.936561004083798], 'GSM1139302': [6.153056304509211, 6.537054692069145, 3.939772676421036, 6.416172296181883, 7.728249519530726], 'GSM1139303': [6.092176142817982, 6.546724839892862, 3.9568058935281, 6.203303027406402, 7.771858159340782], 'GSM1139304': [6.102293494534644, 6.494618688520755, 3.9649143562408162, 6.186475356535319, 7.643937463849162], 'GSM1139305': [6.085009725594996, 6.532873321622068, 3.9916401328043953, 6.412716849749209, 7.552523009307263], 'GSM1139306': [6.187989092055679, 6.4570900184478575, 4.050270939673469, 6.157667453850062, 7.474962814988826], 'GSM1139307': [6.134121716068187, 6.526955872042526, 3.950217875433281, 6.27