In [1]:

import sys
# Extend the module search path before any project imports are attempted.
# NOTE(review): this is an absolute, machine-specific path; presumably it is the
# project root that holds the `utils` package imported in the next cell - TODO confirm.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the SOFT file and the matrix file that belong to this cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Rectal_Cancer/GSE139255'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Read the matrix file once, passing the line prefixes that identify the
# series-level background fields and the per-sample clinical fields.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print each heading followed by its payload on the next line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"A Nine-gene Signature for Predicting the Response to Preoperative Chemoradiotherapy in Patients with Locally Advanced Rectal Cancer"
!Series_summary	"A total of 156 LARC patients (training cohort n = 60; validation cohort n = 96) were included in the study who underwent surgical resection post PCRT. By using univariate and multivariate logistic regression, we identified a 9-gene signature that differentiated between responders and non-responders. ; The novel 9-gene signature is robust in predicting response to PCRT in LARC patients. Tailored treatment approaches in good and poor responders to PCRT may improve the oncologic outcomes of patients with LARC."
!Series_overall_design	"The study included 156 randomly selected rectal cancer patients (n = 156) divided into a training cohort (n = 60) and a validation cohort (n = 96). Surgical resection was performed 6–8 weeks after completion of PCRT and included local excision and radical resection perform

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The dataset includes gene expression analysis, so gene data is available.
# (Assigned once; the earlier False placeholder was overwritten immediately.)
is_gene_available = True

# Row keys into the sample characteristics; None means "not available".
# The converter placeholders are no longer pre-set to None here because the
# converter functions are defined further down in this cell.
trait_row = age_row = gender_row = None

# Hard-coded sample characteristics for this series. This literal replaces the
# dictionary that Step 1 computed under the same name.
sample_characteristics_dict = {
    0: ['histology: Non-Response', 'histology: Good-Response']
}

# Row 0 can serve as the trait only if it carries more than one distinct label.
# `partition` yields an empty separator for entries that lack ': ', so such
# entries are skipped instead of raising IndexError.
distinct_trait_labels = {
    label
    for _, separator, label in (
        entry.partition(': ') for entry in sample_characteristics_dict[0]
    )
    if separator
}
if len(distinct_trait_labels) > 1:
    trait_row = 0

# Define conversion functions
def convert_trait(value):
    """Convert a 'histology: <label>' characteristic into a binary trait value.

    Args:
        value: Raw sample-characteristic entry, e.g. 'histology: Non-Response'.

    Returns:
        1 for 'Non-Response', 0 for 'Good-Response', and None when the entry is
        not a string, has no ':' separator, or carries any other label.
    """
    # Missing cells (None, NaN) are not strings and cannot be parsed.
    if not isinstance(value, str):
        return None

    _, separator, label = value.partition(':')
    if not separator:
        return None

    # Explicit mapping: an unrecognised label must not be silently counted as
    # a good responder, so anything outside this table maps to None.
    label_to_trait = {'Non-Response': 1, 'Good-Response': 0}
    return label_to_trait.get(label.strip())

# Age and gender keys are not available in the provided dictionary
def convert_age(value):
    """Placeholder age converter that always yields None.

    No age row was identified for this cohort (age_row is None), so there is
    nothing to convert; the argument is ignored.
    """
    return None

def convert_gender(value):
    """Placeholder gender converter that always yields None.

    No gender row was identified for this cohort (gender_row is None), so
    there is nothing to convert; the argument is ignored.
    """
    return None

# Save cohort information
save_cohort_info('GSE139255', './preprocessed/Rectal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Rectal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Rectal_Cancer/trait_data/GSE139255.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4134743': [1], 'GSM4134744': [1], 'GSM4134745': [0], 'GSM4134746': [1], 'GSM4134747': [1], 'GSM4134748': [0], 'GSM4134749': [1], 'GSM4134750': [1], 'GSM4134751': [1], 'GSM4134752': [1], 'GSM4134753': [1], 'GSM4134754': [1], 'GSM4134755': [0], 'GSM4134756': [1], 'GSM4134757': [0], 'GSM4134758': [0], 'GSM4134759': [1], 'GSM4134760': [0], 'GSM4134761': [0], 'GSM4134762': [0], 'GSM4134763': [1], 'GSM4134764': [1], 'GSM4134765': [1], 'GSM4134766': [1], 'GSM4134767': [1], 'GSM4134768': [1], 'GSM4134769': [1], 'GSM4134770': [1], 'GSM4134771': [1], 'GSM4134772': [1], 'GSM4134773': [0], 'GSM4134774': [0], 'GSM4134775': [0], 'GSM4134776': [0], 'GSM4134777': [0], 'GSM4134778': [1], 'GSM4134779': [1], 'GSM4134780': [1], 'GSM4134781': [0], 'GSM4134782': [0], 'GSM4134783': [1], 'GSM4134784': [1], 'GSM4134785': [1], 'GSM4134786': [0], 'GSM4134787': [0], 'GSM4134788': [0], 'GSM4134789': [0], 'GSM4134790': [0], 'GSM4134791': [0], 'GSM4134792': [1], 'GSM4134793': [0], 'GSM4134794': [1], 'GSM4134795

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
#    `matrix_file` is the path resolved in Step 1.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
#    These identifiers are what Step 4 reviews to decide whether gene mapping is needed.
print(gene_data.index[:20])


Index(['ABL1', 'ACAD9', 'ACVR1B', 'ACVR1C', 'ACVR2A', 'AGK', 'AKT1', 'AKT2',
       'AKT3', 'ALK', 'ALKBH2', 'ALKBH3', 'AMER1', 'AMH', 'AMMECR1L', 'ANGPT1',
       'APC', 'APH1B', 'AR', 'ARID1A'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 (e.g. 'ABL1', 'ACAD9', 'AKT1') appear to be
# gene symbols already rather than probe identifiers, so no mapping is requested.
requires_gene_mapping = False


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
# NOTE(review): this step is titled "Conditional", yet the annotation is loaded
# regardless of `requires_gene_mapping`; no later visible cell reads
# `gene_annotation` - confirm whether the load is still wanted.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


# Since requires_gene_mapping is False, the content of STEP6 should be skipped entirely.
# (This is why the notebook goes straight from Step 5 to Step 7.)


Gene annotation preview:
{'ID': ['ABL1', 'ACVR1B', 'ACVR1C', 'ACVR2A', 'AKT1'], 'GB_ACC': ['NM_005157.3', 'NM_004302.3', 'NM_145259.2', 'NM_001616.3', 'NM_005163.2'], 'Probe_class': ['Endogenous', 'Endogenous', 'Endogenous', 'Endogenous', 'Endogenous'], 'SEQUENCE': ['CTGCGTGAGCTATGTGGATTCCATCCAGCAAATGAGGAACAAGTTTGCCTTCCGAGAGGCCATCAACAAACTGGAGAATAATCTCCGGGAGCTTCAGATC', 'CGGGAGAGACTCGCTCACTCCCATGTTGGGTTTGAGACAGACACCTTTTCTATTTACCTCCTAATGGCATGGAGACTCTGAGAGCGAATTGTGTGGAGAA', 'GGAATTTTGCCACCATGTGACTTATTGGGGCAGAGAAAACTCAGGGTTGTCTTTGAGTCTGCACAAAAGCACCAGGGAACCTGCTTAGCAAATCGTCTGA', 'ACATGGCTCCAGAGGTATTAGAGGGTGCTATAAACTTCCAAAGGGATGCATTTTTGAGGATAGATATGTATGCCATGGGATTAGTCCTATGGGAACTGGC', 'TTCTTTGCCGGTATCGTGTGGCAGCACGTGTACGAGAAGAAGCTCAGCCCACCCTTCAAGCCCCAGGTCACGTCGGAGACTGACACCAGGTATTTTGATG'], 'SPOT_ID': [nan, nan, nan, nan, nan]}


### Step 7: Data Normalization and Merging

In [7]:
# Normalize the gene symbols in the index of the gene data, then write the
# normalized table to the gene-data CSV.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Rectal_Cancer/gene_data/GSE139255.csv'
normalized_gene_data.to_csv(gene_csv_path)

# Merge the clinical and genetic data. `selected_clinical_data` comes from
# Step 2, where it is created only if a trait row was identified.
merged_data = geo_merge_clinical_genetic_data(
    selected_clinical_data,
    normalized_gene_data,
)

# Judge whether the trait (and any demographic attributes) are severely
# biased; the helper returns the verdict together with the filtered table.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(
    merged_data,
    'Rectal_Cancer',
)

# The cohort information is saved unconditionally, whatever the bias verdict.
save_cohort_info(
    'GSE139255',
    './preprocessed/Rectal_Cancer/cohort_info.json',
    True,
    True,
    trait_biased,
    merged_data,
)

# Only the merged data is conditional: it is written to CSV when the trait is
# not severely biased.
if not trait_biased:
    csv_path = './preprocessed/Rectal_Cancer/GSE139255.csv'
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Rectal_Cancer', the least common label is '1.0' with 67 occurrences. This represents 42.95% of the dataset.
The distribution of the feature 'Rectal_Cancer' in this dataset is fine.

