In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Rectal_Cancer/GSE123390'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Locally advanced rectal cancer transcriptomic-based secretome analysis according to neoadjuvant chemoradiotherapy response"
!Series_summary	"Most patients with locally advanced rectal cancer (LARC) present incomplete pathological response (pIR) to neoadjuvant chemoradiotherapy (nCRT). Despite the efforts to predict treatment response using tumor-molecular features, as differentially expressed genes, no molecule has proved to be a strong biomarker. The tumor secretome analysis is a promising strategy for biomarkers identification, which can be assessed using transcriptomic data. Here, we performed transcriptomic-based secretome analysis to select potentially secreted proteins using an in silico approach. The tumor expression profile of 28 LARC biopsies carefully selected and collected before nCRT was compared with normal rectal tissues (NT). The expression profile showed no significant differences between cases with complete (pCR) and incomplete re

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = True
trait_row = 1  # disease: rectal cancer / normal
age_row = None  # No age data available
gender_row = None  # No gender data available

def convert_trait(value):
    val = value.split(":")[1].strip().lower()
    if val == 'rectal cancer':
        return 1
    elif val == 'normal':
        return 0
    else:
        return None

def convert_age(value):
    return None  # No age data available

def convert_gender(value):
    return None  # No gender data available

save_cohort_info('GSE123390', './preprocessed/Rectal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
selected_clinical_data = geo_select_clinical_features(clinical_data, 'Rectal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
csv_path = './preprocessed/Rectal_Cancer/trait_data/GSE123390.csv'
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM3502511': [1], 'GSM3502512': [1], 'GSM3502513': [1], 'GSM3502514': [1], 'GSM3502515': [1], 'GSM3502516': [1], 'GSM3502517': [1], 'GSM3502518': [1], 'GSM3502519': [1], 'GSM3502520': [1], 'GSM3502521': [1], 'GSM3502522': [1], 'GSM3502523': [1], 'GSM3502524': [1], 'GSM3502525': [1], 'GSM3502526': [1], 'GSM3502527': [1], 'GSM3502528': [1], 'GSM3502529': [1], 'GSM3502530': [1], 'GSM3502531': [1], 'GSM3502532': [1], 'GSM3502533': [1], 'GSM3502534': [1], 'GSM3502535': [1], 'GSM3502536': [1], 'GSM3502537': [1], 'GSM3502538': [1], 'GSM3502539': [0], 'GSM3502540': [0], 'GSM3502541': [0], 'GSM3502542': [0], 'GSM3502543': [0]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])


Index(['2824546_st', '2824549_st', '2824551_st', '2824554_st', '2827992_st',
       '2827995_st', '2827996_st', '2828010_st', '2828012_st', '2835442_st',
       '2835447_st', '2835453_st', '2835456_st', '2835459_st', '2835461_st',
       '2839509_st', '2839511_st', '2839513_st', '2839515_st', '2839517_st'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'probeset_id': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['11869', '29554', '69091', '160446', '317811'], 'stop': ['14409', '31109', '70008', '161525', '328581'], 'total_probes': [49.0, 60.0, 30.0, 30.0, 191.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102', 'ENST00000408384 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000408384 // MIR1302-10 // microRNA 1302-10 // --- // 100422834 /// ENST0000040838

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Define the appropriate keys for identifiers and gene symbols.
identifier_key = 'ID'
gene_symbol_key = 'gene_assignment'

# Use regex to isolate the gene symbols from the 'gene_assignment' field
def extract_gene_symbol(gene_assignment):
    if isinstance(gene_assignment, str):
        # Extract the first gene symbol before the double slashes and any potential space
        match = re.search(r'([^\s//]+)', gene_assignment)
        return match.group(1) if match else None
    return None

# 2. Get the dataframe storing the mapping between probe IDs and genes.
gene_annotation[gene_symbol_key] = gene_annotation[gene_symbol_key].apply(extract_gene_symbol)
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function, resulting in the gene expression dataframe "gene_data".
gene_data = apply_gene_mapping(gene_data, gene_mapping)

# Optional: Print first few rows to ensure mapping worked properly
print("Mapped Gene Data Preview:")
print(preview_df(gene_data))


Mapped Gene Data Preview:
{'GSM3502511': [4.375828271601627, 4.51962726413055, 6.3105849917546895, 6.15334008396408, 3.88973674643125], 'GSM3502512': [4.39490760617467, 4.64445371075288, 5.835792986869495, 5.98354770965886, 3.89536323384258], 'GSM3502513': [4.381502835459332, 4.54631625129603, 5.927686237399721, 6.13114719255903, 3.8653462076735], 'GSM3502514': [4.369524158408496, 4.264201093806, 5.93110779685552, 6.0992786731946, 3.89199737326514], 'GSM3502515': [4.375495498888517, 4.26007735150604, 6.02233613119528, 6.36680936399693, 3.88139886999176], 'GSM3502516': [4.376097611536612, 4.07766659541613, 6.0331777326165, 6.1369102929632, 3.72229447466592], 'GSM3502517': [4.381671270256264, 4.30337046079875, 5.686709944301896, 6.41424330892923, 3.9226558466897], 'GSM3502518': [4.3581033713220325, 4.16242810636825, 5.919384587867475, 6.24506855605221, 3.95204952259988], 'GSM3502519': [4.375892087732906, 4.64946996552621, 5.73527695300462, 6.15524935241506, 3.79113744275381], 'GSM3502520

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Rectal_Cancer/gene_data/GSE123390.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Rectal_Cancer')

# 4. Save the cohort information.
save_cohort_info('GSE123390', './preprocessed/Rectal_Cancer/cohort_info.json', True, True, trait_biased, unbiased_merged_data)

# 5. If the trait is not severely biased, save the merged data to a csv file.
if not trait_biased:
    csv_path = './preprocessed/Rectal_Cancer/GSE123390.csv'
    unbiased_merged_data.to_csv(csv_path)


No gene data in the dataframe
