In [1]:

import sys

# Extend the module search path with the project checkout (presumably where the
# `utils` package imported in the next cell lives -- TODO confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
PROJECT_ROOT = '/home/techt/Desktop/a4s'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Rectal_Cancer/GSE145037'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull series-level background lines and per-sample characteristics lines
#    out of the matrix file, selected by their line prefixes
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as the unique values found in each row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the characteristics dictionary, each under
#    its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Hypermethylation and downregulation of UTP6 are associated with stemness properties, chemoradiotherapy resistance and prognosis in rectal cancer: A co-expression network analysis"
!Series_summary	"To measure global gene expression in primary locally advanced rectal cancer patients who have undergone CRT and screen valuable biomarkers to predict the effects of CRT.Samples fromprimary locally advanced rectal cancer patients were collected. The effects of chemoradiotherapy were evaluated."
!Series_overall_design	"All patients underwent standard CRT  after signing the chemoradiotherapy agreement; subsequently, they were evaluated in accordance with the AJCC tumor regression grade (TRG).Each samplewas collected before CRT. Each sample was stored in liquid nitrogen until total RNA extraction."
Sample Characteristics Dictionary:
{0: ['tissue: primary rectalcancer'], 1: ['Sex: Male', 'Sex: Female'], 2: ['age: 34', 'age: 66', 'age: 69', 'age: 65', 'age: 72

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: the series summary describes measuring global
# gene expression, so expression data is treated as available.
is_gene_available = True

# Converter placeholders; replaced by the function definitions further down.
convert_trait = convert_age = convert_gender = None


def _row_if_varied(row_index, observed_values):
    """Return `row_index` if `observed_values` has more than one distinct value, else None."""
    if len(set(observed_values)) > 1:
        return row_index
    return None


# 2.1 Data Availability
# A characteristics row is only usable as a variable when it actually varies
# across samples.

# Variable 'Rectal_Cancer' -- row 0 holds the single value
# 'tissue: primary rectalcancer', so no trait row is selected.
trait_row = _row_if_varied(0, ['primary rectalcancer'])

# Variable 'age' -- row 2: ['age: 34', 'age: 66', ..., 'age: 73']
age_row = _row_if_varied(
    2,
    [
        '34', '66', '69', '65', '72', '64', '53', '60', '44', '58', '41',
        '52', '48', '49', '61', '63', '75', '46', '59', '70', '68', '73',
    ],
)

# Variable 'gender' -- row 1: ['Sex: Male', 'Sex: Female']
gender_row = _row_if_varied(1, ['Male', 'Female'])

# 2.3 Data Type Conversion
# Function to convert 'Rectal_Cancer'
def convert_trait(value):
    """Map any sample-characteristics value to the trait label 1.

    The argument is ignored: every sample in this cohort is annotated as
    primary rectal cancer, so the trait is constant.
    """
    rectal_cancer_label = 1
    return rectal_cancer_label

# Function to convert 'age'
def convert_age(value):
    """Parse the integer age out of a 'key: value' characteristics string.

    Args:
        value: A string such as 'age: 34'. Other types are tolerated.

    Returns:
        The text after the first colon converted to int, or None when the
        input is not a string, contains no colon, or the text after the colon
        is not an integer literal.
    """
    try:
        return int(value.split(':')[1].strip())
    # Only the failures this parsing can produce are treated as "unknown":
    #   AttributeError / TypeError -> value is not a str (e.g. None, NaN, bytes)
    #   IndexError                 -> no ':' separator present
    #   ValueError                 -> text after ':' is not an integer
    # A bare `except:` would also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

# Function to convert 'gender'
def convert_gender(value):
    """Encode a 'key: value' sex annotation as a binary label.

    Args:
        value: A string such as 'Sex: Male' or 'Sex: Female'. Other types are
            tolerated.

    Returns:
        1 for 'Male', 0 for 'Female', or None when the input is not a string,
        contains no colon, or names any other label.
    """
    gender_map = {'Male': 1, 'Female': 0}
    try:
        label = value.split(':')[1].strip()
    # Non-string input (AttributeError / TypeError) or a missing ':' separator
    # (IndexError) means the label is unknown. A bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError):
        return None
    # Unrecognised labels map to None, as before.
    return gender_map.get(label)

# Save cohort metadata: cohort id, output JSON path, gene-availability flag,
# and whether a trait row was identified
save_cohort_info(
    'GSE145037',
    './preprocessed/Rectal_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted and written out only when a trait row exists
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Rectal_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Rectal_Cancer/trait_data/GSE145037.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
