In [1]:

import sys
# Extend the module search path with a hard-coded local directory. Presumably this is the
# project root that provides `utils.preprocess` (imported in the next cell) — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path; it will not resolve elsewhere.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file that belong to this cohort directory
cohort_dir = '/media/techt/DATA/GEO/Rectal_Cancer/GSE119409'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background information and the sample characteristics out of the matrix file,
#    selecting lines by the prefixes listed below
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the sample characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Expression data from rectal cancer"
!Series_summary	"A supervised method (Significance Analysis of Microarrays -SAM-) was used to find statistically significance (adjusted p<0.05) in differentially expressed genes between responding and non-responding groups."
!Series_overall_design	"To further investigate the correlation between gene expression and response to neoadjuvant radiotherapy, mRNA expression in pre-therapy biopsies was profiled into responding and non-responding groups."
Sample Characteristics Dictionary:
{0: ['disease state: rectal cancer'], 1: ['tissue: rectal cancer biopsy'], 2: ['sensitivity: sensitive', 'sensitivity: unknown', 'sensitivity: resistant'], 3: ['patient age: 52', 'patient age: 57', 'patient age: 65', 'patient age: 61', 'patient age: 62', 'patient age: 58', 'patient age: 63', 'patient age: 70', 'patient age: 74', 'patient age: 72', 'patient age: 51', 'patient age: 45', 'patient age: 77', 'patient age: 64', 'patient age:

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability is a property of the series, not of the individual samples.
# The Step 1 background describes this series as "Expression data from rectal cancer" with
# mRNA expression profiled in pre-therapy biopsies, so gene expression data is available.
# (Searching the per-sample characteristics for the word "expression" never matches for this
# cohort and would wrongly record it as having no gene data.)
is_gene_available = True
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Sample characteristics based on the output of STEP 1
sample_characteristics = {
    0: ['disease state: rectal cancer'],
    1: ['tissue: rectal cancer biopsy'],
    2: ['sensitivity: sensitive', 'sensitivity: unknown', 'sensitivity: resistant'],
    3: ['patient age: 52', 'patient age: 57', 'patient age: 65', 'patient age: 61', 'patient age: 62', 'patient age: 58', 'patient age: 63', 'patient age: 70', 'patient age: 74', 'patient age: 72', 'patient age: 51', 'patient age: 45', 'patient age: 77', 'patient age: 64', 'patient age: 66', 'patient age: 43', 'patient age: 39', 'patient age: 71', 'patient age: 35', 'patient age: 42', 'patient age: 56', 'patient age: 40', 'patient age: 67', 'patient age: 47', 'patient age: 69', 'patient age: 50', 'patient age: 49', 'patient age: 44', 'patient age: 37', 'patient age: unknown'],
    4: ['tumor stage: T3N0M0', 'tumor stage: T4N2M0', 'tumor stage: T3N2M0', 'tumor stage: T3N1M0', 'tumor stage: T3N2MO', 'tumor stage: T3N0MO', 'tumor stage: T2N1MO', 'tumor stage: T2N1M0', 'tumor stage: T2N0M0', 'tumor stage: unknown']
}

# Determine the availability of the variables 'Rectal_Cancer', 'age', and 'gender'
# Row 2 holds the radiotherapy sensitivity label, which serves as the trait for this cohort.
if any("sensitivity:" in item for item in sample_characteristics.get(2, [])):
    trait_row = 2  # Sensitivity is related to rectal cancer response

# Row 3 holds the patient age.
if any("age:" in item for item in sample_characteristics.get(3, [])):
    age_row = 3

# Gender data is not available based on the given sample characteristics

# Define data conversion functions
def convert_trait(value):
    """Map a 'sensitivity: ...' characteristic string to a binary trait label.

    Args:
        value: Raw cell content from the sample characteristics row, e.g.
            'sensitivity: sensitive'. May be a non-string (None / NaN) when a
            sample has no entry.

    Returns:
        1 for sensitive, 0 for resistant, and None for anything else
        (including 'sensitivity: unknown' and non-string input).
    """
    # Missing cells can arrive as None or float NaN; the substring checks below
    # would raise TypeError on them, so treat them as "unknown".
    if not isinstance(value, str):
        return None
    if 'sensitivity: sensitive' in value:
        return 1
    if 'sensitivity: resistant' in value:
        return 0
    return None

def convert_age(value):
    """Extract the integer age from a 'patient age: NN' characteristic string.

    Args:
        value: Raw cell content from the sample characteristics row, e.g.
            'patient age: 52'. May be a non-string (None / NaN) when a sample
            has no entry.

    Returns:
        The age as an int, or None when the input is not a string, has no
        ':' separator, or the part after the separator is not an integer
        (e.g. 'patient age: unknown').
    """
    # Missing cells can arrive as None or float NaN, which have no .split().
    if not isinstance(value, str):
        return None
    try:
        # Split once so only the first ':' is treated as the key/value separator.
        return int(value.split(':', 1)[1].strip())
    except (ValueError, IndexError):
        return None

def convert_gender(value):
    """Placeholder gender converter.

    This cohort's sample characteristics carry no gender field, so every
    input, whatever it is, maps to None.
    """
    return None

# Save cohort information
import os  # local import: only needed to prepare the output folders below

cohort_info_path = './preprocessed/Rectal_Cancer/cohort_info.json'
# Create the output folder up front so a fresh run does not depend on it already existing.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE119409', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical Feature Extraction
# Only attempted when a trait row was identified in the sample characteristics.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Rectal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Rectal_Cancer/trait_data/GSE119409.csv'
    # DataFrame.to_csv does not create missing parent directories, so make them first.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3374350': [1, 52], 'GSM3374351': [None, 57], 'GSM3374352': [1, 65], 'GSM3374353': [0, 61], 'GSM3374354': [0, 62], 'GSM3374355': [0, 58], 'GSM3374356': [1, 63], 'GSM3374357': [0, 70], 'GSM3374358': [0, 61], 'GSM3374359': [0, 74], 'GSM3374360': [0, 72], 'GSM3374361': [0, 51], 'GSM3374362': [1, 70], 'GSM3374363': [0, 45], 'GSM3374364': [0, 77], 'GSM3374365': [0, 64], 'GSM3374366': [1, 66], 'GSM3374367': [0, 43], 'GSM3374368': [1, 65], 'GSM3374369': [1, 51], 'GSM3374370': [1, 66], 'GSM3374371': [0, 52], 'GSM3374372': [0, 39], 'GSM3374373': [0, 72], 'GSM3374374': [0, 71], 'GSM3374375': [0, 35], 'GSM3374376': [0, 61], 'GSM3374377': [0, 45], 'GSM3374378': [0, 42], 'GSM3374379': [0, 56], 'GSM3374380': [0, 40], 'GSM3374381': [0, 62], 'GSM3374382': [0, 67], 'GSM3374383': [None, 63], 'GSM3374384': [0, 70], 'GSM3374385': [None, 63], 'GSM3374386': [1, 42], 'GSM3374387': [0, 57], 'GSM3374388': [0, 40], 'GSM3374389': [None, 47], 'GSM3374390': [None, 69], 'GSM3374391': [None, 69], 'GSM3374392': [