In [1]:

import sys
# Extend the module search path at runtime.
# NOTE(review): this is an absolute, machine-specific path — presumably the project
# root that provides the `utils` package imported in the next cell; confirm before
# running the notebook on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1 script: locate the cohort's files, read background and clinical rows from
# the matrix file, and print both for manual inspection.
# NOTE(review): geo_get_relevant_filepaths, get_background_and_clinical_data and
# get_unique_values_by_row are not defined in this file; presumably they come from
# the star import of utils.preprocess — confirm their contracts there.

# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Uterine_Corpus_Endometrial_Carcinoma/GSE16680'
# soft_file is bound here but not used by any code visible in this section.
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# Line prefixes passed to the reader for the background text ...
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
# ... and for the per-sample clinical rows.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# The later cell indexes its hand-written copy by integer row number; presumably this
# dictionary has the same row -> unique-values layout — TODO confirm.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"arrayCGH profiles of endometrial cancer with and without prior prolonged tamoxifen treatment for primary breast cancer"
!Series_summary	"Full title: comparison of the genomic (arrayCGH) profiles of endometrial cancer with and without prior prolonged tamoxifen treatment for primary breast cancer"
!Series_summary	""
!Series_summary	"Purpose: Tamoxifen has been a very effective treatment for breast cancer for several decades, however, at the same time increases the risk of endometrial cancer, especially after prolonged exposure. In addition, tamoxifen has been associated with a higher proportion of unfavorable uterine tumor subtypes (carcinosarcomas and serous adenocarcinomas) with worse survival. We investigated whether endometrial tumors, which developed after prolonged tamoxifen treatment for breast cancer, are genetically different from endometrial tumors without preceding tamoxifen exposure."
!Series_summary	""
!Series_summary	"Experimental desi

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene-expression availability: the series background describes arrayCGH profiles,
# so this cohort is treated as having no gene expression data.
is_gene_available = False

# Placeholders for the converters; the function definitions later in this cell
# rebind these names.
convert_trait = convert_age = convert_gender = None

# Unique values per sample-characteristics row, written out by hand for this cohort.
sample_characteristics = {
    0: ['gender: female'],
    1: ['tumor: endometrial'],
    2: [
        'histology: Endometrioid',
        'histology: Serous adenocarcinoma',
        'histology: carcinosarcoma',
    ],
}


def _informative_row(row_index):
    """Return ``row_index`` if that row has more than one distinct value, else None.

    A row with a single distinct value is constant across samples, so it is
    reported as unavailable.
    """
    distinct_values = set(sample_characteristics[row_index])
    if len(distinct_values) > 1:
        return row_index
    return None


# 'Uterine_Corpus_Endometrial_Carcinoma' variable (row 1)
trait_row = _informative_row(1)

# 'age' variable: no row in the characteristics mentions age
age_row = None

# 'gender' variable (row 0)
gender_row = _informative_row(0)
# Define conversion functions for each variable
def convert_trait(value: str):
    """Map a sample-characteristics cell to the binary trait label.

    Args:
        value: Raw cell text, e.g. ``'tumor: endometrial'``. Non-string cells
            (such as ``None`` or a float NaN for a missing entry) are accepted.

    Returns:
        ``1`` when the text contains ``'tumor: endometrial'``; ``None`` for any
        other text and for non-string input.
    """
    # A missing cell is not a string; the substring test below would raise
    # TypeError on it, so treat it as "unknown" instead.
    if not isinstance(value, str):
        return None
    if 'tumor: endometrial' in value:
        return 1
    return None

def convert_age(value: str):
    """Parse an integer age from a ``'<label>: <number>'`` characteristics cell.

    Args:
        value: Raw cell text, e.g. ``'age: 63'``. Non-string cells (such as
            ``None`` or a float NaN for a missing entry) are accepted.

    Returns:
        The integer found after the first ``':'``, or ``None`` when the input is
        not a string, has no ``':'``, or the text after it is not an integer.
    """
    # .split() only exists on strings; a missing cell would raise AttributeError,
    # which the handler below does not cover.
    if not isinstance(value, str):
        return None
    try:
        return int(value.split(":")[1].strip())
    except (ValueError, IndexError):
        # IndexError: no ':' present; ValueError: text after ':' is not an integer.
        return None

def convert_gender(value: str):
    """Map a ``'<label>: <gender>'`` characteristics cell to a binary code.

    Args:
        value: Raw cell text, e.g. ``'gender: female'``. Non-string cells (such
            as ``None`` or a float NaN for a missing entry) are accepted.

    Returns:
        ``0`` for female, ``1`` for male (case-insensitive, surrounding
        whitespace ignored); ``None`` for any other text, for text without a
        ``':'``, and for non-string input.
    """
    # Missing cells are not strings and have no .split().
    if not isinstance(value, str):
        return None
    fields = value.split(":")
    # No ':' means there is no value field to read; return None rather than
    # raising IndexError, matching how convert_age treats malformed cells.
    if len(fields) < 2:
        return None
    gender = fields[1].strip().lower()
    if gender == 'female':
        return 0
    elif gender == 'male':
        return 1
    return None

# Save cohort information
# Arguments: cohort id, output JSON path, the gene-data flag, and whether a trait
# row was identified.
# NOTE(review): save_cohort_info is defined outside this file — confirm how it
# interprets the two booleans and whether it creates the output directory.
save_cohort_info('GSE16680', './preprocessed/Uterine_Corpus_Endometrial_Carcinoma/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction
# Runs only when a trait row was identified. With the values assigned earlier in
# this cell the trait row holds a single distinct value, so trait_row is None and
# this branch is skipped.
if trait_row is not None:
    # NOTE(review): geo_select_clinical_features and preview_df are defined outside
    # this file; the argument order here is (data, trait name, then row/converter
    # pairs for trait, age and gender).
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Uterine_Corpus_Endometrial_Carcinoma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    # NOTE(review): relative path; presumably the trait_data directory must already
    # exist, since to_csv is not known to create parent directories — confirm.
    csv_path = './preprocessed/Uterine_Corpus_Endometrial_Carcinoma/trait_data/GSE16680.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
