In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific path; the helper presumably searches
# this directory for the two GEO files -- confirm against utils.preprocess.
cohort_dir = '/media/techt/DATA/GEO/Endometrioid_Cancer/GSE94523'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefixes name the matrix-file header lines of interest: series-level
# lines for the background text, sample-level lines for the clinical table.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# (later cells inspect this dictionary to decide which row, if any, encodes the trait)
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Tamoxifen-associated endometrial tumors expose differential enhancer activity for Estrogen Receptor alpha [Microarray Expression]"
!Series_summary	"Tamoxifen, a small molecule inhibitor that binds the Estrogen Receptor alpha (ERα), blocks breast cancer progression while increasing the risk for endometrial cancer. In this study, we assessed genome-wide ERα-chromatin interactions in surgical specimens of endometrial tumors from patients who were previously treated for breast cancer with tamoxifen, and endometrial tumors from patients who were treated without tamoxifen. We compared ERα and signal at differential ERα sites in endometrial tumors of nine patients who received tamoxifen with endometrial tumors with six patients who never used tamoxifen. In addition, we performed H3K27ac (a marker for activity) ChIPs on the above mentioned endometrial tumors, and assed this signal at differential ERα sites. Compared to endometrial tumors of non-users, tam

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Step 1: Gene expression availability.
# The series title carries the "[Microarray Expression]" tag, so expression
# data is present for this cohort.
is_gene_available = True

# Defaults for the clinical rows and their converters; they stay None unless
# a matching row is identified below.
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Step 2.1: Check data availability in the sample characteristics dictionary.
# Sample characteristics dictionary as reported by the output of STEP 1.
sample_characteristics = {0: ['tissue: endometrioid adenocarcinoma']}

# 'Endometrioid_Cancer': row 0 only qualifies as the trait row when it holds
# more than one distinct value; a constant row cannot separate samples.
if len(set(sample_characteristics[0])) > 1:
    trait_row = 0
else:
    trait_row = None

# 'age' and 'gender' do not appear in the sample characteristics dictionary,
# so age_row and gender_row keep their None defaults.
# Step 2.3: Data Type Conversion Functions
def convert_trait(value):
    """Map a sample-characteristics value to a binary trait label.

    Args:
        value: Raw cell value, normally a string such as
            'tissue: endometrioid adenocarcinoma'.

    Returns:
        1 if the text contains "endometrioid adenocarcinoma" (matched
        case-insensitively), 0 for any other text, and None when the value
        is not text-like (e.g. None, NaN, bytes).
    """
    try:
        return 1 if "endometrioid adenocarcinoma" in value.lower() else 0
    except (AttributeError, TypeError):
        # AttributeError: value has no .lower() (None, float NaN, ...).
        # TypeError: .lower() returned something a str cannot be searched in
        # (e.g. bytes). Anything else, such as KeyboardInterrupt, propagates.
        return None

# Age and gender rows were not found for this cohort, so no converters apply.
convert_age = convert_gender = None  # Not applicable as the data isn't available

# Step 3: Save cohort information
# The final argument tells the helper whether a trait row was identified.
save_cohort_info('GSE94523', './preprocessed/Endometrioid_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Step 4: Clinical Feature Extraction (only if trait_row is not None)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Endometrioid_Cancer', trait_row, convert_trait,
        age_row, convert_age, gender_row, convert_gender,
    )
    csv_path = './preprocessed/Endometrioid_Cancer/trait_data/GSE94523.csv'
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
