In [1]:

import sys

# Put the project checkout on the import path (presumably where the `utils`
# package used by later cells lives — confirm).
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): absolute, machine-specific path — consider deriving it from an
# environment variable or a location relative to the notebook.
_project_root = '/home/techt/Desktop/a4s'
if _project_root not in sys.path:
    sys.path.append(_project_root)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the two input files for this cohort; the helper returns them as a
# (soft_file, matrix_file) pair.
cohort_dir = '/media/techt/DATA/GEO/Uterine_Carcinosarcoma/GSE68950'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Read the matrix file, selecting series-level background lines and
# per-sample clinical lines by their line prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Condense the clinical table into a dictionary (by the helper's name: the
# unique values found in each row — confirm against utils.preprocess).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both results in full, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"caArray_golub-00327: Sanger cell line Affymetrix gene expression project"
!Series_summary	"The microarray gene expression pattern was studied using 798 different cancer cell lines. The cancer cell lines are obtained from different centers. Annotation information were provided in the supplementary file."
!Series_overall_design	"golub-00327"
!Series_overall_design	"Assay Type: Gene Expression"
!Series_overall_design	"Provider: Affymetrix"
!Series_overall_design	"Array Designs: HT_HG-U133A"
!Series_overall_design	"Organism: Homo sapiens (ncbitax)"
!Series_overall_design	"Tissue Sites: leukemia, Urinary tract, Lung, BiliaryTract, Autonomic Ganglion, Thyroid gland, Stomach, Breast, Pancreas, Head and Neck, Lymphoma, Colorectal, Placenta, Liver, Brain, Bone, pleura, Skin, endometrium, Ovary, cervix, Oesophagus, Connective and Soft Tissue, Muscle, Kidney, Prostate, Adrenal Gland, Eye, Testis, Smooth Muscle Tissue, Vulva, Unknow"
!Series_overall_design	"M

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: the series is described as an Affymetrix gene
# expression project, so expression data is treated as present.
is_gene_available = True

# Row indices and converter functions all start out as None and are only
# replaced when the corresponding variable is actually found.
trait_row = None
age_row = None      # 'age' - not explicitly provided in the dataset
gender_row = None   # 'gender' - not explicitly provided in the dataset
convert_trait = None
convert_age = None
convert_gender = None

# Variable Availability and Data Type Conversion
# Step 2.1 Data Availability
# 'Uterine_Carcinosarcoma': look for a characteristics row whose values
# mention carcinosarcoma.
# NOTE(review): this scans a hard-coded snapshot of the disease-state values,
# not the live `sample_characteristics_dict` — confirm the snapshot is
# complete before trusting a "not found" result.
disease_state_snapshot = {
    1: [
        'disease state: L2 Acute Lymphoblastic Leukemia',
        'disease state: NS Acute Lymphoblastic Leukemia',
        'disease state: carcinoma',
        'disease state: adenocarcinoma',
        'disease state: transitional cell carcinoma',
        'disease state: clear cell renal cell carcinoma',
        'disease state: anaplastic carcinoma',
        'disease state: glioblastoma multiforme',
        'disease state: malignant melanoma',
        'disease state: rhabdomyosarcoma',
        'disease state: mucoepidermoid carcinoma',
        'disease state: squamous cell carcinoma',
        'disease state: renal cell carcinoma',
        'disease state: neuroblastoma',
        'disease state: Acute Lymphoblastic Leukemia',
        'disease state: M5 acute myeloid leukemia',
        'disease state: plasma cell myeloma',
        'disease state: L1 Acute T-Cell Lymphoblastic Leukemia',
        'disease state: astrocytoma',
        'disease state: B Acute Lymphoblastic Leukemia',
        'disease state: B cell lymphoma unspecified',
        'disease state: papillary carcinoma',
        'disease state: papillary transitional cell carcinoma',
        'disease state: Burkitt lymphoma',
        'disease state: hairy cell leukemia',
        'disease state: hyperplasia',
        'disease state: papillary ductal carcinoma',
        'disease state: blast phase chronic myeloid leukemia',
        'disease state: hepatocellular carcinoma',
        'disease state: Adult T-Cell Leukemia/Lymphoma',
    ],
}
for key, values in disease_state_snapshot.items():
    # The first row with any matching value wins; trait_row stays None when
    # no row matches.
    if any('carcinosarcoma' in v.lower() for v in values):
        trait_row = key
        break
# Step 2.3 Data Type Conversion
def convert_trait(value):
    """Map a sample-characteristics entry to a binary carcinosarcoma label.

    Args:
        value: Raw characteristics text, e.g. 'disease state: carcinoma'.

    Returns:
        1 if the text contains 'carcinosarcoma' (case-insensitive), 0 for any
        other string, and None for non-string input.
    """
    # Non-string input (e.g. None or NaN from a missing cell) has no `.lower()`;
    # report it as unknown instead of raising AttributeError.
    if not isinstance(value, str):
        return None
    return 1 if 'carcinosarcoma' in value.lower() else 0

# Save Metadata: pass the cohort id, the target JSON path, and the two
# availability flags (gene expression present, trait row found) to the helper.
save_cohort_info('GSE68950', './preprocessed/Uterine_Carcinosarcoma/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction: only runs when a trait row was identified.
if trait_row is not None:
    import os

    # The two literal None arguments sit after age_row and gender_row
    # (presumably the age/gender converter slots — confirm against the helper).
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Uterine_Carcinosarcoma', trait_row, convert_trait, age_row, None, gender_row, None)
    csv_path = './preprocessed/Uterine_Carcinosarcoma/trait_data/GSE68950.csv'
    # Create the output directory first: writing the CSV fails if the nested
    # `trait_data` directory does not exist yet (assuming a pandas DataFrame,
    # whose to_csv does not create parent directories).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
