In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Endometrioid_Cancer/GSE108838'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Parse the matrix file: lines with the background prefixes feed the series
#    description, lines with the clinical prefixes feed the per-sample table
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the characteristics dictionary, each under its heading
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"A unique pair of microarray datasets for microRNA profiling: data with careful design (Dataset A)"
!Series_summary	"We set out to demonstrate the logistic feasibility of careful study design in microarray studies and the level of scientific benefits it can provide, in comparison with post-hoc data adjustment such as normalization, for preventing confounding handling effects and improving the accuracy and reproducibility of disease-relevant biomarker detection. Towards this end, we conducted a study of microRNA expression using endometroid endometrial cancer tumors (n=96) and serous ovarian cancer tumors (n=96) that were all primary, untreated, and collected in 2000-2012 at Memorial Sloan Kettering Cancer Center. The same set of tumor tissue samples were profiled twice using the Agilent microRNA microarrays with different study designs."
!Series_overall_design	"In the first study, arrays were assigned to tumor samples using blocked randomization an

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability flags and clinical row indices for this cohort. Everything starts
# at its "not available" value and is overwritten below only when evidence is found.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check if the dataset contains gene expression data.
# The search has to run over the series summary text loaded in Step 1; testing the
# literal label "!Series_summary" (as done previously) can never match.
# NOTE(review): assumes background_info renders as one '!Series_*' entry per line,
# as in the printed output of Step 1 - confirm against get_background_and_clinical_data.
series_summary_text = "\n".join(
    line for line in str(background_info).splitlines()
    if line.startswith('!Series_summary')
)
if 'gene expression' in series_summary_text.lower():
    is_gene_available = True

# Check availability of 'Endometrioid_Cancer' data: row 0 is usable as the trait
# only when it exists and holds more than one distinct value.
if 0 in sample_characteristics_dict and len(sample_characteristics_dict[0]) > 1:
    trait_row = 0

# 'age' is not explicitly listed in the sample characteristics
age_row = None

# 'gender' is not explicitly listed in the sample characteristics
gender_row = None

# Convert function for 'Endometrioid_Cancer'
def convert_trait(value):
    """Map a sample-characteristics cell to the binary trait label.

    Args:
        value: Raw cell content. Anything that is not a string (for example
            None or a float NaN from a missing cell) is treated as unknown.

    Returns:
        1 if the text mentions 'endometrioid endometrial cancer',
        0 if it mentions 'serous ovarian cancer',
        None when neither phrase is present or the value is not a string.
    """
    # Missing cells are not strings; a substring test on them would raise TypeError.
    if not isinstance(value, str):
        return None

    # Compare in lower case so capitalisation differences do not hide a match.
    text = value.lower()
    if 'endometrioid endometrial cancer' in text:
        return 1
    if 'serous ovarian cancer' in text:
        return 0
    return None

# Placeholder 'age' converter: age is not listed in the sample characteristics
def convert_age(value):
    """Return None for every input; there is no age field to convert."""
    return None

# Placeholder 'gender' converter: gender is not listed in the sample characteristics
def convert_gender(value):
    """Return None for every input; there is no gender field to convert."""
    return None

# Save cohort information (gene-data availability and whether a trait row was found)
save_cohort_info('GSE108838', './preprocessed/Endometrioid_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only if trait_row is not None)
if trait_row is not None:
    import os  # local import: only needed to prepare the output directory below

    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Endometrioid_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Endometrioid_Cancer/trait_data/GSE108838.csv'
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2913987': [0], 'GSM2913988': [0], 'GSM2913989': [1], 'GSM2913990': [1], 'GSM2913991': [1], 'GSM2913992': [1], 'GSM2913993': [0], 'GSM2913994': [0], 'GSM2913995': [0], 'GSM2913996': [1], 'GSM2913997': [0], 'GSM2913998': [1], 'GSM2913999': [1], 'GSM2914000': [0], 'GSM2914001': [1], 'GSM2914002': [0], 'GSM2914003': [0], 'GSM2914004': [1], 'GSM2914005': [1], 'GSM2914006': [0], 'GSM2914007': [1], 'GSM2914008': [0], 'GSM2914009': [0], 'GSM2914010': [1], 'GSM2914011': [1], 'GSM2914012': [1], 'GSM2914013': [0], 'GSM2914014': [0], 'GSM2914015': [0], 'GSM2914016': [0], 'GSM2914017': [1], 'GSM2914018': [1], 'GSM2914019': [1], 'GSM2914020': [0], 'GSM2914021': [1], 'GSM2914022': [0], 'GSM2914023': [0], 'GSM2914024': [1], 'GSM2914025': [0], 'GSM2914026': [1], 'GSM2914027': [1], 'GSM2914028': [0], 'GSM2914029': [0], 'GSM2914030': [1], 'GSM2914031': [0], 'GSM2914032': [1], 'GSM2914033': [1], 'GSM2914034': [0], 'GSM2914035': [0], 'GSM2914036': [0], 'GSM2914037': [1], 'GSM2914038': [1], 'GSM2914039