In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the two input files for this cohort; the helper's return value is
# unpacked as (soft_file, matrix_file).
cohort_dir = '/media/techt/DATA/GEO/Endometrioid_Cancer/GSE25405'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull study-level background lines and per-sample clinical lines out of the
# matrix file, selected by the line prefixes listed here.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe row by row (presumably one entry per
# characteristics row holding its unique values -- confirm against the helper).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces so the next step can decide which variables are available.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"EMca: serous, endometrioid, normal (miRNA)"
!Series_summary	"To determine the expression profiles of microRNAs (miRNAs) and to examine specific miRNA expression in endometrial serous adenocarcinoma in comparison with normal endometrial tissue and endometrial endometrioid adenocarcinoma.　Twenty-one serous adenocarcinoma tissues, 20 endometrioid adenocarcinoma tissues, and 7 normal endometrial tissues were enrolled.　miRNA expression profiles were examined using miRNA microarray."
!Series_overall_design	"After obtaining informed consent, 21 serous adenocarcinoma tissues, 20 endometrioid adenocarcinoma tissues, and 7 normal endometrial tissue　were retrieved from the surgical pathology files at Tohoku University Hospital (Sendai, Japan). The research protocol was approved by the Ethics Committee at Tohoku University Graduate School of Medicine (Sendai, Japan). All specimens were obtained from surgery that was performed between January 2001 and December

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability flags and row indices for this cohort; None marks a variable as unavailable.
# Per the series title/summary this dataset deals with miRNA data, so it is
# flagged as not providing usable gene expression data.
is_gene_available = False
age_row = gender_row = None  # 'age' and 'gender' do not appear in the sample characteristics
convert_trait = convert_age = convert_gender = None  # placeholders, rebound to functions when applicable

# Unique values per sample-characteristics row; only row 0 (tissue type) is present.
sample_characteristics = {
    0: [
        'tissue: endometrial serous adenocarcinoma',
        'tissue: normal endometrial tissue',
        'tissue: endometrial endometrioid adenocarcinomas',
    ],
}

# The 'Endometrioid_Cancer' trait is usable only when row 0 holds more than one distinct value.
trait_values = sample_characteristics.get(0, [])
trait_row = 0 if len(set(trait_values)) > 1 else None

# Step 2.3: Define conversion functions

def convert_trait(value):
    """Map a raw 'tissue: ...' characteristic to a binary Endometrioid_Cancer label.

    Args:
        value: Raw cell text such as 'tissue: normal endometrial tissue'. A
            missing 'key:' prefix is tolerated; the whole string is then used.

    Returns:
        1 when the tissue description mentions 'endometrioid', 0 when it
        mentions 'normal' or 'serous' (both treated as non-endometrioid
        samples), and None for unrecognised or non-string input (None, NaN).
    """
    # Missing cells can arrive as None or float NaN; they carry no label.
    if not isinstance(value, str):
        return None
    # Keep everything after the first ':'; fall back to the full text when
    # there is no prefix instead of raising IndexError.
    _, sep, tail = value.partition(':')
    label = (tail if sep else value).strip().lower()
    # 'endometrioid' is checked first so it wins over the control keywords.
    if 'endometrioid' in label:
        return 1
    if 'normal' in label or 'serous' in label:
        return 0
    return None

def convert_age(value):
    """Placeholder age converter: no age data is provided, so every input maps to None."""
    return None

def convert_gender(value):
    """Placeholder gender converter: no gender data is provided, so every input maps to None."""
    return None

# Keep a converter only when its characteristics row exists, so unavailable
# variables are handed downstream as a (None row, None converter) pair.
convert_trait = convert_trait if trait_row is not None else None
convert_age = convert_age if age_row is not None else None
convert_gender = convert_gender if gender_row is not None else None

# Step 3: Save Metadata
# Record gene-data availability and trait availability for this cohort.
save_cohort_info('GSE25405', './preprocessed/Endometrioid_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Step 4: Extract Clinical Feature Data if available
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Endometrioid_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Endometrioid_Cancer/trait_data/GSE25405.csv'
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing (needed on a fresh checkout).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Endometrioid_Cancer/cohort_info.json
{'GSM623738': [0], 'GSM623739': [0], 'GSM623740': [0], 'GSM623817': [0], 'GSM623818': [0], 'GSM623819': [0], 'GSM623820': [0], 'GSM623821': [0], 'GSM623822': [0], 'GSM623823': [0], 'GSM623824': [0], 'GSM623825': [0], 'GSM623826': [0], 'GSM623827': [0], 'GSM623828': [0], 'GSM623829': [0], 'GSM623830': [0], 'GSM623831': [0], 'GSM623832': [0], 'GSM623851': [0], 'GSM623852': [0], 'GSM623853': [0], 'GSM623854': [0], 'GSM623855': [0], 'GSM623856': [1], 'GSM623857': [1], 'GSM623858': [1], 'GSM623859': [1], 'GSM623860': [1], 'GSM623861': [1], 'GSM623862': [1], 'GSM623863': [1], 'GSM623864': [1], 'GSM623865': [1], 'GSM623866': [1], 'GSM623867': [1], 'GSM623868': [1], 'GSM623869': [1], 'GSM623870': [1], 'GSM623871': [1], 'GSM623872': [1], 'GSM623873': [1], 'GSM623874': [1], 'GSM623875': [0], 'GSM623876': [0], 'GSM623877': [0], 'GSM623878': [0], 'GSM623881': [1]}
