In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Bile_Duct_Cancer/GSE212211'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Development of blood biomarkers that distinguish pancreatic cancer from biliary tract cancer"
!Series_summary	"We have developed a method for diagnosing pancreatic cancer and bile duct cancer based on miRNA expression information in the circulating blood."
!Series_summary	"2565 miRNAs in 426 serum samples were analyzed."
!Series_overall_design	"The design of this series was constructed in 257 hepatocellular carcinoma (HCC) patients and 41 cholangiocarcinoma carcinoma (ICC) patients and two gall bladder cancer patients."
Sample Characteristics Dictionary:
{0: ['disease/diagnosis: hepatocellular carcinoma'], 1: ['sample collection time: after operation beyand 14 days', 'sample collection time: before operation', 'sample collection time: after operation within 14 days', 'sample collection time: before operation at 2nd recurrence'], 2: ['molecule subtype: miRNA']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Step 1: Determine if gene expression data is available
gene_expression_keywords = ["gene expression", "transcriptome"]
summary_content = """
!Series_summary "We have developed a method for diagnosing pancreatic cancer and bile duct cancer based on miRNA expression information in the circulating blood."
!Series_summary "2565 miRNAs in 426 serum samples were analyzed."
"""
if any(keyword in summary_content for keyword in gene_expression_keywords):
    is_gene_available = True

# Step 2: Find the rows for each variable from the Sample Characteristics Dictionary and create conversion functions
sample_characteristics_dict = {
    0: ['disease/diagnosis: Cholangiocellular carcinoma', 'disease/diagnosis: hepatocellular carcinoma', 'disease/diagnosis: Gall bladder cancer'],
    1: ['sample collection time: before operation', 'sample collection time: before operation at 2nd recurrence'],
    2: ['molecule subtype: miRNA']
}

# Finding rows for 'Bile_Duct_Cancer', 'age', and 'gender'
# Based on disease types provided, we might infer 'Bile_Duct_Cancer' from it
disease_information = sample_characteristics_dict.get(0, [])
unique_diseases = set(disease_information)

# Check if it's not a constant feature
if len(unique_diseases) > 1:
    trait_row = 0

# We could not infer 'age' and 'gender' from the available information
age_row = None
gender_row = None

# Step 2.3: Data Type Conversion functions
# Conversion for trait 'Bile_Duct_Cancer'
def convert_trait(value):
    _, val = value.split(': ')
    if val == 'Cholangiocellular carcinoma' or val == 'Gall bladder cancer':
        return 1
    elif val == 'hepatocellular carcinoma':
        return 0
    else:
        return None

# Age and Gender conversion functions would not be needed as there are no corresponding rows
def convert_age(value):
    return None

def convert_gender(value):
    return None

# Step 3: Save Metadata
save_cohort_info('GSE212211', './preprocessed/Bile_Duct_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Step 4: Clinical Feature Extraction (only if trait_row is defined)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Bile_Duct_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Bile_Duct_Cancer/trait_data/GSE212211.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6513597': [0], 'GSM6513598': [0], 'GSM6513599': [0], 'GSM6513600': [0], 'GSM6513601': [0], 'GSM6513602': [0], 'GSM6513603': [0], 'GSM6513604': [0], 'GSM6513605': [0], 'GSM6513606': [0], 'GSM6513607': [0], 'GSM6513608': [0], 'GSM6513609': [0], 'GSM6513610': [0], 'GSM6513611': [0], 'GSM6513612': [0], 'GSM6513613': [0], 'GSM6513614': [0], 'GSM6513615': [0], 'GSM6513616': [0], 'GSM6513617': [0], 'GSM6513618': [0], 'GSM6513619': [0], 'GSM6513620': [0], 'GSM6513621': [0], 'GSM6513622': [0], 'GSM6513623': [0], 'GSM6513624': [0], 'GSM6513625': [0], 'GSM6513626': [0], 'GSM6513627': [0], 'GSM6513628': [0], 'GSM6513629': [0], 'GSM6513630': [0], 'GSM6513631': [0], 'GSM6513632': [0], 'GSM6513633': [0], 'GSM6513634': [0], 'GSM6513635': [0], 'GSM6513636': [0], 'GSM6513637': [0], 'GSM6513638': [0], 'GSM6513639': [0], 'GSM6513640': [0], 'GSM6513641': [0], 'GSM6513642': [0], 'GSM6513643': [0], 'GSM6513644': [0], 'GSM6513645': [0], 'GSM6513646': [0], 'GSM6513647': [0], 'GSM6513648': [0], 'GSM6513649