In [1]:

# Extend the module search path with a project directory before any project imports run.
# NOTE(review): this is a hardcoded, machine-specific absolute path; presumably it is what
# lets the `utils.preprocess` import in the next cell resolve — confirm, and consider
# making the location configurable.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series matrix file for this cohort
cohort_dir = '/media/techt/DATA/GEO/Thyroid_Cancer/GSE191117'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background (series-level) lines and the per-sample clinical lines
#    out of the matrix file, selected by their line prefixes
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarize the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both pieces of information, each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"miRNA Expression Pattern in Papillary Thyroid Cancer"
!Series_summary	"The aim of this study was to conduct a comparison of tissue miRNA in thyroid cancer (papillary (PTC), follicular (FTC), and microcancer (PTMC)) and  healthy thyroid tissue (Control). The expression level of 798 miRNAs using NanoString technology was examined. ROC curve analysis, and logistic regression modeling were performed. Gene ontology (GO), canonical pathways analysis were used to explore the biological functions of the miRNA target genes."
!Series_summary	"The study revealed that 10 miRNAs were deregulated in samples of patients with PTC. Pathway analysis showed that miRNA target genes were mainly significantly enriched in endocrine resistance, EGFR tyrosine kinase inhibitor resistance, and pathways in cancer. ROC analysis demonstrated that miR-146-5p, miR-551b-3p, and miR-222-3p can be introduced as a diagnostic tool for PTC (AUC=0.770; 0.740; 0.720; respectively). Vali

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The series summary describes NanoString miRNA profiling, so this cohort is
# recorded as having no gene expression data.
is_gene_available = False

# Row indices and converters default to None, meaning "not available".
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Define sample_characteristics based on the input from STEP 1
sample_characteristics = {
    0: ['disease state: PTC', 'disease state: Control', 'disease state: FTC', 'disease state: PTMC'],
    1: ['tnm staging system: pT1m', 'tnm staging system: pT2b', 'tissue: control (normal thyroid)', 'tnm staging system: pT1b', 'tnm staging system: pT1a', 'tnm staging system: no data', 'tnm staging system: pT4', 'tnm staging system: pT1', 'tnm staging system: pT2', 'tnm staging system: pT2a', 'tnm staging system: pT3'],
    2: ['tumor size: 7', 'tumor size: 14', None, 'tumor size: 5', 'tumor size: 9', 'tumor size: no data', 'tumor size: 8', 'tumor size: 10', 'tumor size: 45', 'tumor size: 3', 'tumor size: 11', 'tumor size: 20', 'tumor size: 15', 'tumor size: 6', 'tumor size: 19', 'tumor size: 1', 'tumor size: 30', 'tumor size: 13', 'tumor size: 18', 'tumor size: 25'],
    3: ['multifocalization: 1-presence', None, 'multifocalization: 0-absence', 'multifocalization: no data', 'lymph node metastasis: 1-presence'],
    4: ['angioinvasion: 1-presence', None, 'angioinvasion: 0-absence', 'angioinvasion: no data', 'tissue: PTC'],
    5: ['lymph node metastasis: 0-absence', None, 'lymph node metastasis: 1-presence'],
    6: ['tissue: PTC', None, 'tissue: FTC', 'tissue: PTMC']
}

# Determine the availability of the trait.
# The trait row is the first row that carries a 'disease state: <label>' entry
# whose label is one of the known trait labels. Matching is done per entry:
# joining the characters of str(list) with commas (as an earlier version did)
# yields "P,T,C" and can never contain "PTC", which left trait_row at None.
# Requiring the 'disease state' field also keeps the sparse 'tissue: PTC'
# entries in other rows from being mistaken for the trait row.
trait_keys = ['PTC', 'FTC', 'PTMC', 'Control']
trait_field = 'disease state'
trait_row = next(
    (
        row_index
        for row_index, row_values in sample_characteristics.items()
        if any(
            isinstance(entry, str)  # rows contain None for missing values
            and entry.startswith(f'{trait_field}:')
            and entry.split(':', 1)[1].strip() in trait_keys
            for entry in row_values
        )
    ),
    None,  # no matching row: the trait is not available
)

def convert_trait(value):
    """Convert a 'disease state: <label>' characteristic into a binary trait.

    Parameters
    ----------
    value : object
        Raw characteristic cell, expected to look like 'disease state: PTC'.
        Non-string values (e.g. None or NaN for missing cells) are tolerated.

    Returns
    -------
    int or None
        1 for the cancer labels 'PTC', 'FTC' and 'PTMC', 0 for 'Control',
        and None when the value is not a string, has no ':' separator, or
        carries an unrecognized label.
    """
    # Validate explicitly instead of wrapping everything in a bare `except:`,
    # which would also swallow KeyboardInterrupt and hide real errors.
    if not isinstance(value, str):
        return None

    fields = value.split(':')
    if len(fields) < 2:
        return None

    # The label is the second colon-separated field, ignoring surrounding spaces.
    trait = fields[1].strip()
    if trait in ('PTC', 'FTC', 'PTMC'):
        return 1
    if trait == 'Control':
        return 0
    return None

# Record the cohort's usability flags (gene data availability, trait availability)
is_trait_available = trait_row is not None
save_cohort_info(
    'GSE191117',
    './preprocessed/Thyroid_Cancer/cohort_info.json',
    is_gene_available,
    is_trait_available,
)

# Clinical features are only extracted and written out when a trait row was found
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Thyroid_Cancer',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Thyroid_Cancer/trait_data/GSE191117.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
