In [1]:

import sys

# Extend the module search path so the project-local `utils` package used in
# the following cells can be imported.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook elsewhere.
sys.path.extend(['/home/techt/Desktop/a4s'])


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *

# 1. Resolve the SOFT file and the matrix file belonging to this cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Thyroid_Cancer/GSE104006'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull background information and sample characteristics out of the matrix
#    file; the prefix lists name the line prefixes passed to the helper for each part.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show everything that was loaded so the next step can be decided by inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"miRNA and gene expression profiling in human thyroid carcinomas and non-neoplastic thyroids"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"Refer to individual Series"
Sample Characteristics Dictionary:
{0: ['disease: Thyroid_carcinoma', 'disease: Non-neoplastic_thyroid'], 1: ['histology: PDTC', 'histology: PDTC+ATC', 'histology: PTC', 'histology: Non-neoplastic_thyroid', 'histology: PDTC+PTC', 'histology: PTC_lymph_node_metastasis', 'histology: PTC+PDTC'], 2: ['age: 74', 'age: 67', 'age: 72', 'age: 38', 'age: 50', 'age: 41', 'age: 51', 'age: 73', 'age: 52', 'age: 48', 'age: 59', 'age: 58', 'age: 39', 'age: 37', 'age: 33', 'age: 36', 'age: 70', 'age: 26', 'age: 46', 'age: 57', 'age: 44', 'age: 35', 'age: 42', 'age: 61', 'age: 49'], 3: ['Sex: M', 'Sex: F']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until the checks below say otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Checking if gene expression data is likely available.
# Search the whole background text loaded in Step 1: the phrase appears in the
# series title, not in the "!Series_overall_design" line, so testing only that
# line can never succeed.
# str() is a no-op for a string; it only guards against another printable type.
background_summary = str(background_info)
if "gene expression profiling" in background_summary.lower():
    is_gene_available = True

# Finding keys for 'Thyroid_Cancer', 'age', and 'gender'.
# Work on the dictionary computed from the matrix file in Step 1 instead of a
# hand-copied snapshot, so the decision always reflects the actual data.
sample_characteristics = sample_characteristics_dict

# Determine the keys for each variable.
# A row is only useful when it carries more than one distinct value; `.get`
# with an empty default leaves the row unset (None) if the key is missing
# rather than raising KeyError.
if len(set(sample_characteristics.get(0, []))) > 1:
    trait_row = 0

if len(set(sample_characteristics.get(2, []))) > 1:
    age_row = 2

if len(set(sample_characteristics.get(3, []))) > 1:
    gender_row = 3

# Define the data type conversion functions
def convert_trait(value):
    """Map a 'disease: ...' characteristic string to a binary trait label.

    Args:
        value: Raw cell value, e.g. 'disease: Thyroid_carcinoma'.

    Returns:
        1 for 'Thyroid_carcinoma', 0 for 'Non-neoplastic_thyroid', and None
        for any other label or for a non-string cell (e.g. None or NaN).
    """
    # Missing cells may not be strings; treat them as unknown instead of
    # failing on `.split`.
    if not isinstance(value, str):
        return None
    # Keep only the text after the last ': ' and ignore surrounding whitespace.
    raw_value = value.split(': ')[-1].strip()
    if raw_value == 'Thyroid_carcinoma':
        return 1
    elif raw_value == 'Non-neoplastic_thyroid':
        return 0
    return None

def convert_age(value):
    """Parse an 'age: N' characteristic string into an integer age.

    Args:
        value: Raw cell value, e.g. 'age: 74'.

    Returns:
        The integer after the last ': ', or None when the text is not an
        integer literal or the cell is not a string (e.g. None or NaN).
    """
    # Non-string cells have no `.split`; report them as unknown rather than
    # letting AttributeError escape past the ValueError handler.
    if not isinstance(value, str):
        return None
    try:
        return int(value.split(': ')[-1])
    except ValueError:
        return None

def convert_gender(value):
    """Map a 'Sex: ...' characteristic string to a binary gender code.

    Args:
        value: Raw cell value, e.g. 'Sex: M'.

    Returns:
        1 for male ('M' / 'male'), 0 for female ('F' / 'female'), matched
        case-insensitively; None for anything else or for a non-string cell.
    """
    # Missing cells may not be strings; treat them as unknown instead of
    # failing on `.split`.
    if not isinstance(value, str):
        return None
    # Normalise case and whitespace so 'M', 'm', ' Male ' are all recognised.
    raw_value = value.split(': ')[-1].strip().upper()
    if raw_value in ('M', 'MALE'):
        return 1
    elif raw_value in ('F', 'FEMALE'):
        return 0
    return None

import os

# Save cohort information: cohort ID, target JSON path, the gene-availability
# flag, and whether a trait row was identified.
save_cohort_info('GSE104006', './preprocessed/Thyroid_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only meaningful when a trait row was found)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Thyroid_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Thyroid_Cancer/trait_data/GSE104006.csv'
    # Ensure the target directory exists before writing, so the cell does not
    # fail on a fresh checkout. Presumably `selected_clinical_data` is a pandas
    # DataFrame, whose to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2787612': [1, 74, 1], 'GSM2787613': [1, 74, 1], 'GSM2787614': [1, 67, 0], 'GSM2787615': [1, 72, 0], 'GSM2787616': [1, 74, 0], 'GSM2787617': [1, 38, 0], 'GSM2787618': [1, 50, 0], 'GSM2787619': [1, 41, 1], 'GSM2787620': [0, 51, 0], 'GSM2787621': [1, 73, 1], 'GSM2787622': [1, 52, 0], 'GSM2787623': [1, 48, 0], 'GSM2787624': [0, 59, 1], 'GSM2787625': [1, 58, 1], 'GSM2787626': [1, 39, 0], 'GSM2787627': [1, 37, 0], 'GSM2787628': [1, 33, 0], 'GSM2787629': [1, 36, 1], 'GSM2787630': [1, 70, 0], 'GSM2787631': [1, 26, 0], 'GSM2787632': [1, 46, 1], 'GSM2787633': [0, 57, 0], 'GSM2787634': [1, 44, 0], 'GSM2787635': [1, 44, 0], 'GSM2787636': [1, 35, 1], 'GSM2787637': [0, 42, 1], 'GSM2787638': [1, 61, 0], 'GSM2787639': [1, 38, 0], 'GSM2787640': [1, 35, 1], 'GSM2787641': [1, 35, 0], 'GSM2787642': [1, 38, 0], 'GSM2787643': [0, 49, 1], 'GSM2787644': [1, 52, 0], 'GSM2787645': [1, 51, 0]}
