In [1]:

import sys
# Extend the module search path before the project import in the next cell;
# presumably this directory is the project root that provides `utils.preprocess` — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not find the project on another machine without editing this line.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Thyroid_Cancer/GSE103996'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes selecting the series-level background text and the per-sample
# characteristics from the matrix file.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Unique values observed in each row of the clinical table, keyed by row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces of information, each under its own heading line.
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"miRNA and gene expression profiling in human thyroid carcinomas and non-neoplastic thyroids [miRNA]"
!Series_summary	"We performed miRNA and gene expression profiling in a series of 30 thyroid carcinomas and 6 non-neoplastic thyroids."
!Series_overall_design	"MiRNA and gene expression profiles were established by microarray analysis in a series of 36 snap-frozen tissues using SurePrint G3 Human miRNA 8x60K microarrays (Agilent Technologies) and HumanHT-12 WG-DASL V4.0 R2 expression beadchip (Illumina), respectively. Tissue samples were obtained from Fondazione IRCCS Istituto Nazionale dei Tumori (Milan) and include various thyroid carcinoma histotypes: 20 papillary carcinomas (PTCs) consisting of different histological variants, 7 poorly differentiated thyroid carcinomas (PDTCs) and 3 lymph node metastases derived from PTC."
Sample Characteristics Dictionary:
{0: ['disease: Thyroid_carcinoma', 'disease: Non-neoplastic_thyroid'], 1: ['histology: PD

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability.
# The series title of this cohort ends in "[miRNA]" and the matrix rows are
# miRNA identifiers (e.g. 'hsa-let-7a-3p'), so this series carries the miRNA
# profiles of the study, not the gene expression profiles mentioned in the
# summary. It must therefore not be flagged as containing gene expression data.
is_gene_available = False

# Rows of the sample characteristics table that hold each variable.
# These are fixed by inspection of the sample characteristics dictionary; each
# of the rows has more than one distinct value, so all three are usable.
trait_row = 0   # 'disease: Thyroid_carcinoma' / 'disease: Non-neoplastic_thyroid'
age_row = 2     # 'age: <years>'
gender_row = 3  # 'Sex: M' / 'Sex: F'

# Define data type conversion functions
def convert_trait(value):
    """Convert a 'disease: <label>' cell into a binary trait value.

    Args:
        value: Raw sample-characteristics cell, e.g. 'disease: Thyroid_carcinoma'.

    Returns:
        1 for 'Thyroid_carcinoma', 0 for 'Non-neoplastic_thyroid', and None for
        anything else (non-string input, missing ': ' separator, or an
        unrecognised label), so unknown values are treated as missing rather
        than silently counted as controls.
    """
    try:
        label = value.split(": ")[1].strip()
    except (AttributeError, IndexError, TypeError):
        # Not a string, or no "key: value" structure to parse.
        return None
    return {"Thyroid_carcinoma": 1, "Non-neoplastic_thyroid": 0}.get(label)

def convert_age(value):
    """Convert an 'age: <number>' cell into a float.

    Args:
        value: Raw sample-characteristics cell, e.g. 'age: 74'.

    Returns:
        The age as a float, or None when the input is not a string, has no
        ': ' separator, is not numeric, or parses to a non-finite number
        (such as 'nan' or 'inf').
    """
    try:
        age = float(value.split(": ")[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Not a string, no "key: value" structure, or a non-numeric value.
        return None
    # float() accepts 'nan' and 'inf'; NaN fails every comparison and the
    # infinities fail the strict bounds, so all three are reported as missing.
    if not (float("-inf") < age < float("inf")):
        return None
    return age

def convert_gender(value):
    """Convert a 'Sex: <label>' cell into a binary gender value.

    Args:
        value: Raw sample-characteristics cell, e.g. 'Sex: M'.

    Returns:
        1 for male ('M' / 'male'), 0 for female ('F' / 'female'), matched
        case-insensitively, and None for anything else (non-string input,
        missing ': ' separator, or an unrecognised label), so unknown values
        are treated as missing rather than silently counted as female.
    """
    try:
        label = value.split(": ")[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Not a string, or no "key: value" structure to parse.
        return None
    if label in ("m", "male"):
        return 1
    if label in ("f", "female"):
        return 0
    return None

# Record the cohort's gene-data flag and whether a trait row was identified.
save_cohort_info(
    'GSE103996',
    './preprocessed/Thyroid_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Only when a trait row exists: select the trait/age/gender rows using their
# converter functions, write the result to CSV, and print a preview of it.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Thyroid_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Thyroid_Cancer/trait_data/GSE103996.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2787513': [1.0, 74.0, 1.0], 'GSM2787514': [1.0, 74.0, 1.0], 'GSM2787515': [1.0, 72.0, 0.0], 'GSM2787516': [1.0, 74.0, 0.0], 'GSM2787517': [1.0, 38.0, 0.0], 'GSM2787518': [1.0, 50.0, 0.0], 'GSM2787519': [1.0, 41.0, 1.0], 'GSM2787520': [0.0, 51.0, 0.0], 'GSM2787521': [1.0, 73.0, 1.0], 'GSM2787522': [1.0, 52.0, 0.0], 'GSM2787523': [1.0, 48.0, 0.0], 'GSM2787524': [0.0, 59.0, 1.0], 'GSM2787525': [1.0, 58.0, 1.0], 'GSM2787526': [1.0, 39.0, 0.0], 'GSM2787527': [1.0, 37.0, 0.0], 'GSM2787528': [1.0, 33.0, 0.0], 'GSM2787529': [1.0, 36.0, 1.0], 'GSM2787530': [1.0, 70.0, 0.0], 'GSM2787531': [1.0, 26.0, 0.0], 'GSM2787532': [1.0, 46.0, 1.0], 'GSM2787533': [0.0, 57.0, 0.0], 'GSM2787534': [1.0, 44.0, 0.0], 'GSM2787535': [1.0, 35.0, 1.0], 'GSM2787536': [0.0, 42.0, 1.0], 'GSM2787537': [1.0, 47.0, 0.0], 'GSM2787538': [1.0, 61.0, 0.0], 'GSM2787539': [1.0, 38.0, 0.0], 'GSM2787540': [1.0, 35.0, 1.0], 'GSM2787541': [1.0, 35.0, 0.0], 'GSM2787542': [1.0, 38.0, 0.0], 'GSM2787543': [0.0, 49.0, 1.0], 'GSM278

### Step 3: Gene Data Extraction

In [4]:
# Read the expression matrix from the series matrix file located earlier.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers so the identifier type can be reviewed
# in the following step.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['Blank', 'dmr_285', 'dmr_3', 'dmr_308', 'dmr_316', 'dmr_31a', 'dmr_6',
       'hsa-let-7a-3p', 'hsa-let-7a-5p', 'hsa-let-7b-3p', 'hsa-let-7b-5p',
       'hsa-let-7c', 'hsa-let-7c*_v16.0', 'hsa-let-7d-3p', 'hsa-let-7d-5p',
       'hsa-let-7e-3p', 'hsa-let-7e-5p', 'hsa-let-7f-1-3p', 'hsa-let-7f-2-3p',
       'hsa-let-7f-5p'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in the previous step (e.g. 'Blank', 'dmr_285',
# 'hsa-let-7a-3p') are not human gene symbols, so the data is flagged as
# needing an identifier-mapping step.
# NOTE(review): these look like miRNA identifiers — confirm that a gene-symbol
# mapping is meaningful for this platform.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Extract the annotation table from the SOFT file located earlier.
gene_annotation = get_gene_annotation(soft_file)

# Print a heading, then a preview of the annotation table, so the identifier
# and symbol columns can be chosen in the next step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['Blank', 'dmr_285', 'dmr_3', 'dmr_308', 'dmr_316'], 'miRNA_ID': [nan, nan, nan, nan, nan], 'miRNA_ID.1': ['Blank', 'dmr_285', 'dmr_3', 'dmr_308', 'dmr_316']}


### Step 6: Gene Identifier Mapping

In [7]:
# Annotation columns used for the mapping: the row identifier column and the
# column treated as the gene symbol.
identifier_key, gene_symbol_key = 'ID', 'miRNA_ID.1'

# Build the identifier-to-symbol mapping table from those two columns.
mapping_df = get_gene_mapping(
    gene_annotation,
    identifier_key,
    gene_symbol_key,
)

# Apply the mapping to the expression matrix.
# NOTE(review): `gene_data` is rebound to the mapped result, so re-running
# this cell feeds already-mapped data back into the mapping step.
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index of the mapped gene data and save
#    the result to CSV.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Thyroid_Cancer/gene_data/GSE103996.csv'
normalized_gene_data.to_csv(gene_csv_path)

# Whether any gene rows are left after normalization. This is checked on the
# actual data instead of being assumed, so that a cohort whose identifiers do
# not normalize to gene symbols is not recorded as having gene data.
gene_data_present = not normalized_gene_data.empty

# 2. Merge the clinical and genetic data.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and any demographic attributes in the data
#    are severely biased, and remove the biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Thyroid_Cancer')

# 4. Save the cohort information. The third and fourth arguments are the
#    gene-data and trait availability flags; the trait flag stays True because
#    `selected_clinical_data` only exists when a trait row was found.
save_cohort_info('GSE103996', './preprocessed/Thyroid_Cancer/cohort_info.json', gene_data_present, True, trait_biased, merged_data)

# 5. Save the merged data only when it actually contains gene data and the
#    trait is not severely biased.
if gene_data_present and not trait_biased:
    csv_path = './preprocessed/Thyroid_Cancer/GSE103996.csv'
    unbiased_merged_data.to_csv(csv_path)


No gene data in the dataframe
A new JSON file was created at: ./preprocessed/Thyroid_Cancer/cohort_info.json
