In [1]:

import os
import sys

sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1.1: resolve the SOFT file and the series matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Thyroid_Cancer/GSE104005'
relevant_paths = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file = relevant_paths

# Step 1.2: pull the series-level background lines and the per-sample
# characteristics out of the matrix file, selected by line prefix.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_and_clinical = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
background_info, clinical_data = background_and_clinical

# Step 1.3: collect the unique values found in each row of the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 1.4: show everything so the rows holding trait/age/gender can be picked by eye.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"miRNA and gene expression profiling in human thyroid carcinomas and non-neoplastic thyroids [RNA]"
!Series_summary	"We performed miRNA and gene expression profiling in a series of 30 thyroid carcinomas and 6 non-neoplastic thyroids."
!Series_overall_design	"MiRNA and gene expression profiles were established by microarray analysis in a series of 36 snap-frozen tissues using SurePrint G3 Human miRNA 8x60K microarrays (Agilent Technologies) and HumanHT-12 WG-DASL V4.0 R2 expression beadchip (Illumina), respectively. Tissue samples were obtained from Fondazione IRCCS Istituto Nazionale dei Tumori (Milan) and include various thyroid carcinoma histotypes: 20 papillary carcinomas (PTCs) consisting of different histological variants, 7 poorly differentiated thyroid carcinomas (PDTCs) and 3 lymph node metastases derived from PTC."
Sample Characteristics Dictionary:
{0: ['disease: Thyroid_carcinoma', 'disease: Non-neoplastic_thyroid'], 1: ['histology: PDTC

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The series design describes gene expression profiling on an expression
# beadchip, so gene expression data is treated as available.
is_gene_available = True

# Row indices stay None unless a matching characteristics row is found below.
trait_row = None
age_row = None
gender_row = None

# Converter placeholders; the actual functions are defined further down when applicable.
convert_trait = None
convert_age = None
convert_gender = None

# Sample characteristics as printed by the previous step (row index -> unique values).
characteristics_by_row = {
    0: ['disease: Thyroid_carcinoma', 'disease: Non-neoplastic_thyroid'],
    1: ['histology: PDTC', 'histology: PDTC+ATC', 'histology: PTC', 'histology: Non-neoplastic_thyroid', 'histology: PDTC+PTC', 'histology: PTC_lymph_node_metastasis', 'histology: PTC+PDTC'],
    2: ['age: 74', 'age: 67', 'age: 72', 'age: 38', 'age: 50', 'age: 41', 'age: 51', 'age: 73', 'age: 52', 'age: 48', 'age: 59', 'age: 58', 'age: 39', 'age: 37', 'age: 33', 'age: 36', 'age: 70', 'age: 26', 'age: 46', 'age: 57', 'age: 44', 'age: 35', 'age: 42', 'age: 61', 'age: 49'],
    3: ['Sex: M', 'Sex: F'],
}

# A row is assigned to a variable when every one of its values carries that
# variable's prefix. Prefixes are tried in the order trait, age, gender.
for row_index, row_values in characteristics_by_row.items():
    shared_prefix = next(
        (
            prefix
            for prefix in ('disease: ', 'age: ', 'Sex: ')
            if all(entry.startswith(prefix) for entry in row_values)
        ),
        None,
    )
    if shared_prefix == 'disease: ':
        trait_row = row_index
    elif shared_prefix == 'age: ':
        age_row = row_index
    elif shared_prefix == 'Sex: ':
        gender_row = row_index

# Define conversion functions
def convert_trait(value):
    """Convert a 'disease: <label>' characteristic into a binary trait value.

    Args:
        value: Raw cell from the trait row, e.g. 'disease: Thyroid_carcinoma'.

    Returns:
        1 for 'Thyroid_carcinoma', 0 for 'Non-neoplastic_thyroid', and None
        for anything else (non-string cells, text without exactly one
        ': ' separator, or an unrecognized label).
    """
    # Missing cells may arrive as None/NaN rather than text.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    if len(parts) != 2:
        return None
    # Only the two labels seen in this cohort are mapped; an unknown label must
    # not be silently counted as a control sample.
    label_to_trait = {'Thyroid_carcinoma': 1, 'Non-neoplastic_thyroid': 0}
    return label_to_trait.get(parts[1].strip())

def convert_age(value):
    """Convert an 'age: <number>' characteristic into a float.

    Args:
        value: Raw cell from the age row, e.g. 'age: 74'.

    Returns:
        The age as a float, or None when the cell is not a string, does not
        contain exactly one ': ' separator, or the part after the separator
        cannot be parsed as a number.
    """
    # Missing cells may arrive as None/NaN rather than text; calling .split on
    # them would raise AttributeError.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    if len(parts) != 2:
        return None
    try:
        return float(parts[1])
    except ValueError:
        return None

def convert_gender(value):
    """Convert a 'Sex: <label>' characteristic into a binary gender value.

    Args:
        value: Raw cell from the gender row, e.g. 'Sex: M'.

    Returns:
        1 for male ('M' / 'male'), 0 for female ('F' / 'female'), compared
        case-insensitively; None for non-string cells, text without exactly
        one ': ' separator, or any other label.
    """
    # Missing cells may arrive as None/NaN rather than text.
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    if len(parts) != 2:
        return None
    # Explicit mapping: an unrecognized label must not be silently counted as female.
    label_to_gender = {'M': 1, 'MALE': 1, 'F': 0, 'FEMALE': 0}
    return label_to_gender.get(parts[1].strip().upper())

# Save cohort information. The trait is reported as available only when a
# trait row was identified above.
cohort_info_path = './preprocessed/Thyroid_Cancer/cohort_info.json'
# Make sure the output directory exists before anything is written into it.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE104005', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Thyroid_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Thyroid_Cancer/trait_data/GSE104005.csv'
    # DataFrame.to_csv does not create missing parent directories, so a fresh
    # run would fail here without this.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2787612': [1.0, 74.0, 1.0], 'GSM2787613': [1.0, 74.0, 1.0], 'GSM2787614': [1.0, 67.0, 0.0], 'GSM2787615': [1.0, 72.0, 0.0], 'GSM2787616': [1.0, 74.0, 0.0], 'GSM2787617': [1.0, 38.0, 0.0], 'GSM2787618': [1.0, 50.0, 0.0], 'GSM2787619': [1.0, 41.0, 1.0], 'GSM2787620': [0.0, 51.0, 0.0], 'GSM2787621': [1.0, 73.0, 1.0], 'GSM2787622': [1.0, 52.0, 0.0], 'GSM2787623': [1.0, 48.0, 0.0], 'GSM2787624': [0.0, 59.0, 1.0], 'GSM2787625': [1.0, 58.0, 1.0], 'GSM2787626': [1.0, 39.0, 0.0], 'GSM2787627': [1.0, 37.0, 0.0], 'GSM2787628': [1.0, 33.0, 0.0], 'GSM2787629': [1.0, 36.0, 1.0], 'GSM2787630': [1.0, 70.0, 0.0], 'GSM2787631': [1.0, 26.0, 0.0], 'GSM2787632': [1.0, 46.0, 1.0], 'GSM2787633': [0.0, 57.0, 0.0], 'GSM2787634': [1.0, 44.0, 0.0], 'GSM2787635': [1.0, 44.0, 0.0], 'GSM2787636': [1.0, 35.0, 1.0], 'GSM2787637': [0.0, 42.0, 1.0], 'GSM2787638': [1.0, 61.0, 0.0], 'GSM2787639': [1.0, 38.0, 0.0], 'GSM2787640': [1.0, 35.0, 1.0], 'GSM2787641': [1.0, 35.0, 0.0], 'GSM2787642': [1.0, 38.0, 0.0], 'GSM278

### Step 3: Gene Data Extraction

In [4]:
# Load the expression table from the series matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers; the next step inspects them to decide
# whether they need to be mapped to gene symbols.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['ILMN_1343291', 'ILMN_1651209', 'ILMN_1651228', 'ILMN_1651229',
       'ILMN_1651235', 'ILMN_1651236', 'ILMN_1651237', 'ILMN_1651238',
       'ILMN_1651254', 'ILMN_1651260', 'ILMN_1651262', 'ILMN_1651268',
       'ILMN_1651278', 'ILMN_1651282', 'ILMN_1651285', 'ILMN_1651286',
       'ILMN_1651292', 'ILMN_1651303', 'ILMN_1651309', 'ILMN_1651315'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in the previous step are probe identifiers of the form
# 'ILMN_<number>' rather than gene symbols, so a probe-to-gene mapping is needed.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the gene annotation table from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Preview the annotation so the identifier and gene-symbol columns can be
# chosen for the mapping step.
annotation_preview = preview_df(gene_annotation)
print("Gene annotation preview:")
print(annotation_preview)


Gene annotation preview:
{'ID': ['ILMN_3166687', 'ILMN_3165566', 'ILMN_3164811', 'ILMN_3165363', 'ILMN_3166511'], 'Transcript': ['ILMN_333737', 'ILMN_333646', 'ILMN_333584', 'ILMN_333628', 'ILMN_333719'], 'Species': ['ILMN Controls', 'ILMN Controls', 'ILMN Controls', 'ILMN Controls', 'ILMN Controls'], 'Source': ['ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls'], 'Search_Key': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'ILMN_Gene': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'Source_Reference_ID': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'RefSeq_ID': [nan, nan, nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan, nan, nan], 'GI': [nan, nan, nan, nan, nan], 'Accession': ['DQ516750', 'DQ883654', 'DQ668364', 'DQ516785', 'DQ854995'], 'Symbol': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'Protein_Product': [nan, nan, nan, nan, nan], 'Array_Addres

### Step 6: Gene Identifier Mapping

In [7]:
# Annotation columns chosen from the preview above: 'ID' holds the probe
# identifiers and 'Symbol' holds the gene symbols.
identifier_key, gene_symbol_key = 'ID', 'Symbol'

# Build the dataframe that stores the mapping between probe IDs and genes.
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# Apply the mapping to obtain the gene expression dataframe.
# NOTE(review): this rebinds `gene_data`, so re-running this cell without
# re-running the extraction cell would apply the mapping to already-mapped data.
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index'
#    function from the library, then save it.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Thyroid_Cancer/gene_data/GSE104005.csv'
# DataFrame.to_csv does not create missing parent directories, so a fresh run
# would fail here without this.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data'
#    function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data
#    are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Thyroid_Cancer')

# 4. Save the cohort information. This is recorded regardless of the bias
#    verdict; only the merged data below is conditional on it.
save_cohort_info('GSE104005', './preprocessed/Thyroid_Cancer/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Thyroid_Cancer/GSE104005.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Thyroid_Cancer', the least common label is '0.0' with 5 occurrences. This represents 14.71% of the dataset.
The distribution of the feature 'Thyroid_Cancer' in this dataset is fine.

Quartiles for 'Age':
  25%: 38.0
  50% (Median): 48.5
  75%: 58.75
Min: 26.0
Max: 74.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 12 occurrences. This represents 35.29% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

