In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Hypothyroidism/GSE75678'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the loader: the first list selects series-level
# background lines, the second selects per-sample characteristics lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table per row; the printed result is a dict of
# row index -> list of the values seen in that row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both summaries so the next step can pick trait/age/gender rows from them.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene Expression of Mexican Patients with Breast Cancer"
!Series_summary	"Gene expression of tumor sample of mexican patients with breast cancer."
!Series_summary	"Samples obtained from the Hospital San Jose Tec de Monterrey."
!Series_overall_design	"The experiments were with one color per patient, gene expression profile is from a tumor sample of mexican patients with breast cancer."
Sample Characteristics Dictionary:
{0: ['tissue: Tumor Sample of Breast Cancer'], 1: ['gender: Female'], 2: ['rna ng/ul: 1083', 'rna ng/ul: 343', 'rna ng/ul: 111', 'rna ng/ul: 307', 'rna ng/ul: 401', 'rna ng/ul: 475', 'rna ng/ul: 728', 'rna ng/ul: 143.6', 'rna ng/ul: 224.7', 'rna ng/ul: 1458.3', 'rna ng/ul: 164', 'rna ng/ul: 370.2', 'rna ng/ul: 419.5', 'rna ng/ul: 693.6', 'rna ng/ul: 291.4', 'rna ng/ul: 1566.4', 'rna ng/ul: 69', 'rna ng/ul: 625.4', 'rna ng/ul: 151.6', 'rna ng/ul: 127.7', 'rna ng/ul: 1116.8', 'rna ng/ul: 333.9', 'rna ng/ul: 182.1', 'rna ng/ul: 437.4', 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data is available for this cohort.
is_gene_available = True

# Row indices and converters default to None; they are overwritten below (or in
# the function definitions that follow) only where the feature is usable.
trait_row = age_row = gender_row = None
convert_trait = convert_age = convert_gender = None

# Hand-copied subset of the sample characteristics printed in Step 1; only rows
# 0, 1, 2, 19 and 21 are reproduced here.
# NOTE(review): this literal can drift from `sample_characteristics_dict`
# computed in Step 1 — confirm the row indices still match the loaded data.
sample_characteristics = {
    0: ['tissue: Tumor Sample of Breast Cancer'],
    1: ['gender: Female'],
    2: ['rna ng/ul: 1083', 'rna ng/ul: 343', 'rna ng/ul: 111', 'rna ng/ul: 307', 'rna ng/ul: 401', 'rna ng/ul: 475', 'rna ng/ul: 728', 'rna ng/ul: 143.6', 'rna ng/ul: 224.7', 'rna ng/ul: 1458.3', 'rna ng/ul: 164', 'rna ng/ul: 370.2', 'rna ng/ul: 419.5', 'rna ng/ul: 693.6', 'rna ng/ul: 291.4', 'rna ng/ul: 1566.4', 'rna ng/ul: 69', 'rna ng/ul: 625.4', 'rna ng/ul: 151.6', 'rna ng/ul: 127.7', 'rna ng/ul: 1116.8', 'rna ng/ul: 333.9', 'rna ng/ul: 182.1', 'rna ng/ul: 437.4', 'rna ng/ul: 439', 'rna ng/ul: 178.2', 'rna ng/ul: 1365', 'rna ng/ul: 670', 'rna ng/ul: 840.6', 'rna ng/ul: 725'],
    21: ['personal pathological hystory: Neg', 'personal pathological hystory: Rheumatoid Arthritis', 'personal pathological hystory: Hypertension', 'personal pathological hystory: Apendicitis', 'personal pathological hystory: Hypertension and Diabetes', 'personal pathological hystory: Hypothyroidism', 'personal pathological hystory: Diabetes', 'personal pathological hystory: Ocular Surgery', 'personal pathological hystory: 3 C sections', 'personal pathological hystory: 0', 'personal pathological hystory: C section', 'personal pathological hystory: Hysterechtomy', 'personal pathological hystory: Dyslipidemia', 'personal pathological hystory: Hypertension and Rheumatoid Artritis', 'personal pathological hystory: Knee Surgery', 'personal pathological hystory: Venous insuficiency'],
    19: ['age at diagnosis: 45', 'age at diagnosis: 41', 'age at diagnosis: 59', 'age at diagnosis: 57', 'age at diagnosis: 42', 'age at diagnosis: 49', 'age at diagnosis: 54', 'age at diagnosis: 31', 'age at diagnosis: 70', 'age at diagnosis: 44', 'age at diagnosis: 50', 'age at diagnosis: 56', 'age at diagnosis: 51', 'age at diagnosis: 58', 'age at diagnosis: 55', 'age at diagnosis: 71', 'age at diagnosis: 40', 'age at diagnosis: 62', 'age at diagnosis: 87', 'age at diagnosis: 36', 'age at diagnosis: 43', 'age at diagnosis: 48', 'age at diagnosis: 66', 'age at diagnosis: 53', 'age at diagnosis: 35', 'age at diagnosis: 68', 'age at diagnosis: 46']
}


def _unique_row_values(characteristics, row_index):
    """Return the set of distinct values (text after the first ':') in one row.

    Args:
        characteristics: Dict mapping row index -> list of 'key: value' strings.
        row_index: Row to inspect; a missing row yields an empty set.

    Returns:
        Set of stripped value strings. `partition` keeps everything after the
        first colon (so values that themselves contain ':' are not truncated)
        and yields '' instead of raising for an entry without a colon.
    """
    return {
        entry.partition(':')[2].strip()
        for entry in characteristics.get(row_index, [])
    }


# A row is only usable as a feature when it takes more than one distinct value.
unique_traits = _unique_row_values(sample_characteristics, 21)
if len(unique_traits) > 1:
    trait_row = 21

unique_ages = _unique_row_values(sample_characteristics, 19)
if len(unique_ages) > 1:
    age_row = 19

# Row 1 holds a single value ('gender: Female'), so gender is constant in this
# cohort and gender_row deliberately stays None.

# Define the data type conversion functions
def convert_trait(value):
    """Convert a raw characteristics cell into a binary Hypothyroidism label.

    Args:
        value: Raw cell text, e.g. 'personal pathological hystory: Hypothyroidism'.

    Returns:
        1 if the text contains 'Hypothyroidism', 0 for any other text, and
        None when the cell is not a string (e.g. NaN/None for a missing entry).
    """
    if not isinstance(value, str):
        # The substring test below raises TypeError on NaN/None; report the
        # cell as missing instead of crashing the whole conversion.
        return None
    # NOTE(review): the literal value '0' that appears in this row is treated
    # like any other non-matching text (label 0) — confirm it does not mean "unknown".
    return 1 if 'Hypothyroidism' in value else 0

def convert_age(value):
    """Convert a raw 'age at diagnosis: N' cell into an integer age.

    Args:
        value: Raw cell text, e.g. 'age at diagnosis: 45'.

    Returns:
        The integer found after the first ':' separator, or None when the cell
        is not a string, has no ':' separator, or the text after it is not an
        integer.
    """
    if not isinstance(value, str):
        # NaN/None cells have no .split(); the AttributeError was not covered
        # by the except clause below, so guard explicitly.
        return None
    try:
        return int(value.split(':')[1].strip())
    except (ValueError, IndexError):
        return None
        
# Record the initial cohort status: whether gene data is available and whether
# a usable trait row was found.
save_cohort_info('GSE75678', './preprocessed/Hypothyroidism/cohort_info.json', is_gene_available, trait_row is not None)

if trait_row is not None:
    # Extract the trait/age/gender features with the converters defined above;
    # gender_row and convert_gender are None here, so gender is not extracted.
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Hypothyroidism', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Hypothyroidism/trait_data/GSE75678.csv'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the output folder exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1963528': [0, 45], 'GSM1963529': [0, 41], 'GSM1963530': [0, 59], 'GSM1963531': [0, 57], 'GSM1963532': [0, 42], 'GSM1963533': [0, 49], 'GSM1963534': [0, 59], 'GSM1963535': [0, 54], 'GSM1963536': [0, 54], 'GSM1963537': [0, 31], 'GSM1963538': [0, 70], 'GSM1963539': [0, 44], 'GSM1963540': [0, 50], 'GSM1963541': [0, 42], 'GSM1963542': [0, 56], 'GSM1963543': [1, 51], 'GSM1963544': [0, 58], 'GSM1963545': [0, 55], 'GSM1963546': [0, 71], 'GSM1963547': [0, 42], 'GSM1963548': [0, 41], 'GSM1963549': [0, 40], 'GSM1963550': [0, 57], 'GSM1963551': [0, 62], 'GSM1963552': [0, 87], 'GSM1963553': [0, 36], 'GSM1963554': [0, 50], 'GSM1963555': [0, 45], 'GSM1963556': [0, 43], 'GSM1963557': [0, 42], 'GSM1963558': [0, 43], 'GSM1963559': [0, 44], 'GSM1963560': [0, 43], 'GSM1963561': [0, 48], 'GSM1963562': [0, 45], 'GSM1963563': [0, 51], 'GSM1963564': [0, 56], 'GSM1963565': [0, 57], 'GSM1963566': [0, 41], 'GSM1963567': [0, 48], 'GSM1963568': [0, 66], 'GSM1963569': [0, 53], 'GSM1963570': [0, 36], 'GSM196357

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the expression table from the series-matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs; Step 4 inspects them to decide whether the
#    rows are already gene symbols or still need mapping.
print(gene_data.index[:20])


Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 are bare numbers ('1', '2', ...) rather than
# gene symbols — presumably the reason mapping is flagged as required.
# NOTE(review): the visible Step 6 cell runs the mapping unconditionally and
# never reads this flag.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Read the gene annotation table from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a preview of the annotation so the identifier and gene-symbol
#    columns can be chosen in Step 6.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1', '2', '3', '4', '5'], 'COL': ['266', '266', '266', '266', '266'], 'ROW': [170.0, 168.0, 166.0, 164.0, 162.0], 'NAME': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner'], 'SPOT_ID': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner'], 'CONTROL_TYPE': ['pos', 'pos', 'pos', 'pos', 'pos'], 'REFSEQ': [nan, nan, nan, nan, nan], 'GB_ACC': [nan, nan, nan, nan, nan], 'GENE': [nan, nan, nan, nan, nan], 'GENE_SYMBOL': [nan, nan, nan, nan, nan], 'GENE_NAME': [nan, nan, nan, nan, nan], 'UNIGENE_ID': [nan, nan, nan, nan, nan], 'ENSEMBL_ID': [nan, nan, nan, nan, nan], 'TIGR_ID': [nan, nan, nan, nan, nan], 'ACCESSION_STRING': [nan, nan, nan, nan, nan], 'CHROMOSOMAL_LOCATION': [nan, nan, nan, nan, nan], 'CYTOBAND': [nan, nan, nan, nan, nan], 'DESCRIPTION': [nan, nan, nan, nan, nan], 'GO_ID': [nan, nan, nan, nan, nan], 'SEQUENCE': [nan, nan, nan, nan, nan], 'SPOT_ID.1': [nan, nan, nan, nan, nan], 'ORDER': [1.0, 2.0, 3.

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Annotation columns to use: 'ID' holds the same numeric row IDs as the
#    expression table index, 'GENE_SYMBOL' holds the gene symbol (both appear
#    in the annotation preview printed in Step 5).
identifier_key = 'ID'
gene_symbol_key = 'GENE_SYMBOL'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library.
#    NOTE(review): `gene_data` is overwritten with its own mapped version, so
#    re-running this cell alone feeds already-mapped data back in — re-run
#    Step 3 first to restore the probe-level table.
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index of the mapped gene data and save
#    the result as this cohort's gene-level table.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Hypothyroidism/gene_data/GSE75678.csv'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical features selected in Step 2 with the normalized gene data.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and the demographic attributes in the data
#    are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Hypothyroidism')

# 4. Save the final cohort information, including the bias verdict.
save_cohort_info('GSE75678', './preprocessed/Hypothyroidism/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. Only an unbiased trait is worth keeping: save the merged data to a csv file.
    csv_path = './preprocessed/Hypothyroidism/GSE75678.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Hypothyroidism', the least common label is '1.0' with 1 occurrences. This represents 1.85% of the dataset.
The distribution of the feature 'Hypothyroidism' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 43.0
  50% (Median): 49.5
  75%: 56.75
Min: 31.0
Max: 87.0
The distribution of the feature 'Age' in this dataset is fine.

A new JSON file was created at: ./preprocessed/Hypothyroidism/cohort_info.json
