In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Breast_Cancer/GSE248830'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Header prefixes that select the series-level background lines and the
# per-sample clinical lines from the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Collapse every sample-characteristics row to its set of unique values.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of metadata so the next step can be decided from the output.
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Unlocking Molecular mechanisms and identifying druggable targets in matched-paired brain metastasis of Breast and Lung cancers "
!Series_summary	"Introduction: The incidence of brain metastases in cancer patients is increasing, with lung and breast cancer being the most common sources. Despite advancements in targeted therapies, the prognosis remains poor, highlighting the importance to investigate the underlying mechanisms in brain metastases. The aim of this study was to investigate the differences in the molecular mechanisms involved in brain metastasis of breast and lung cancers. In addition, we aimed to identify cancer lineage-specific druggable targets in the brain metastasis. Methods: To that aim, a cohort of 44 FFPE tissue samples, including 22 breast cancer and 22 lung adenocarcinoma (LUAD) and their matched-paired brain metastases were collected. Targeted gene expression profiles of primary tumors were compared to their matched-paired br

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Template defaults: everything starts out as "unavailable" and is switched on
# below once the cohort metadata has been inspected.
is_gene_available = False
trait_row = None
age_row = None
gender_row = None
convert_trait = None
convert_age = None
convert_gender = None

# Unique values per sample-characteristics row (hard-coded here; presumably
# transcribed from the Step 1 printout -- verify against the live data).
sample_characteristics = {
    0: ['age at diagnosis: 49', 'age at diagnosis: 44', 'age at diagnosis: 41', 'age at diagnosis: 40', 'age at diagnosis: 48', 'age at diagnosis: 42', 'age at diagnosis: 47', 'age at diagnosis: 53', 'age at diagnosis: 74', 'age at diagnosis: 58', 'age at diagnosis: 51', 'age at diagnosis: 55', 'age at diagnosis: 46', 'age at diagnosis: 59', 'age at diagnosis: 50', 'age at diagnosis: 57', 'age at diagnosis: 60', 'age at diagnosis: 69', 'age at diagnosis: n.a.', 'age at diagnosis: 65', 'age at diagnosis: 37', 'age at diagnosis: 63', 'age at diagnosis: 70', 'age at diagnosis: 66', 'age at diagnosis: 64'],
    1: ['Sex: female', 'Sex: male'],
    2: ['histology: TNBC', 'histology: ER+ PR+ HER2-', 'histology: Unknown', 'histology: ER- PR- HER2+', 'histology: ER+ PR-HER2+', 'histology: ER+ PR- HER2-', 'histology: ER- PR+ HER2-', 'histology: adenocarcinoma'],
    3: ['smoking status: n.a', 'smoking status: former-smoker', 'smoking status: smoker', 'smoking status: Never smoking', 'smoking status: unknown', 'smoking status: former-roker'],
    4: ['treatment after surgery of bm: surgery + chemotherpy', 'treatment after surgery of bm: surgery +  chemotherpy + Radiotherapy', 'treatment after surgery of bm: surgery + chemotherapy + Radiotherapy', 'treatment after surgery of bm: surgery', 'treatment after surgery of bm: surgery +  chemotherapy + Radiotherapy', 'treatment after surgery of bm: surgery + chemotherapy', 'treatment after surgery of bm: surgery + chemotherpy + Radiotherapy', 'treatment after surgery of bm: surgery + chemotheapy + Radiotherapy', 'treatment after surgery of bm: Chemoterapy', 'treatment after surgery of bm: Radiotherapy & Chemoterapy', 'treatment after surgery of bm: Radiotherapy', 'treatment after surgery of bm: Other', 'treatment after surgery of bm: Surgery & Chemotherapy & Radiotherapy', 'treatment after surgery of bm: surgery & Radiotherapy', 'treatment after surgery of bm: surgery & Radiochemotherapy', 'treatment after surgery of bm: No treatment', 'treatment after surgery of bm: WBRT', 'treatment after surgery of bm: SRT'],
}

# The series summary and design describe gene expression profiling, so the
# dataset is treated as containing gene expression data.
is_gene_available = True

# A demographic row is only usable when it carries more than one distinct value.
age_row = None
if len({entry.split(": ")[1] for entry in sample_characteristics[0]}) > 1:
    age_row = 0

gender_row = None
if len({entry.split(": ")[1] for entry in sample_characteristics[1]}) > 1:
    gender_row = 1

# Row 2 (histology) separates the breast-cancer subtypes from adenocarcinoma
# and is therefore used as the 'Breast_Cancer' trait row.
trait_row = 2

# Define conversion functions
def convert_trait(value):
    """Convert a 'histology: <label>' cell into the binary Breast_Cancer trait.

    Args:
        value: Raw sample-characteristics cell, e.g. 'histology: TNBC'.

    Returns:
        1 for a breast-cancer receptor subtype, 0 for 'adenocarcinoma', and
        None for 'Unknown', unrecognised labels, or non-string input.
    """
    # Missing cells may arrive as None/NaN; treat them as unknown instead of
    # raising AttributeError on .split().
    if not isinstance(value, str):
        return None

    # Keep everything after the first colon; a cell without a 'header:' prefix
    # is taken as the label itself rather than raising IndexError.
    label = value.split(":", 1)[-1]

    # Compare on a whitespace-free, case-folded key so that variants such as
    # 'ER+ PR-HER2+' / 'ER+ PR- HER2+' or a trailing space still match.
    key = "".join(label.split()).casefold()

    mapping = {
        'tnbc': 1,
        'er+pr+her2-': 1,
        'er-pr-her2+': 1,
        'er+pr-her2+': 1,
        'er+pr-her2-': 1,
        'er-pr+her2-': 1,
        'adenocarcinoma': 0,
        'unknown': None,
    }
    return mapping.get(key)

def convert_age(value):
    """Convert an 'age at diagnosis: <number>' cell into a float age.

    Args:
        value: Raw sample-characteristics cell, e.g. 'age at diagnosis: 49'.

    Returns:
        The age as a float, or None when the cell is not a string or the text
        after the colon is not numeric (e.g. 'n.a.').
    """
    # Missing cells may arrive as None/NaN; treat them as unknown.
    if not isinstance(value, str):
        return None

    # Keep everything after the first colon; a header-less cell is parsed as-is
    # instead of raising IndexError.
    text = value.split(":", 1)[-1].strip()
    try:
        return float(text)
    except ValueError:
        # Covers 'n.a.' and any other non-numeric placeholder.
        return None

def convert_gender(value):
    """Convert a 'Sex: <label>' cell into a binary gender code.

    Args:
        value: Raw sample-characteristics cell, e.g. 'Sex: female'.

    Returns:
        0 for female, 1 for male, and None for any other label or non-string
        input.
    """
    # Missing cells may arrive as None/NaN; treat them as unknown.
    if not isinstance(value, str):
        return None

    # Keep everything after the first colon and compare case-insensitively so
    # 'Sex: Male ' is handled like 'Sex: male'.
    label = value.split(":", 1)[-1].strip().casefold()
    mapping = {
        'female': 0,
        'male': 1,
    }
    return mapping.get(label)

# Save metadata: record whether gene data and the trait are available for this cohort.
cohort_info_path = './preprocessed/Breast_Cancer/cohort_info.json'
# Make sure the output directory exists so a fresh checkout does not fail on write.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE248830', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical feature extraction: only possible when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Breast_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Breast_Cancer/trait_data/GSE248830.csv'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM7920782': [1.0, 49.0, 0.0], 'GSM7920783': [1.0, 44.0, 0.0], 'GSM7920784': [nan, 41.0, 0.0], 'GSM7920785': [1.0, 40.0, 0.0], 'GSM7920786': [1.0, 48.0, 0.0], 'GSM7920787': [nan, 42.0, 0.0], 'GSM7920788': [1.0, 47.0, 0.0], 'GSM7920789': [1.0, 53.0, 0.0], 'GSM7920790': [1.0, 41.0, 0.0], 'GSM7920791': [1.0, 74.0, 0.0], 'GSM7920792': [1.0, 58.0, 0.0], 'GSM7920793': [1.0, 51.0, 0.0], 'GSM7920794': [1.0, 55.0, 0.0], 'GSM7920795': [nan, 46.0, 0.0], 'GSM7920796': [1.0, 46.0, 0.0], 'GSM7920797': [1.0, 48.0, 0.0], 'GSM7920798': [1.0, 44.0, 0.0], 'GSM7920799': [1.0, 49.0, 0.0], 'GSM7920800': [1.0, 59.0, 0.0], 'GSM7920801': [1.0, 50.0, 0.0], 'GSM7920802': [1.0, 74.0, 0.0], 'GSM7920803': [1.0, 46.0, 0.0], 'GSM7920804': [nan, 40.0, 0.0], 'GSM7920805': [nan, 57.0, 1.0], 'GSM7920806': [nan, 60.0, 1.0], 'GSM7920807': [nan, 55.0, 0.0], 'GSM7920808': [nan, 69.0, 0.0], 'GSM7920809': [None, None, 0], 'GSM7920810': [None, None, 1], 'GSM7920811': [nan, 57.0, 1.0], 'GSM7920812': [None, None, 0], 'GSM792081

### Step 3: Gene Data Extraction

In [4]:
# Load the expression table from the series-matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers; the next step inspects them to decide
# whether they need to be mapped to gene symbols.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['A2M', 'ACVR1C', 'ADAM12', 'ADGRE1', 'ADM', 'ADORA2A', 'AKT1', 'ALDOA',
       'ALDOC', 'ANGPT1', 'ANGPT2', 'ANGPTL4', 'ANLN', 'APC', 'APH1B', 'API5',
       'APLNR', 'APOE', 'APOL6', 'AQP9'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. 'A2M', 'ACVR1C', 'ADAM12') already look
# like gene symbols, so no identifier-to-gene mapping is requested.
requires_gene_mapping = False


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the platform annotation table from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Print a labelled preview of the annotation table.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)

# requires_gene_mapping is False, so STEP6 is skipped and the notebook
# continues with the following steps.


Gene annotation preview:
{'ID': ['A2M', 'ACVR1C', 'ADAM12', 'ADGRE1', 'ADM'], 'NS Probe ID': ['NM_000014.4:1685', 'NM_145259.2:5168', 'NM_003474.5:638', 'NM_001256252.1:1130', 'NM_001124.2:1413'], 'GB_ACC': ['NM_000014.4', 'NM_145259.2', 'NM_003474.5', 'NM_001256252.1', 'NM_001124.2'], 'Class Name': ['Endogenous', 'Endogenous', 'Endogenous', 'Endogenous', 'Endogenous'], 'Analyte Type': ['mRNA', 'mRNA', 'mRNA', 'mRNA', 'mRNA'], 'SPOT_ID': [nan, nan, nan, nan, nan]}


### Step 7: Data Normalization and Merging

In [7]:
# 1. Normalize the gene symbols in the index of the gene data and save the result.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Breast_Cancer/gene_data/GSE248830.csv'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical features extracted in Step 2 with the normalized gene data.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and the demographic attributes are severely
#    biased, and remove the biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Breast_Cancer')

# 4. Save the cohort information, including the trait_biased verdict.
# NOTE(review): merged_data (before biased attributes were removed) is passed
# here; confirm whether save_cohort_info should receive unbiased_merged_data.
save_cohort_info('GSE248830', './preprocessed/Breast_Cancer/cohort_info.json', True, True, trait_biased, merged_data)

# 5. Only keep the merged dataset on disk when the trait is not severely biased.
if not trait_biased:
    csv_path = './preprocessed/Breast_Cancer/GSE248830.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


Quartiles for 'Breast_Cancer':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1.0
Max: 1.0
The distribution of the feature 'Breast_Cancer' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 46.0
  50% (Median): 49.0
  75%: 54.0
Min: 40.0
Max: 74.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 19 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.

