In [1]:

import sys
# Extend the module search path so the `utils` package imported in the next
# cell can be resolved (presumably it lives under this directory — verify).
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1a: resolve the SOFT file and the series-matrix file of this cohort.
cohort_dir = '/media/techt/DATA/GEO/Adrenocortical_Cancer/GSE76019'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 1b: read the matrix file, selecting lines by prefix: series-level
# lines become the background text, sample-level lines the clinical table.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 1c: build the per-row dictionary of sample characteristics
# (row index -> values seen in that row, as shown in the printed output).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 1d: print both pieces of information, each under its own heading,
# so they can be reviewed in the following step.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profiling of pediatric adrenocortical tumors of patients treated on the Children's Oncology Group XXX protocol."
!Series_summary	"We have previously observed that expression of HLA genes associate with histology of adrenocortical tumors (PMID 17234769)."
!Series_summary	"Here, we used gene expression microarrays to associate the diagnostic tumor expression of these genes with outcome among 34 patients treated on the COG ARAR0332 protocol."
!Series_overall_design	"We used microarrays to explore the expression profiles of a large group of uniformly-treated pediatric adrenocortical carcinomas."
!Series_overall_design	"Specimens were harvested during surgery and snap frozen in liquid nitrogen to preserve tissue integrity."
Sample Characteristics Dictionary:
{0: ['histology: ACC'], 1: ['Stage: III', 'Stage: I', 'Stage: II', 'Stage: IV'], 2: ['efs.time: 5.07323750855578', 'efs.time: 5.17453798767967', 'efs.time: 4.33127994524298', 'efs.t

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Decide, from the sample-characteristics dictionary built in Step 1, whether
# gene expression data is available and which rows (if any) hold the trait,
# age and gender. The convert_* functions are defined right after this block.

# Reuse the dictionary computed in Step 1 instead of a hand-pasted snapshot of
# its printed output, which could silently drift from the actual data.
sample_characteristics = sample_characteristics_dict

# The series describes gene expression microarray profiling, so expression
# data is treated as available.
is_gene_available = True

# Row 0 holds the histology field; it is used as the trait row only when it
# actually contains the 'histology: ACC' entry (.get guards a missing row 0).
# NOTE(review): row 0 has a single unique value, so every sample receives the
# same trait label; the bias check in Step 7 reports the trait as severely
# biased for exactly this reason.
trait_row = 0 if 'histology: ACC' in sample_characteristics.get(0, []) else None
age_row = None  # Not found in sample_characteristics dictionary
gender_row = None  # Not found in sample_characteristics dictionary
# Data Type Conversion Functions
def convert_trait(value):
    """Convert a raw histology characteristic into a binary trait label.

    Args:
        value: Raw cell content such as 'histology: ACC'. When a colon is
            present, the text before the first colon is treated as the field
            name and discarded.

    Returns:
        1 if the remaining text is exactly 'ACC', 0 for any other text, and
        None when the cell is not a string (e.g. a missing value).
    """
    if not isinstance(value, str):
        # Missing cells (None / NaN) are not strings; report them as unknown
        # rather than failing on the substring test below.
        return None
    if ':' in value:
        # Split on the first colon only so colons inside the value survive.
        value = value.split(':', 1)[1].strip()
    return 1 if value == 'ACC' else 0

def convert_age(value):
    """Convert a raw age characteristic into a float.

    This cohort has no age row, so the function is only a placeholder passed
    to the clinical feature selection call.

    Args:
        value: Raw cell content such as 'age: 12.5'. When a colon is present,
            the text before the first colon is treated as the field name and
            discarded.

    Returns:
        The age as a float, or None when the cell is not a string or the
        remaining text cannot be parsed as a number.
    """
    if not isinstance(value, str):
        # Missing cells (None / NaN) are not strings; treat them as unknown.
        return None
    if ':' in value:
        # Split on the first colon only so the value is not silently truncated.
        value = value.split(':', 1)[1].strip()
    try:
        return float(value)
    except ValueError:
        return None

def convert_gender(value):
    """Convert a raw gender characteristic into a binary code.

    This cohort has no gender row, so the function is only a placeholder
    passed to the clinical feature selection call.

    Args:
        value: Raw cell content such as 'gender: Male'. When a colon is
            present, the text before the first colon is treated as the field
            name and discarded.

    Returns:
        1 for 'male'/'m', 0 for 'female'/'f' (case-insensitive), and None for
        anything else, including non-string cells.
    """
    if not isinstance(value, str):
        # Missing cells (None / NaN) are not strings; treat them as unknown.
        return None
    if ':' in value:
        # Split on the first colon only so the value is not silently truncated.
        value = value.split(':', 1)[1].strip()
    lowered = value.lower()
    if lowered in ('male', 'm'):
        return 1
    if lowered in ('female', 'f'):
        return 0
    return None

# Record the initial availability flags for this cohort.
has_trait = trait_row is not None
save_cohort_info(
    'GSE76019',
    './preprocessed/Adrenocortical_Cancer/cohort_info.json',
    is_gene_available,
    has_trait,
)

# Clinical features are extracted only when a trait row was identified.
if has_trait:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Adrenocortical_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Write the selected clinical table to disk, then show a preview of it.
    csv_path = './preprocessed/Adrenocortical_Cancer/trait_data/GSE76019.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1972883': [1], 'GSM1972884': [1], 'GSM1972885': [1], 'GSM1972886': [1], 'GSM1972887': [1], 'GSM1972888': [1], 'GSM1972889': [1], 'GSM1972890': [1], 'GSM1972891': [1], 'GSM1972892': [1], 'GSM1972893': [1], 'GSM1972894': [1], 'GSM1972895': [1], 'GSM1972896': [1], 'GSM1972897': [1], 'GSM1972898': [1], 'GSM1972899': [1], 'GSM1972900': [1], 'GSM1972901': [1], 'GSM1972902': [1], 'GSM1972903': [1], 'GSM1972904': [1], 'GSM1972905': [1], 'GSM1972906': [1], 'GSM1972907': [1], 'GSM1972908': [1], 'GSM1972909': [1], 'GSM1972910': [1], 'GSM1972911': [1], 'GSM1972912': [1], 'GSM1972913': [1], 'GSM1972914': [1], 'GSM1972915': [1], 'GSM1972916': [1]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
# `matrix_file` comes from Step 1; the resulting frame is indexed by the
# platform's row IDs (the index is named 'ID' in the printed output).
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
# Step 4 inspects these IDs to decide whether they must be mapped to gene symbols.
print(gene_data.index[:20])


Index(['1007_PM_s_at', '1053_PM_at', '117_PM_at', '121_PM_at', '1255_PM_g_at',
       '1294_PM_at', '1316_PM_at', '1320_PM_at', '1405_PM_i_at', '1431_PM_at',
       '1438_PM_at', '1487_PM_at', '1494_PM_f_at', '1552256_PM_a_at',
       '1552257_PM_a_at', '1552258_PM_at', '1552261_PM_at', '1552263_PM_at',
       '1552264_PM_a_at', '1552266_PM_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '1007_PM_s_at') are probe-style
# identifiers rather than gene symbols, so a mapping step is required.
# NOTE(review): no later cell shown here reads this flag; Step 6 performs the
# mapping unconditionally.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
# `soft_file` was located in Step 1; the annotation table is what Step 6 uses
# to find the identifier column and the gene-symbol column.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
# The preview prints a dictionary of column name -> first few values, which is
# used to choose the mapping columns.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1007_PM_s_at', '1053_PM_at', '117_PM_at', '121_PM_at', '1255_PM_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Aug 20, 2010', 'Aug 20, 2010', 'Aug 20, 2010', 'Aug 20, 2010', 'Aug 20, 2010'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identifying the keys for mapping
identifier_key = 'ID'
gene_symbol_key = 'Gene Symbol'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping and name the resulting gene expression dataframe "gene_data"
gene_data = apply_gene_mapping(gene_data, mapping_df)

print(preview_df(gene_data))


{'GSM1972883': [8.146157993, 7.723733924, 8.106406367, 2.661259618, 2.790370222], 'GSM1972884': [10.97781013, 8.084655772, 7.817518297, 2.671671748, 2.759888745], 'GSM1972885': [10.81797395, 6.997038152, 8.597533433, 2.59328481, 2.8383738965000003], 'GSM1972886': [8.328434774, 6.348997492, 7.436279129, 2.5778280175, 2.8667040994999997], 'GSM1972887': [7.702278033, 7.657458655, 7.458639933, 2.4051971705, 2.899911265], 'GSM1972888': [9.928566824, 5.774165731, 6.967079491, 2.6401313715, 2.740074608], 'GSM1972889': [10.18914043, 6.13370338, 7.762308472, 2.4879254375, 2.7641213645000002], 'GSM1972890': [10.4183845, 4.234779993, 7.919931878, 2.5828736645, 2.775006845], 'GSM1972891': [10.70367552, 5.267539176, 7.028197349, 2.530570785, 2.780008706], 'GSM1972892': [6.957320173, 5.438076322, 8.076595302, 2.657053212, 3.1043811005], 'GSM1972893': [11.67347203, 6.112399541, 7.244518496, 2.6197241179999997, 2.878003272], 'GSM1972894': [10.83962574, 5.129350437, 6.579514723, 2.658415803, 2.92980201

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
# The normalized table is written to disk before merging.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Adrenocortical_Cancer/gene_data/GSE76019.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
# `selected_clinical_data` only exists when Step 2 found a trait row.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and any demographic attributes in the data are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Adrenocortical_Cancer')

# The cohort information is saved unconditionally; the merged data is saved only if the trait is not severely biased.

# 4. Save the cohort information.
# NOTE(review): `merged_data` (not `unbiased_merged_data`) is passed here — confirm this is intended.
save_cohort_info('GSE76019', './preprocessed/Adrenocortical_Cancer/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Adrenocortical_Cancer/GSE76019.csv'
    unbiased_merged_data.to_csv(csv_path)


Quartiles for 'Adrenocortical_Cancer':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1.0
Max: 1.0
The distribution of the feature 'Adrenocortical_Cancer' in this dataset is severely biased.

