In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Gastroesophageal_reflux_disease_(GERD)/GSE28302'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Whole genome expression array profiling highlights differences in mucosal defense genes in Barrett's esophagus and esophageal adenocarcinoma."
!Series_summary	"Esophageal adenocarcinoma (EAC) has become a major concern in Western countries due to rapid rises in incidence coupled with very poor survival rates. One of the key risk factors for the development of this cancer is the presence of Barrett’s esophagus (BE), which is believed to form in response to repeated gastro-esophageal reflux. In this study we performed comparative, genome-wide expression profiling (using Illumina whole-genome Beadarray) on total RNA extracted from esophageal biopsy tissues from individuals with EAC, BE (in the absence of EAC) and those with normal squamous epithelium. We combined these data with publically accessible raw data from three similar studies to investigate key gene and ontology differences between these three tissue states. The results support the deductio

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check if gene expression data is available
is_gene_available = True  # Based on the summary, this dataset uses Illumina whole-genome Beadarray for gene expression profiling.

# Check availability and determine keys for variables
keys_dict = {
    'Gastroesophageal_reflux_disease_(GERD)': 0,  # Based on tissue type which indicates GERD
    'age': 4,  # Age is explicitly mentioned
    'gender': 3  # Gender is explicitly mentioned
}

# Assign keys to variables
trait_row = keys_dict.get('Gastroesophageal_reflux_disease_(GERD)', None)
age_row = keys_dict.get('age', None)
gender_row = keys_dict.get('gender', None)

# Function to convert trait
def convert_trait(value):
    if value.lower().startswith('tissue type:'):
        if 'normal esophageal squamous' in value.lower():
            return 0
        elif "barrett's esophagus (without dysplasia)" in value.lower():
            return 1
        elif 'esophageal adenocarcinoma tumor' in value.lower():
            return 2
    return None

# Function to convert age
def convert_age(value):
    try:
        parts = value.split(':')
        if len(parts) == 2:
            return float(parts[1].strip())
    except:
        return None

# Function to convert gender
def convert_gender(value):
    if value.lower().startswith('subject gender:'):
        gender = value.split(':')[1].strip().lower()
        if gender == 'female':
            return 0
        elif gender == 'male':
            return 1
    return None

# Save cohort information
save_cohort_info('GSE28302', './preprocessed/Gastroesophageal_reflux_disease_(GERD)/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction if clinical data is available
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Gastroesophageal_reflux_disease_(GERD)', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Gastroesophageal_reflux_disease_(GERD)/trait_data/GSE28302.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM700266': [0.0, 73.0, 0.0], 'GSM700267': [0.0, 55.0, 1.0], 'GSM700268': [0.0, 66.0, 0.0], 'GSM700269': [0.0, 21.0, 1.0], 'GSM700270': [0.0, 48.0, 1.0], 'GSM700271': [0.0, 41.0, 0.0], 'GSM700272': [0.0, 31.0, 0.0], 'GSM700273': [0.0, 80.0, 1.0], 'GSM700274': [0.0, 45.0, 0.0], 'GSM700275': [1.0, 48.0, 0.0], 'GSM700276': [1.0, 75.0, 1.0], 'GSM700277': [1.0, 60.0, 0.0], 'GSM700278': [1.0, 72.0, 1.0], 'GSM700279': [1.0, 56.0, 1.0], 'GSM700280': [1.0, 80.0, 1.0], 'GSM700281': [1.0, 60.0, 0.0], 'GSM700282': [1.0, 47.0, 1.0], 'GSM700283': [1.0, 78.0, 1.0], 'GSM700284': [1.0, 45.0, 0.0], 'GSM700285': [1.0, 65.0, 1.0], 'GSM700286': [1.0, 68.0, 0.0], 'GSM700287': [1.0, 47.0, 1.0], 'GSM700288': [1.0, 43.0, 1.0], 'GSM700289': [1.0, 68.0, 0.0], 'GSM700290': [1.0, 67.0, 0.0], 'GSM700291': [1.0, 69.0, 1.0], 'GSM700292': [1.0, 78.0, 1.0], 'GSM700293': [1.0, 48.0, 1.0], 'GSM700294': [1.0, 57.0, 1.0], 'GSM700295': [1.0, 77.0, 0.0], 'GSM700296': [1.0, 47.0, 1.0], 'GSM700297': [2.0, 61.0, 1.0], 'GSM700

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])


Index(['GI_10047089-S', 'GI_10047091-S', 'GI_10047093-S', 'GI_10047099-S',
       'GI_10047103-S', 'GI_10047123-S', 'GI_10047133-A', 'GI_10047133-I',
       'GI_10092578-S', 'GI_10092585-S', 'GI_10092596-S', 'GI_10092600-S',
       'GI_10092602-S', 'GI_10092611-A', 'GI_10092616-S', 'GI_10092618-S',
       'GI_10092638-S', 'GI_10092658-S', 'GI_10092668-S', 'GI_10092672-S'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['GI_10047089-S', 'GI_10047091-S', 'GI_10047093-S', 'GI_10047099-S', 'GI_10047103-S'], 'SequenceSource': ['RefSeq', 'RefSeq', 'RefSeq', 'RefSeq', 'RefSeq'], 'GB_ACC': ['NM_014332.1', 'NM_013259.1', 'NM_016299.1', 'NM_016303.1', 'NM_016305.1'], 'Annotation Date': [nan, nan, nan, nan, nan], 'SPOT_ID': [nan, nan, nan, nan, nan]}


### Step 6: Gene Identifier Mapping

In [7]:
if requires_gene_mapping:
    # 1. Identify the keys for probe IDs and gene symbols
    identifier_key = 'ID'
    gene_symbol_key = 'GB_ACC'
    
    # 2. Get the dataframe storing the mapping between probe IDs and genes
    gene_mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    
    # 3. Apply the mapping to get the gene expression dataframe
    gene_data = apply_gene_mapping(gene_data, gene_mapping_df)


normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Gastroesophageal_reflux_disease_(GERD)/gene_data/GSE28302.csv'
normalized_gene_data.to_csv(gene_csv_path)

merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Gastroesophageal_reflux_disease_(GERD)')

if not trait_biased:
    save_cohort_info('GSE28302', './preprocessed/Gastroesophageal_reflux_disease_(GERD)/cohort_info.json', True, True, trait_biased, unbiased_merged_data)
    csv_path = './preprocessed/Gastroesophageal_reflux_disease_(GERD)/GSE28302.csv'
    unbiased_merged_data.to_csv(csv_path)
else:
    save_cohort_info('GSE28302', './preprocessed/Gastroesophageal_reflux_disease_(GERD)/cohort_info.json', True, True, trait_biased, merged_data)


No gene data in the dataframe
