In [1]:

import sys
# Extend the module search path with the project checkout; presumably this is what lets
# `utils.preprocess` (imported in the next cell) resolve -- TODO confirm the package lives here.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Retinoblastoma/GSE110811'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull the series-level background lines and the per-sample characteristic lines
# out of the matrix file, selected by their line prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")
    

import numpy as np  # Import numpy to handle NaN values


Background Information:
!Series_title	"Distinct Gene Expression Profiles Define Anaplastic Grade in Retinoblastoma"
!Series_summary	"Morbidity and mortality associated with retinoblastoma have decreased drastically in recent decades, in large part due to better prediction of high-risk disease and appropriate treatment stratification. High-risk histopathologic features and severe anaplasia both predict the need for more aggressive treatment; however, not all centers are able to easily assess tumor samples for degree of anaplasia. Instead, identification of genetic signatures able to distinguish among anaplastic grades and thus predict high versus low risk retinoblastoma would facilitate appropriate risk stratification in a wider patient population. A better understanding of genes dysregulated in anaplasia would also yield valuable insights into pathways underlying the development of more severe retinoblastoma. Here, we present the histopathologic and gene expression analysis of 28 retin

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: no clinical row and no converter is assumed until set explicitly below.
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Gene expression data is treated as available for this dataset.
is_gene_available = True

# Variable availability and data type conversion.
# Retinoblastoma trait: row 1 is used because the anaplasia grade indicates
# retinoblastoma severity.
trait_row = 1

def convert_trait(value):
    """Convert an anaplasia-grade annotation into a binary trait label.

    Args:
        value: One cell of the trait row. Expected to be a string containing
            'Mild', 'Moderate' or 'Severe'; anything else is treated as missing.

    Returns:
        0 if the text contains 'Mild', 1 if it contains 'Moderate' or 'Severe'
        (for the study, Moderate/Severe are considered positive), otherwise None.
    """
    # Only strings can carry a grade label. This covers None/NaN and also guards
    # against numeric cells, for which a substring test would raise TypeError.
    if not isinstance(value, str):
        return None
    if 'Mild' in value:
        return 0
    if 'Moderate' in value or 'Severe' in value:
        return 1
    return None

# Age and gender are not available in this cohort, so no row is selected for either.
age_row, gender_row = None, None


def convert_age(value):
    """Placeholder age converter: not applicable here, so every input maps to None."""
    return None


def convert_gender(value):
    """Placeholder gender converter: not applicable here, so every input maps to None."""
    return None

# Save Metadata: record whether gene data and the trait row are available for this cohort.
save_cohort_info('GSE110811', './preprocessed/Retinoblastoma/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction: only possible when a trait row was identified.
if trait_row is not None:
    import os  # local import so this cell does not depend on imports made elsewhere

    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Retinoblastoma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Retinoblastoma/trait_data/GSE110811.csv'
    # to_csv does not create missing parent directories, so make sure the target exists.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3017123': [0], 'GSM3017124': [0], 'GSM3017125': [0], 'GSM3017126': [0], 'GSM3017127': [0], 'GSM3017128': [0], 'GSM3017129': [1], 'GSM3017130': [1], 'GSM3017131': [1], 'GSM3017132': [1], 'GSM3017133': [1], 'GSM3017134': [1], 'GSM3017135': [1], 'GSM3017136': [1], 'GSM3017137': [1], 'GSM3017138': [1], 'GSM3017139': [1], 'GSM3017140': [1], 'GSM3017141': [1], 'GSM3017142': [1], 'GSM3017143': [1], 'GSM3017144': [1], 'GSM3017145': [1], 'GSM3017146': [1], 'GSM3017147': [1], 'GSM3017148': [1], 'GSM3017149': [1], 'GSM3017150': [1], 'GSM3017151': [None], 'GSM3017152': [None], 'GSM3017153': [None], 'GSM3017154': [None], 'GSM3017155': [None], 'GSM3017156': [None]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
# These identifiers are inspected next to decide whether they need mapping to genes.
print(gene_data.index[:20])


Index(['16657445', '16657492', '16657502', '16657506', '16657514', '16657529',
       '16657534', '16657554', '16657572', '16657594', '16657598', '16657647',
       '16657650', '16657652', '16657654', '16657656', '16657680', '16657683',
       '16657713', '16657730'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in the previous step are numeric strings (e.g. '16657445'),
# not gene symbols, so they have to be mapped to genes before further processing.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
# The preview is used in the next step to pick the identifier column and the gene column.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['16657436', '16657440', '16657445', '16657447', '16657450'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': [12190.0, 29554.0, 69091.0, 160446.0, 317811.0], 'RANGE_END': [13639.0, 31109.0, 70008.0, 161525.0, 328581.0], 'total_probes': [25.0, 28.0, 8.0, 13.0, 36.0], 'GB_ACC': ['NR_046018', nan, nan, nan, 'NR_024368'], 'SPOT_ID': ['chr1:12190-13639', 'chr1:29554-31109', 'chr1:69091-70008', 'chr1:160446-161525', 'chr1:317811-328581'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10']}


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Determine which keys correspond to IDs and gene symbols
identifier_key = 'ID'
# NOTE(review): in the annotation preview, 'SPOT_ID' holds genomic coordinates
# (e.g. 'chr1:12190-13639'), not gene symbols, and none of the previewed columns
# contains gene symbols ('GB_ACC' holds accessions such as 'NR_046018').
# Mapping on this column is therefore expected to yield no genes; a proper fix needs an
# external accession/coordinate-to-symbol annotation -- TODO confirm and replace.
gene_symbol_key = 'SPOT_ID'

# 2. Get the dataframe storing the mapping between probe IDs and genes
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to get the gene expression dataframe
gene_data = apply_gene_mapping(gene_data, gene_mapping)

# 4. Surface an empty mapping result immediately instead of letting it propagate silently.
if gene_data.empty:
    print(f"WARNING: mapping '{identifier_key}' -> '{gene_symbol_key}' produced no gene-level rows; "
          "the chosen annotation column probably does not contain gene symbols.")


### Step 7: Data Normalization and Merging

In [8]:
import os  # local import so this cell does not depend on imports made elsewhere

cohort_info_path = './preprocessed/Retinoblastoma/cohort_info.json'

# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Retinoblastoma/gene_data/GSE110811.csv'
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

if normalized_gene_data.empty:
    # No gene-level rows are left, so there is nothing to merge or analyse. Record the cohort
    # as having no gene data instead of claiming availability with hard-coded flags.
    print("No usable gene symbols after mapping and normalization; recording GSE110811 as having no gene data.")
    save_cohort_info('GSE110811', cohort_info_path, False, True)
else:
    # 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
    merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

    # 3. Determine whether the trait and some demographic attributes in the data are severely biased, and remove biased attributes.
    trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Retinoblastoma')

    # 4. Save the cohort information, including whether the trait is biased.
    save_cohort_info('GSE110811', cohort_info_path, True, True, trait_biased, merged_data)

    # 5. If the trait is not severely biased, save the merged data to a csv file.
    if not trait_biased:
        csv_path = './preprocessed/Retinoblastoma/GSE110811.csv'
        unbiased_merged_data.to_csv(csv_path)


No gene data in the dataframe
