In [1]:

import os
import sys

sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Retinoblastoma/GSE58780'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes that select the series-level background text and the
# per-sample characteristics when reading the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Sample characteristics dictionary derived from the clinical dataframe;
# the next step reads it to choose the trait / age / gender rows.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print a header followed by the value for both pieces of information.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Retinoblastoma gene expression data"
!Series_summary	"Retinoblastoma is the most common intraocular cancer of infancy and childhood, with an incidence of one case per 15,000 - 20,000 live births. An early event in retinoblastoma genesis is a functional loss of both alleles of the RB1 gene. However, other genes are likely to be involved in the development of this cancer. In this study we sought to build a comprehensive molecular portrait of this cancer by performing transcriptomic, methylomic, genomic profiling of primary retinoblastoma samples. Most of the patients whose tumors were studied had received no treatment prior to surgical enucleation."
!Series_overall_design	"Gene expression of 63 samples retinoblastomas tumor and 3 fetal retina were assesed using the Hg-U133 Plus 2.0 Affymetrix array"
Sample Characteristics Dictionary:
{0: ['geo dataset serie: SAMPLE 1', 'geo dataset serie: SAMPLE 2', 'geo dataset serie: SAMPLE 4', 'geo dataset serie:

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data is available: the series design states that samples
# were profiled on the Hg-U133 Plus 2.0 Affymetrix array.
is_gene_available = True

# Row keys in the sample characteristics; None means "not available".
trait_row = age_row = gender_row = None

# Use the dictionary computed from the matrix file in Step 1 instead of a
# hand-pasted copy of its printed output, which can drift from the real data.
sample_characteristics = sample_characteristics_dict

# Retinoblastoma (trait_row): the first row with a value mentioning the trait.
# str() guards against non-string entries (e.g. NaN) among the unique values.
trait_row = next(
    (
        key
        for key, values in sample_characteristics.items()
        if any('retinoblastoma' in str(value).lower() for value in values)
    ),
    None,
)

# Age (age_row)
# No suitable key found for age based on the printed characteristics.
age_row = None

# Gender (gender_row)
# No suitable key found for gender based on the printed characteristics.
gender_row = None

# Define conversion functions
def convert_trait(value):
    """Convert a raw tissue characteristic into a binary retinoblastoma label.

    Args:
        value: A sample-characteristics cell such as 'tissue: retinoblastoma'.
            Non-string cells (e.g. None or NaN for a missing entry) are
            accepted and treated as missing.

    Returns:
        1 if the value names retinoblastoma, 0 if it names fetal retina, and
        None when the value is missing or names neither tissue.
    """
    # Missing cells are not strings; calling .split on them would raise.
    if not isinstance(value, str):
        return None
    # Keep everything after the first ':' so a value without a 'header:' part
    # no longer raises IndexError and extra colons do not cut the value short.
    val = value.split(':', 1)[-1].strip().lower()
    if 'retinoblastoma' in val:
        return 1
    if 'fetal retina' in val:
        return 0
    return None

# Age and gender rows are not available for this cohort, so their converters
# are placeholders that always return None.
convert_age = lambda x: None
convert_gender = lambda x: None

# Save cohort information: the gene-availability flag and whether a trait row
# was found. Ensure the target folder exists before anything is written there.
os.makedirs('./preprocessed/Retinoblastoma', exist_ok=True)
save_cohort_info('GSE58780', './preprocessed/Retinoblastoma/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction, only if trait data is available.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Retinoblastoma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Retinoblastoma/trait_data/GSE58780.csv'
    # DataFrame.to_csv does not create missing parent directories; create the
    # output folder so a run on a fresh checkout does not fail here.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM5121283': [1], 'GSM5121284': [1], 'GSM5121285': [1], 'GSM5121286': [1], 'GSM5121287': [1], 'GSM5121288': [1], 'GSM5121289': [1], 'GSM5121290': [1], 'GSM5121291': [1], 'GSM5121292': [1], 'GSM5121293': [1], 'GSM5121294': [1], 'GSM5121295': [1], 'GSM5121296': [1], 'GSM5121297': [1], 'GSM5121298': [1], 'GSM5121299': [1], 'GSM5121300': [1], 'GSM5121301': [1], 'GSM5121302': [1], 'GSM5121303': [1], 'GSM5121304': [1], 'GSM5121305': [1], 'GSM5121306': [1], 'GSM5121307': [1], 'GSM5121308': [1], 'GSM5121309': [1], 'GSM5121310': [1], 'GSM5121311': [1], 'GSM5121312': [1], 'GSM5121313': [1], 'GSM5121314': [1], 'GSM5121315': [1], 'GSM5121316': [1], 'GSM5121317': [1], 'GSM5121318': [1], 'GSM5121319': [1], 'GSM5121320': [1], 'GSM5121321': [1], 'GSM5121322': [1], 'GSM5121323': [1], 'GSM5121324': [1], 'GSM5121325': [1], 'GSM5121326': [1], 'GSM5121327': [1], 'GSM5121328': [1], 'GSM5121329': [1], 'GSM5121330': [1], 'GSM5121331': [1], 'GSM5121332': [1], 'GSM5121333': [1], 'GSM5121334': [1], 'GSM5121335

### Step 3: Gene Data Extraction

In [4]:
# Load the gene expression data from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers; the following step inspects them to
# decide whether they have to be mapped to gene symbols.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['100009676_at', '10000_at', '10001_at', '10002_at', '100038246_at',
       '10003_at', '100048912_at', '100049716_at', '10004_at', '10005_at',
       '10006_at', '10007_at', '10008_at', '100093630_at', '100093698_at',
       '10009_at', '1000_at', '100101467_at', '100101938_at', '10010_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '10000_at', '1000_at') are probe-set
# style identifiers rather than human gene symbols, so a mapping step is needed.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the gene annotation data from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Preview the annotation so the identifier and gene columns can be chosen
# in the mapping step; header and preview are printed on separate lines.
print("Gene annotation preview:", preview_df(gene_annotation), sep="\n")


Gene annotation preview:
{'ID': ['1_at', '10_at', '100_at', '1000_at', '10000_at'], 'ENTREZ_GENE_ID': ['1', '10', '100', '1000', '10000'], 'Description': ['alpha-1-B glycoprotein', 'N-acetyltransferase 2', 'adenosine deaminase', 'cadherin 2', 'AKT serine/threonine kinase 3'], 'SPOT_ID': ['1_at', '10_at', '100_at', '1000_at', '10000_at']}


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the keys for identifiers and gene symbols
# The annotation's 'ID' column uses the same format as the expression row IDs
# printed in Step 3 (e.g. '10000_at').
identifier_key = 'ID'
# NOTE(review): in the annotation preview, 'Description' holds full gene names
# ('alpha-1-B glycoprotein', 'cadherin 2', ...), not official gene symbols, and
# the only other gene column shown is ENTREZ_GENE_ID. Confirm that
# apply_gene_mapping / normalize_gene_symbols_in_index can recover symbols from
# this text; otherwise an Entrez-ID-to-symbol mapping is presumably needed.
gene_symbol_key = 'Description'

# 2. Get the dataframe storing the mapping between probe IDs and genes using the 'get_gene_mapping' function
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function
# NOTE: this rebinds gene_data to the mapped result, so re-running this cell
# without re-running Step 3 would feed already-mapped data into the mapping.
gene_data = apply_gene_mapping(gene_data, mapping_df)

# Preview the mapped data to check the result of the mapping.
print("Mapped gene data preview:")
print(preview_df(gene_data))


Mapped gene data preview:
{'GSM5121283': [4.175071608686542, 6.684869774254267, 8.28559391007244, 3.908870716333, 2.34327069996438], 'GSM5121284': [4.1576767381015545, 6.30139112936447, 8.16539768354123, 4.11687634788922, 2.27146937829736], 'GSM5121285': [4.1612875264341636, 6.547154998452426, 8.01683809181758, 3.76601489788715, 2.69085275288911], 'GSM5121286': [4.148809080893588, 6.30900969474258, 7.8275205760969, 4.97957215335641, 2.66647336364899], 'GSM5121287': [4.122722529671975, 6.543856241926488, 7.12126325800596, 4.3396273483284, 2.69821050969402], 'GSM5121288': [4.2216905664474025, 6.559659118342911, 7.13279651633661, 4.45394629203781, 2.56891806509564], 'GSM5121289': [4.18415728218948, 6.449994484393806, 6.39116134858068, 4.93328054370578, 2.3047157061124], 'GSM5121290': [4.155785107387193, 6.442174900963833, 7.59574809485074, 5.20344256016961, 2.63311632200553], 'GSM5121291': [4.1335929069294615, 6.542076379638577, 7.07042325893501, 4.28254036948428, 2.4187032583715], 'GSM51

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index'
#    function from the library, then save it.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Retinoblastoma/gene_data/GSE58780.csv'
# DataFrame.to_csv does not create missing parent directories; create the
# output folder so a run on a fresh checkout does not fail here.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and any demographic attributes in the data are
#    severely biased, and remove the biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Retinoblastoma')

# 4. Save the cohort information, including the bias verdict for the trait.
save_cohort_info('GSE58780', './preprocessed/Retinoblastoma/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Retinoblastoma/GSE58780.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Retinoblastoma', the least common label is '0.0' with 3 occurrences. This represents 4.55% of the dataset.
The distribution of the feature 'Retinoblastoma' in this dataset is severely biased.

