In [1]:

import sys
# Extend the module search path with a local checkout so that later cells can
# import project packages from it (presumably the root that provides
# `utils.preprocess` -- confirm). NOTE(review): this is a machine-specific
# absolute path; the notebook will not import cleanly on another host.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file for this cohort.
cohort_dir = '/media/techt/DATA/GEO/Retinoblastoma/GSE208143'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pull series-level background text and per-sample
# clinical rows out of the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as {row index: unique values seen in that row}.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces of metadata so the next step can be decided by inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"mRNA expression profile from retinoblastoma tumors and pediatric controls"
!Series_summary	"To discover differentially expressed mRNA's in Rb tumors compared to pediatric retina"
!Series_overall_design	"Nine enucleated human retinoblastoma tumors and two pediatric retina controls used for the study. Total RNA was isolated from 9 Rb tumors and 2 control pediatric retina samples using Agilent Absolutely RNA miRNA kit. Twenty-five nanograms of RNA from Rb tumors and control pediatric retina samples were labeled with Cy3 dye using an Agilent Low Input Quick Amp Labeling Kit"
Sample Characteristics Dictionary:
{0: ['tissue: Tumor', 'tissue: Pediatric Retina'], 1: ['gender: Male', 'gender: Female']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until the metadata shows otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability.
# The previous check tested `"expression" in "!Series_title"`, i.e. a substring
# test between two string literals, which is always False. Inspect the actual
# title/summary lines of the background text loaded in Step 1 instead.
# NOTE(review): assumes `background_info` is newline-separated text whose lines
# start with the '!Series_*' prefixes, as the Step 1 printout suggests.
_series_text = "\n".join(
    line
    for line in str(background_info).splitlines()
    if line.startswith(('!Series_title', '!Series_summary'))
)
if "expression" in _series_text.lower():
    is_gene_available = True

# Identify keys for trait, age, and gender.
# Use the dictionary computed in Step 1 rather than a pasted copy of its
# printed output, so this cell cannot drift out of sync with the data.
_tissue_values = sample_characteristics_dict.get(0, [])
_gender_values = sample_characteristics_dict.get(1, [])

# Retinoblastoma trait presence: row 0 carries the tissue type
# (tumor vs. pediatric retina control).
if 'tissue: Tumor' in _tissue_values or 'tissue: Pediatric Retina' in _tissue_values:
    trait_row = 0

# Age is not reported in the sample characteristics, so age_row stays None.

# Gender presence: row 1 carries the gender annotation.
if 'gender: Male' in _gender_values or 'gender: Female' in _gender_values:
    gender_row = 1

# Define conversion functions
def extract_value(cell):
    """Return the value part of a ``'key: value'`` sample-characteristics cell.

    Args:
        cell: Raw cell content, normally a string such as ``'tissue: Tumor'``.

    Returns:
        The text after the first ``': '`` separator with surrounding whitespace
        removed, or ``None`` when ``cell`` is not a string or has no separator.
    """
    # Missing cells can arrive as NaN/None; `': ' in cell` would raise on those.
    if not isinstance(cell, str) or ': ' not in cell:
        return None
    # Split only once so values that themselves contain ': ' are kept whole.
    return cell.split(': ', 1)[1].strip()

def convert_trait(value):
    """Convert a tissue cell into the binary Retinoblastoma label.

    Returns 1 for 'Tumor' (case), 0 for 'Pediatric Retina' (control) and
    None for any other or missing value.
    """
    tissue_to_label = {
        'Tumor': 1,             # tumor tissue counts as Retinoblastoma present
        'Pediatric Retina': 0,  # pediatric retina samples are the controls
    }
    return tissue_to_label.get(extract_value(value))

def convert_age(value):
    """Convert an age cell into a float, or None when it cannot be parsed.

    The value part of the cell is expected to be a plain number.
    """
    raw_age = extract_value(value)
    try:
        age = float(raw_age)
    except (TypeError, ValueError):
        # TypeError: no value was extracted (None); ValueError: not numeric text.
        return None
    else:
        return age

def convert_gender(value):
    """Convert a gender cell into a binary code.

    Returns 1 for 'Male', 0 for 'Female' and None for anything else.
    """
    gender_codes = {'Male': 1, 'Female': 0}
    return gender_codes.get(extract_value(value))

# Record the cohort-level availability flags (gene data, trait data) for this series.
save_cohort_info(
    'GSE208143',
    './preprocessed/Retinoblastoma/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted and written out only when a trait row was found.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Retinoblastoma',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Retinoblastoma/trait_data/GSE208143.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6338046': [1, 1], 'GSM6338047': [1, 1], 'GSM6338048': [1, 1], 'GSM6338049': [1, 0], 'GSM6338050': [1, 0], 'GSM6338051': [1, 0], 'GSM6338052': [1, 1], 'GSM6338053': [1, 1], 'GSM6338054': [1, 1], 'GSM6338055': [1, 0], 'GSM6338056': [1, 0], 'GSM6338057': [1, 0], 'GSM6338058': [1, 1], 'GSM6338059': [1, 1], 'GSM6338060': [1, 1], 'GSM6338061': [1, 0], 'GSM6338062': [1, 0], 'GSM6338063': [1, 0], 'GSM6338064': [1, 0], 'GSM6338065': [1, 0], 'GSM6338066': [1, 0], 'GSM6338067': [1, 1], 'GSM6338068': [1, 1], 'GSM6338069': [1, 1], 'GSM6338070': [1, 1], 'GSM6338071': [1, 1], 'GSM6338072': [1, 1], 'GSM6338073': [0, 0], 'GSM6338074': [0, 0], 'GSM6338075': [0, 0], 'GSM6338076': [0, 0], 'GSM6338077': [0, 0], 'GSM6338078': [0, 0]}
