In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Kidney_Chromophobe/GSE57162'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"A genomic algorithm for the molecular classification of renal cortical neoplasms: Development and Validation"
!Series_summary	"Accurate diagnostic discrimination of benign renal oncocytoma (OC) and malignant renal cell carcinomas (RCC) is not only useful for planning appropriate treatment strategies of patients with renal masses but also for estimating prognosis. Classification of renal neoplasms solely by histopathology can often be challenging for a variety of reasons. The aim of this study was to develop and validate a genomic algorithm for molecular classification of renal cortical neoplasms that could be implemented in a routine clinical diagnostic setting. Using TCGA (The Cancer Genome Atlas) copy number profiles of over 600 RCC specimens, prior FISH studies and published literature, a classification algorithm was developed consisting of 15 genomic markers: loss of VHL, 3p21, 8p, and chromosomes 1, 2, 6, 10 and 17, and gain of 5qter, 16p, 17

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_age = None  # define the functions when applicable

# Step 1: Gene Expression Data Availability
if 'gene expression' in ["Accurate diagnostic discrimination", "Validation", "criteria for scoring specimens"]:  # Example checks, please tailor it better
    is_gene_available = True

# Step 2.1: Variable Availability
sample_characteristics = {
    0: ['sample type: Formalin Fixed Paraffin Embedded (FFPE)'],
    1: ['tissue: renal cell carcinoma specimen'],
    2: ['histology: OC', 'histology: pRCC', 'histology: chrRCC', 'histology: ccRCC'],
    3: ['gender: F', 'gender: M'],
    4: ['age: 66', 'age: 82', 'age: 59', 'age: 58', 'age: 57', 'age: 62', 'age: 48', 'age: 69', 'age: 76', 'age: 63', 'age: 74', 'age: 77', 'age: 79', 'age: 45', 'age: 67', 'age: 52', 'age: 68', 'age: 43', 'age: 39', 'age: 53', 'age: 49', 'age: 47', 'age: 56', 'age: 61', 'age: 72', 'age: 41', 'age: 55', 'age: 65', 'age: 73', 'age: 50'],
    5: ['furhman nuclear grade: na', 'furhman nuclear grade: 3', 'furhman nuclear grade: 4', 'furhman nuclear grade: 3 and 4', 'furhman nuclear grade: 2', 'furhman nuclear grade: 2;3', 'furhman nuclear grade: 1'],
    6: ['tnm stage: na', 'tnm stage: T1', 'tnm stage: T2', 'tnm stage: T3a', 'tnm stage: T4', 'tnm stage: T3', 'tnm stage: T3a M1', 'tnm stage: T2 M1', 'tnm stage: T3c', 'tnm stage: T3a N1'],
    7: ['tumor size (mm): 29', 'tumor size (mm): 38', 'tumor size (mm): 32', 'tumor size (mm): 110', 'tumor size (mm): 25', 'tumor size (mm): 14', 'tumor size (mm): 52', 'tumor size (mm): 85', 'tumor size (mm): 21', 'tumor size (mm): 115', 'tumor size (mm): 40', 'tumor size (mm): 80', 'tumor size (mm): 35', 'tumor size (mm): 42', 'tumor size (mm): 30', 'tumor size (mm): 79', 'tumor size (mm): 64', 'tumor size (mm): 45', 'tumor size (mm): 31', 'tumor size (mm): 48', 'tumor size (mm): 17', 'tumor size (mm): 140', 'tumor size (mm): 36', 'tumor size (mm): 27', 'tumor size (mm): 58', 'tumor size (mm): 20', 'tumor size (mm): 95', 'tumor size (mm): 22', 'tumor size (mm): 70', 'tumor size (mm): 23']
}

# Identify `trait_row`
if len(set(sample_characteristics[2])) > 1:
    trait_row = 2

# Identify `age_row`
if len(set(sample_characteristics[4])) > 1:
    age_row = 4

# Identify `gender_row`
if len(set(sample_characteristics[3])) > 1:
    gender_row = 3

# Step 2.3: Data Type Conversion
def convert_trait(value):
    try:
        trait_mapping = {'histology: chrRCC': 1, 'histology: OC': 0, 'histology: pRCC': 0, 'histology: ccRCC': 0}
        key = value.split(': ')[1]
        return trait_mapping[key]
    except Exception:
        return None

def convert_age(value):
    try:
        return int(value.split(': ')[1])
    except Exception:
        return None

def convert_gender(value):
    try:
        gender_mapping = {'F': 0, 'M': 1}
        gender = value.split(': ')[1]
        return gender_mapping[gender]
    except Exception:
        return None

# Step 3: Save Metadata
save_cohort_info('GSE57162', './preprocessed/Kidney_Chromophobe/cohort_info.json', is_gene_available, trait_row is not None)

# Step 4: Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Kidney_Chromophobe', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Kidney_Chromophobe/trait_data/GSE57162.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1376393': [None, 66, 0], 'GSM1376394': [None, 82, 0], 'GSM1376395': [None, 59, 0], 'GSM1376396': [None, 58, 0], 'GSM1376397': [None, 59, 0], 'GSM1376398': [None, 57, 0], 'GSM1376399': [None, 62, 0], 'GSM1376400': [None, 48, 0], 'GSM1376401': [None, 69, 0], 'GSM1376402': [None, 76, 0], 'GSM1376403': [None, 63, 0], 'GSM1376404': [None, 74, 0], 'GSM1376405': [None, 57, 0], 'GSM1376406': [None, 77, 0], 'GSM1376407': [None, 79, 0], 'GSM1376408': [None, 45, 0], 'GSM1376409': [None, 67, 0], 'GSM1376410': [None, 57, 0], 'GSM1376411': [None, 59, 0], 'GSM1376412': [None, 69, 0], 'GSM1376413': [None, 52, 0], 'GSM1376414': [None, 68, 0], 'GSM1376415': [None, 43, 0], 'GSM1376416': [None, 39, 0], 'GSM1376417': [None, 53, 0], 'GSM1376418': [None, 49, 0], 'GSM1376419': [None, 47, 0], 'GSM1376420': [None, 56, 0], 'GSM1376421': [None, 61, 0], 'GSM1376422': [None, 72, 0], 'GSM1376423': [None, 66, 0], 'GSM1376424': [None, 41, 0], 'GSM1376425': [None, 67, 0], 'GSM1376426': [None, 58, 0], 'GSM1376427':