In [1]:

import sys

# Make the project's local packages importable from this notebook.
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable project root.
A4S_ROOT = '/home/techt/Desktop/a4s'
# Guard the append so re-running this cell does not pile up duplicate entries.
if A4S_ROOT not in sys.path:
    sys.path.append(A4S_ROOT)


### Step 1: Initial Data Loading

In [2]:
# NOTE(review): star import; the helper functions called in this notebook are
# presumably provided by utils.preprocess; confirm and prefer explicit names.
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
#    (cohort_dir is an absolute, machine-specific location of the GSE19982 cohort).
cohort_dir = '/media/techt/DATA/GEO/Kidney_Chromophobe/GSE19982'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data.
#    The two prefix lists name the metadata lines of interest: series-level
#    title/summary/design, and per-sample accession/characteristics.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
#    (presumably maps each characteristics row to its distinct values; verify against the helper).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
#    so the next step can be written against what the dataset actually contains.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Gene expression discriminates chromophobe renal cell carcinoma and oncocytoma"
!Series_summary	"[original title] Genomic expression and single-nucleotide polymorphism profiling discriminates chromophobe renal cell carcinoma and oncocytoma."
!Series_summary	""
!Series_summary	"Background : Chromophobe renal cell carcinoma (chRCC) and renal oncocytoma are two distinct but closely related entities with strong morphologic and genetic similarities.  While chRCC is a malignant tumor, oncocytoma is usually regarded as a benign entity.  The overlapping characteristics are best explained by a common cellular origin, and the biologic differences between chRCC and oncocytoma are therefore of considerable interest in terms of carcinogenesis, diagnosis and clinical management. Previous studies have been relatively limited in terms of examining the differences between oncocytoma and chromophobe RCC."
!Series_summary	"Methods : Gene expression profiling using th

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until the checks below say so.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Checking for gene expression data availability.
# The phrase must be looked up in the actual series title text: the literal
# '!Series_title' is only the line prefix and never contains it. The background
# information prints as one metadata line per row, each starting with its prefix.
# NOTE(review): assumes str(background_info) yields those lines; confirm the
# return type of get_background_and_clinical_data.
series_title_lines = [
    line for line in str(background_info).splitlines()
    if line.lstrip().startswith('!Series_title')
]
if any("Gene expression" in line for line in series_title_lines):
    is_gene_available = True

# Data availability
# Hard-coded values for characteristics row 0 (presumably transcribed from the
# sample characteristics dictionary printed in the previous step; verify).
sample_characteristics = {0: ['disease state: Chromophobe renal cell carcinoma', 'disease state: Renal oncocytoma']}

# Row 0 can serve as the trait only if it holds more than one distinct value;
# a missing row yields an empty list and leaves trait_row as None.
if len(set(sample_characteristics.get(0, []))) > 1:
    trait_row = 0
# Data type conversion
def convert_trait(value):
    """Convert a 'disease state: <label>' characteristic into a binary trait.

    Args:
        value: A sample-characteristics cell, e.g.
            'disease state: Chromophobe renal cell carcinoma'.

    Returns:
        1 for chromophobe renal cell carcinoma, 0 for renal oncocytoma, and
        None for any other label or for a non-string cell (e.g. None or NaN).
    """
    # Missing cells arrive as None/NaN; they carry no label to parse.
    if not isinstance(value, str):
        return None
    mapping = {
        "Chromophobe renal cell carcinoma": 1,
        "Renal oncocytoma": 0
    }
    # Keep the text after the last colon and trim it, so stray whitespace or a
    # missing space after the colon does not turn a known label into None.
    content = value.rsplit(":", 1)[-1].strip()
    return mapping.get(content, None)

# Saving cohort information: record for GSE19982 whether gene data and the
# trait were found (trait availability is "a trait row was identified").
save_cohort_info('GSE19982', './preprocessed/Kidney_Chromophobe/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction (if applicable)
if trait_row is not None:
    import os  # local import: only needed to prepare the output directory

    # Pass the converter variables rather than literal None so the call stays
    # correct if age/gender converters are defined later (both are None here).
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Kidney_Chromophobe', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Kidney_Chromophobe/trait_data/GSE19982.csv'
    # to_csv does not create missing parent directories; make sure the target
    # folder exists so a fresh checkout does not fail with OSError.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM499330': [1], 'GSM499331': [1], 'GSM499332': [1], 'GSM499333': [1], 'GSM499334': [1], 'GSM499335': [1], 'GSM499336': [1], 'GSM499337': [1], 'GSM499338': [1], 'GSM499339': [1], 'GSM499340': [1], 'GSM499341': [1], 'GSM499342': [1], 'GSM499343': [1], 'GSM499344': [1], 'GSM499345': [0], 'GSM499346': [0], 'GSM499347': [0], 'GSM499348': [0], 'GSM499349': [0], 'GSM499350': [0], 'GSM499351': [0], 'GSM499352': [0], 'GSM499353': [0], 'GSM499354': [0], 'GSM499355': [0], 'GSM499356': [0], 'GSM499357': [0], 'GSM499358': [0], 'GSM499359': [0]}
