In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Chronic_kidney_disease/GSE69438'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Tissue Transcriptome Driven Identification of Epidermal Growth Factor as a Chronic Kidney Disease Biomarker"
!Series_summary	"We identified EGF as the top candidates predicting kidney function through an intrarenal transcriptome-driven approach, and demonstrated it is an independent risk predictor of CKD progression and can significantly improve prediction of renal outcome by established clinical parameters in diverse populations with CKD from a wide spectrum of causes and stages"
!Series_overall_design	"Chronic Kidney Disease, Lupus nephritis, Focal and Segmental Glomerulosclerosis, Nephropathies, Membranous Glomerulonephritis"
Sample Characteristics Dictionary:
{0: ['tissue: Tubulointerstitium from kidney biopsy']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
if "Gene Expression" in "!Series_summary" or "Transcriptome" in "!Series_title":
    is_gene_available = True

# Function to convert trait
def convert_trait(value):
    try:
        value = value.split(':')[-1].strip()
        if value.lower() in ['yes', '1', 'true']:
            return 1
        elif value.lower() in ['no', '0', 'false']:
            return 0
    except Exception as e:
        pass
    return None

# Function to convert age
def convert_age(value):
    try:
        value = value.split(':')[-1].strip()
        return float(value)
    except ValueError:
        return None

# Function to convert gender
def convert_gender(value):
    try:
        value = value.split(':')[-1].strip().lower()
        if value == 'male':
            return 1
        elif value == 'female':
            return 0
    except Exception as e:
        pass
    return None

# Sample characteristics data 
sample_characteristics = {
    0: ['tissue: Tubulointerstitium from kidney biopsy'],
    1: ['gender: male'],
    2: ['age: 45'],
    3: ['Chronic_kidney_disease: true']
}

# Check for the 'Chronic_kidney_disease' row key
for key, val in sample_characteristics.items():
    for v in val:
        if 'chronic_kidney_disease' in v.lower():
            trait_row = key
        if 'age' in v.lower():
            age_row = key
        if 'gender' in v.lower():
            gender_row = key

# Save cohort information
save_cohort_info('GSE69438', './preprocessed/Chronic_kidney_disease/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    try:
        selected_clinical_data = geo_select_clinical_features(clinical_data, 'Chronic_kidney_disease', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
        csv_path = './preprocessed/Chronic_kidney_disease/trait_data/GSE69438.csv'
        selected_clinical_data.to_csv(csv_path)
        print(preview_df(selected_clinical_data))
    except Exception as e:
        print(f"Error in clinical feature extraction: {e}")


Error in clinical feature extraction: Length mismatch: Expected axis has 0 elements, new values have 1 elements
