In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1a: resolve the SOFT file and the series-matrix file for this cohort.
cohort_dir = '/media/techt/DATA/GEO/Lower_Grade_Glioma/GSE145510'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 1b: read the matrix file. The first prefix list selects the
# series-level background records, the second selects the per-sample
# clinical records.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 1c: build the sample characteristics dictionary from the clinical
# dataframe (printed below as {row index: [values seen in that row]}).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 1d: print both results so the next step can decide which rows hold
# the trait, age and gender fields.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Urinary MicroRNA-based Diagnostic Model for Central Nervous System Tumors Using Nanowire Scaffolds"
!Series_summary	"A urinary miRNA combination could be a powerful classifier for the detection of patients with brain tumors."
!Series_overall_design	"Urinary microRNA profiles of glioblastomas, lower grade gliomas, meningiomas, other brain tumors, and collected non-cancer controls."
Sample Characteristics Dictionary:
{0: ['disease: Glioblastoma', 'disease: Lower grade glioma', 'disease: Meningioma', 'disease: Other brain tumor', 'disease: Non-cancer control'], 1: ['gender: Male', 'gender: Female'], 2: ['age: 29', 'age: 44', 'age: 54', 'age: 31', 'age: 86', 'age: 46', 'age: 81', 'age: 42', 'age: 32', 'age: 73', 'age: 50', 'age: 53', 'age: 70', 'age: 60', 'age: 56', 'age: 43', 'age: 67', 'age: 74', 'age: 58', 'age: 64', 'age: 35', 'age: 71', 'age: 39', 'age: 69', 'age: 26', 'age: 30', 'age: 49', 'age: 27', 'age: 62', 'age: 68']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene expression data availability.
# The series title and summary describe a miRNA dataset, so the cohort is
# flagged as having no gene expression data.
is_gene_available = False

# The converters start out unset here; they are defined further down in this
# cell.
convert_trait = convert_age = convert_gender = None

# 2. Variable availability and data type conversion

# 2.1 Data availability
# Literal copy of the sample characteristics dictionary printed in Step 1:
# row 0 holds the disease label, row 1 the gender and row 2 the age.
sample_characteristics = {
    0: ['disease: Glioblastoma', 'disease: Lower grade glioma', 'disease: Meningioma', 'disease: Other brain tumor', 'disease: Non-cancer control'],
    1: ['gender: Male', 'gender: Female'],
    2: ['age: 29', 'age: 44', 'age: 54', 'age: 31', 'age: 86', 'age: 46', 'age: 81', 'age: 42', 'age: 32', 'age: 73', 'age: 50', 'age: 53', 'age: 70', 'age: 60', 'age: 56', 'age: 43', 'age: 67', 'age: 74', 'age: 58', 'age: 64', 'age: 35', 'age: 71', 'age: 39', 'age: 69', 'age: 26', 'age: 30', 'age: 49', 'age: 27', 'age: 62', 'age: 68'],
}


def _varies(entries):
    """Return True when the "key: value" strings in `entries` carry more than one distinct value."""
    return len({entry.split(': ')[1] for entry in entries}) > 1


# A row is only usable as a variable when it is not constant across samples;
# otherwise its key stays None.
trait_row = 0 if _varies(sample_characteristics[0]) else None
gender_row = 1 if _varies(sample_characteristics[1]) else None
age_row = 2 if _varies(sample_characteristics[2]) else None

# 2.3 Data Type Conversion
def convert_trait(value):
    """Encode a disease entry as a binary Lower Grade Glioma label.

    Args:
        value: A sample-characteristics entry such as
            "disease: Lower grade glioma".

    Returns:
        1 when the text after the first colon is "Lower grade glioma"
        (compared case-insensitively, ignoring surrounding whitespace),
        0 for any other non-empty label, and None when `value` is not a
        string, has no colon separator, or has an empty label.
    """
    # Missing cells (None / NaN) are not strings and cannot be split.
    if not isinstance(value, str):
        return None
    _, separator, label = value.partition(':')
    label = label.strip()
    if not separator or not label:
        return None
    if label.lower() == "lower grade glioma":
        return 1
    # NOTE(review): every other label, including the other tumour types, is
    # coded 0 alongside the non-cancer controls -- confirm that this is the
    # intended comparison group.
    return 0

def convert_age(value):
    """Parse the numeric age out of an "age: <number>" entry.

    Args:
        value: A sample-characteristics entry such as "age: 29".

    Returns:
        The age as a float, or None when `value` is not a string, has no
        colon separator, or the text after the colon is not a number.
    """
    # Missing cells (None / NaN) are not strings and cannot be split.
    if not isinstance(value, str):
        return None
    _, separator, text = value.partition(':')
    if not separator:
        return None
    try:
        # float() ignores surrounding whitespace, so "age: 29" and "age:29"
        # both parse.
        return float(text)
    except ValueError:
        # Only a failed numeric parse is swallowed; anything else propagates.
        return None

def convert_gender(value):
    """Encode a gender entry as a binary code.

    Args:
        value: A sample-characteristics entry such as "gender: Male".

    Returns:
        0 for female, 1 for male (compared case-insensitively, ignoring
        surrounding whitespace), and None when `value` is not a string, has
        no colon separator, or names anything else.
    """
    # Missing cells (None / NaN) are not strings and cannot be split.
    if not isinstance(value, str):
        return None
    _, separator, label = value.partition(':')
    if not separator:
        return None
    # Unrecognised labels fall through to None via dict.get.
    return {"female": 0, "male": 1}.get(label.strip().lower())

# 3. Save Metadata
save_cohort_info('GSE145510', './preprocessed/Lower_Grade_Glioma/cohort_info.json', is_gene_available, trait_row is not None)

# 4. Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Lower_Grade_Glioma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Lower_Grade_Glioma/trait_data/GSE145510.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Lower_Grade_Glioma/cohort_info.json
{'GSM4319753': [0.0, 29.0, 1.0], 'GSM4319754': [0.0, 29.0, 1.0], 'GSM4319755': [0.0, 44.0, 1.0], 'GSM4319756': [0.0, 54.0, 0.0], 'GSM4319757': [0.0, 31.0, 0.0], 'GSM4319758': [0.0, 86.0, 0.0], 'GSM4319759': [0.0, 46.0, 1.0], 'GSM4319760': [0.0, 81.0, 1.0], 'GSM4319761': [0.0, 42.0, 1.0], 'GSM4319762': [0.0, 32.0, 1.0], 'GSM4319763': [0.0, 73.0, 1.0], 'GSM4319764': [0.0, 50.0, 0.0], 'GSM4319765': [0.0, 53.0, 1.0], 'GSM4319766': [0.0, 54.0, 1.0], 'GSM4319767': [0.0, 70.0, 0.0], 'GSM4319768': [0.0, 60.0, 0.0], 'GSM4319769': [0.0, 70.0, 1.0], 'GSM4319770': [0.0, 56.0, 1.0], 'GSM4319771': [0.0, 43.0, 1.0], 'GSM4319772': [0.0, 53.0, 1.0], 'GSM4319773': [0.0, 70.0, 0.0], 'GSM4319774': [0.0, 50.0, 1.0], 'GSM4319775': [0.0, 67.0, 1.0], 'GSM4319776': [0.0, 74.0, 0.0], 'GSM4319777': [0.0, 58.0, 0.0], 'GSM4319778': [0.0, 70.0, 1.0], 'GSM4319779': [0.0, 64.0, 0.0], 'GSM4319780': [0.0, 35.0, 0.0], 'GSM4319781': [1.0, 