In [1]:

import sys

# Make the project checkout importable (needed for `utils.preprocess` below).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- 1. Locate the SOFT file and the series matrix file for this cohort ---
cohort_dir = '/media/techt/DATA/GEO/Underweight/GSE119483'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- 2. Read background information and sample characteristics from the matrix file ---
# Lines are selected by these prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- 3. Build the sample characteristics dictionary from the clinical dataframe ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- 4. Print the background information and the sample characteristics dictionary ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Identification of circulating protein biomarkers for pancreatic cancer cachexia"
!Series_summary	"Over 80% of patients with pancreatic ductal adenocarcinoma (PDAC) suffer from cachexia, characterized by severe muscle and fat loss and yet, there are no biomarkers identified for this debilitating condition. Our objective was to identify circulating protein biomarkers using serum for human PDAC cachexia and understand their biological functions. Serum from 30 patients with PDAC was collected and protein profiles were generated using SOMAscan. The protein profiles were correlated with clinical variables such as Cancer associated weight loss (CAWL), body composition measurements of skeletal muscle index (SMI), skeletal muscle density (SMD), total adipose index (TAI) using Spearman’s correlation. Overall, 110 proteins of 1294 correlated with these clinical measures - 47 proteins for CAWL, 19 for SMI, 14 for SMD, and 30 for TAI (r-value 0.5, p<0.05). LYV

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; each one is rebound to a real function further down
# in this cell when applicable.
convert_trait, convert_age, convert_gender = None, None, None

# Gene expression availability: the series reports serum protein profiles
# (SOMAscan), not gene expression.
is_gene_available = False

# Keys of the sample characteristics dictionary that hold each variable.
trait_row = None   # Underweight is not mentioned, so treat it as unavailable
age_row = 2        # age is mentioned under key 2
gender_row = 1     # gender is mentioned under key 1

# Define data conversion functions
def convert_trait(value):
    """Placeholder trait converter for a cohort without Underweight data.

    Args:
        value: Raw sample-characteristics cell; ignored.

    Returns:
        Always None, because no trait information is available to decode.
    """
    return None

def convert_age(value):
    """Parse a sample-characteristics age entry into a continuous value.

    The text after the last colon is stripped and converted to float,
    e.g. ``'age: 63'`` -> ``63.0``.

    Args:
        value: Raw cell value, expected to be a string of the form
            ``'<label>: <number>'``.

    Returns:
        The age as a float, or None when the cell is not a string (missing
        cells arrive as None or float NaN), when the text after the colon is
        not numeric, or when it parses to a non-finite value.
    """
    import math  # local import keeps this cell self-contained

    if not isinstance(value, str):
        # Non-string cells have no .split(); treat them as missing rather
        # than raising AttributeError.
        return None
    try:
        age = float(value.split(':')[-1].strip())
    except ValueError:
        return None
    # float() happily parses 'nan' and 'inf'; neither is a usable age.
    return age if math.isfinite(age) else None

def convert_gender(value):
    """Parse a sample-characteristics gender entry into a binary code.

    The text after the last colon is stripped and lower-cased before lookup,
    so ``'gender: Male'`` and ``'gender: male'`` are treated the same.

    Args:
        value: Raw cell value, expected to be a string of the form
            ``'<label>: <gender>'``.

    Returns:
        1 for 'male', 0 for 'female', and None for anything else, including
        non-string cells (missing cells arrive as None or float NaN).
    """
    if not isinstance(value, str):
        # Non-string cells have no .split(); treat them as missing rather
        # than raising AttributeError.
        return None
    label = value.split(':')[-1].strip().lower()
    # .get() yields None for any label outside the two recognised values.
    return {'male': 1, 'female': 0}.get(label)

# Record this cohort's availability flags in the shared cohort-info JSON.
save_cohort_info(
    'GSE119483',
    './preprocessed/Underweight/cohort_info.json',
    is_gene_available,
    trait_row is not None,  # trait availability flag
)

# No clinical feature extraction follows: trait_row is None for this cohort.
