In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft-file and matrix-file paths for this cohort directory
cohort_dir = '/media/techt/DATA/GEO/Depression/GSE208668'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull background information and sample characteristics out of the matrix file,
#    selecting lines by the prefixes listed below
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Sleep Disturbance and Activation of Cellular and Transcriptional Mechanisms of Inflammation in Older Adults"
!Series_summary	"Genome-wide transcriptional profiling results were used to systematically assess the extent to which transcriptomes of older adults with insomnia show expression of genes that are different from those without insomnia"
!Series_overall_design	"Total RNA obtained from peripheral blood mononuclear cells (PBMCs) of older adults with insomnia disorder who participated in the Behavioral Treatment of Insomnia in Aging study (n = 17) and older adults without insomnia disorder who participated in the Sleep Health and Aging Research (SHARE) study (n = 25) at UCLA."
!Series_overall_design	""
!Series_overall_design	"**Please note that raw data was lost and thus is not included in the records**"
Sample Characteristics Dictionary:
{0: ['insomnia: yes', 'insomnia: no'], 1: ['age: 65', 'age: 75', 'age: 77', 'age: 64', 'age: 60', 'age: 67',

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check if gene expression data is available.
# The phrase must be looked up in the background text loaded in the previous cell;
# testing it against the literal tag name '!Series_summary' can never succeed.
# str() keeps the substring test valid even if background_info is not a plain
# string (presumably it is, given how it prints above -- TODO confirm).
if 'Genome-wide transcriptional profiling' in str(background_info):
    is_gene_available = True

# Check for the availability of variables
def find_variable_key(characteristics_dict, search_terms):
    """Return the first key whose values mention any of the search terms.

    Args:
        characteristics_dict: Mapping of row key -> iterable of characteristic
            entries (e.g. ``{1: ['age: 65', 'age: 75']}``).
        search_terms: Iterable of substrings to look for.

    Returns:
        The first key (in dictionary order) that has at least one string value
        containing at least one search term, or ``None`` if nothing matches.

    Note:
        Matching is a plain, case-sensitive substring test on the whole entry,
        so a term such as ``'age'`` also matches an entry like ``'stage: II'``.
    """
    for key, values in characteristics_dict.items():
        for value in values:
            # Non-string entries (e.g. NaN placeholders) cannot contain a term;
            # skip them instead of letting `term in value` raise TypeError.
            if not isinstance(value, str):
                continue
            if any(term in value for term in search_terms):
                return key
    return None

# Look up the row key for each variable; a result of None means no entry matched.
trait_row = find_variable_key(
    characteristics_dict=sample_characteristics_dict,
    search_terms=['depression', 'history of depression'],
)
age_row = find_variable_key(
    characteristics_dict=sample_characteristics_dict,
    search_terms=['age'],
)
gender_row = find_variable_key(
    characteristics_dict=sample_characteristics_dict,
    search_terms=['gender'],
)

# Data type conversion functions
def extract_value(cell):
    """Return the text after the first colon of a ``'name: value'`` entry.

    Args:
        cell: A characteristics entry such as ``'age: 65'``.

    Returns:
        The stripped text following the first ``':'``, or ``None`` when the
        cell is not a string or contains no colon.
    """
    if not isinstance(cell, str) or ":" not in cell:
        return None
    # Split only once so values that themselves contain a colon
    # (e.g. 'time: 10:30') are kept whole instead of being truncated.
    return cell.split(":", 1)[1].strip()

def convert_trait(value):
    """Map a 'name: yes/no' entry to 1 / 0 (case-insensitive).

    Returns None when no value can be extracted or when the extracted
    value is neither 'yes' nor 'no'.
    """
    answer = extract_value(value)
    if answer is None:
        return None
    # dict.get yields None for anything other than the two known answers
    return {'yes': 1, 'no': 0}.get(answer.lower())

def convert_age(value):
    """Convert a 'name: <number>' entry to a float.

    Returns:
        The extracted value as a float, or None when no value can be
        extracted or the extracted text is not a valid number
        (e.g. 'age: NA').
    """
    val = extract_value(value)
    if val is None:
        return None
    try:
        return float(val)
    except ValueError:
        # A non-numeric age should count as missing rather than abort
        # the whole clinical feature extraction.
        return None

def convert_gender(value):
    """Map a 'name: female/male' entry to 0 / 1 (case-insensitive).

    Returns None when no value can be extracted or when the extracted
    value is neither 'female' nor 'male'.
    """
    label = extract_value(value)
    if label is None:
        return None
    # dict.get yields None for any label outside the two known ones
    return {'female': 0, 'male': 1}.get(label.lower())

# Save Metadata: record gene-data availability and whether a trait row was found
save_cohort_info(
    'GSE208668',
    './preprocessed/Depression/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical Feature Extraction: only possible when a trait row was identified
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Depression',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Depression/trait_data/GSE208668.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6360934': [1.0, 65.0, 0.0], 'GSM6360935': [0.0, 75.0, 1.0], 'GSM6360936': [1.0, 77.0, 0.0], 'GSM6360937': [0.0, 64.0, 0.0], 'GSM6360938': [1.0, 60.0, 1.0], 'GSM6360939': [1.0, 67.0, 0.0], 'GSM6360940': [1.0, 72.0, 1.0], 'GSM6360941': [0.0, 62.0, 1.0], 'GSM6360942': [0.0, 73.0, 0.0], 'GSM6360943': [0.0, 74.0, 1.0], 'GSM6360944': [0.0, 73.0, 1.0], 'GSM6360945': [0.0, 68.0, 0.0], 'GSM6360946': [0.0, 62.0, 0.0], 'GSM6360947': [1.0, 73.0, 0.0], 'GSM6360948': [0.0, 70.0, 0.0], 'GSM6360949': [0.0, 60.0, 0.0], 'GSM6360950': [1.0, 61.0, 0.0], 'GSM6360951': [0.0, 66.0, 0.0], 'GSM6360952': [0.0, 69.0, 0.0], 'GSM6360953': [0.0, 62.0, 1.0], 'GSM6360954': [1.0, 67.0, 0.0], 'GSM6360955': [1.0, 62.0, 0.0], 'GSM6360956': [0.0, 71.0, 1.0], 'GSM6360957': [0.0, 63.0, 1.0], 'GSM6360958': [1.0, 62.0, 1.0], 'GSM6360959': [0.0, 61.0, 0.0], 'GSM6360960': [1.0, 67.0, 0.0], 'GSM6360961': [0.0, 78.0, 0.0], 'GSM6360962': [1.0, 79.0, 1.0], 'GSM6360963': [0.0, 72.0, 0.0], 'GSM6360964': [0.0, 73.0, 0.0], 'GSM636