In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Insomnia/GSE208668'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Sleep Disturbance and Activation of Cellular and Transcriptional Mechanisms of Inflammation in Older Adults"
!Series_summary	"Genome-wide transcriptional profiling results were used to systematically assess the extent to which transcriptomes of older adults with insomnia show expression of genes that are different from those without insomnia"
!Series_overall_design	"Total RNA obtained from peripheral blood mononuclear cells (PBMCs) of older adults with insomnia disorder who participated in the Behavioral Treatment of Insomnia in Aging study (n = 17) and older adults without insomnia disorder who participated in the Sleep Health and Aging Research (SHARE) study (n = 25) at UCLA."
!Series_overall_design	""
!Series_overall_design	"**Please note that raw data was lost and thus is not included in the records**"
Sample Characteristics Dictionary:
{0: ['insomnia: yes', 'insomnia: no'], 1: ['age: 65', 'age: 75', 'age: 77', 'age: 64', 'age: 60', 'age: 67',

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
is_gene_available = True

# Check for variable availability
trait_row = 0 if 'insomnia: yes' in [item.split(': ')[0] for item in sample_characteristics_dict[0]] and 'insomnia: no' in [item.split(': ')[0] for item in sample_characteristics_dict[0]] else None
age_row = 1 if all('age: ' in item for item in sample_characteristics_dict[1]) else None
gender_row = 2 if 'gender: female' in sample_characteristics_dict[2] and 'gender: male' in sample_characteristics_dict[2] else None

# Define data type conversion functions
import pandas as pd
import numpy as np

def convert_trait(value):
    value = value.split(': ')[1]
    if value == 'yes':
        return 1
    elif value == 'no':
        return 0
    return None

def convert_age(value):
    try:
        return float(value.split(': ')[1])
    except ValueError:
        return None

def convert_gender(value):
    value = value.split(': ')[1]
    if value.lower() == 'female':
        return 0
    elif value.lower() == 'male':
        return 1
    return None

# Save metadata
save_cohort_info('GSE208668', './preprocessed/Insomnia/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Insomnia', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Insomnia/trait_data/GSE208668.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Insomnia/cohort_info.json
