In [1]:

import sys

# Put the local project checkout on the import path (presumably where the
# `utils` package imported by later cells lives -- confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/von_Willebrand_Disease/GSE27597'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the matrix-file reader: the first list selects the
# series-level background fields, the second the per-sample clinical fields.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample-characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print a header followed by its payload, one per line, for both artefacts.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"A gene expression signature of emphysema-related lung destruction and its reversal by the tripeptide GHK."
!Series_summary	"BACKGROUND: Chronic obstructive pulmonary disease (COPD) is a heterogeneous disease consisting of emphysema, small airway obstruction, and/or chronic bronchitis that results in significant loss of lung function over time. METHODS: In order to gain insights into the molecular pathways underlying progression of emphysema and explore computational strategies for identifying COPD therapeutics, we profiled gene expression in lung tissue samples obtained from regions within the same lung with varying amounts of emphysematous destruction from smokers with COPD (8 regions x 8 lungs = 64 samples). Regional emphysema severity was quantified in each tissue sample using the mean linear intercept (Lm) between alveolar walls from micro-CT scans. RESULTS: We identified 127 genes whose expression levels were significantly associated with reg

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# --- Gene expression data availability ---
# Marked available based on the Affymetrix Human Exon 1.0 ST GeneChip mentioned
# for this series.
is_gene_available = True

# --- Converter placeholders ---
# All three start out as None; convert_age and convert_gender are rebound by
# the function definitions further down in this cell, convert_trait stays None.
convert_trait = None
convert_age = None
convert_gender = None

# --- Variable availability (keys of the sample-characteristics dictionary) ---
# trait:  None -- von_Willebrand_Disease not explicitly mentioned in sample characteristics
# age:    key 5
# gender: key 4
trait_row, age_row, gender_row = None, 5, 4

# Data Type Conversion
from typing import Any, Union

def convert_age(value: str) -> Union[float, None]:
    """Parse the age out of a ``"<label>: <value>"`` sample-characteristics cell.

    Only the text after the last ``:`` is considered; surrounding whitespace
    is ignored.

    Args:
        value: Raw cell text, e.g. ``"age: 65"``. Non-string cells (``None``,
            a float NaN from a missing entry, ...) are tolerated.

    Returns:
        The age as a float, or ``None`` when the cell is not a string, the
        text is not numeric, or it parses to NaN / infinity.
    """
    import math  # local import keeps the function self-contained in the notebook

    # Missing cells are not strings; previously these raised AttributeError.
    if not isinstance(value, str):
        return None
    try:
        age = float(value.split(':')[-1].strip())
    except ValueError:
        return None
    # float() accepts "nan" and "inf"; neither is a usable age.
    if not math.isfinite(age):
        return None
    return age

def convert_gender(value: str) -> Union[int, None]:
    """Encode the gender found in a ``"<label>: <value>"`` cell as a binary flag.

    Only the text after the last ``:`` is considered; matching ignores case
    and surrounding whitespace.

    Args:
        value: Raw cell text, e.g. ``"gender: Male"``. Non-string cells
            (``None``, a float NaN from a missing entry, ...) are tolerated.

    Returns:
        ``1`` for ``male``, ``0`` for ``female``, ``None`` for anything else.
    """
    # Missing cells are not strings; previously these raised AttributeError.
    if not isinstance(value, str):
        return None
    gender = value.split(':')[-1].strip().lower()
    # dict.get yields None for any label other than the two recognised ones.
    return {'male': 1, 'female': 0}.get(gender)

# Save Metadata
# The last argument is True only when a trait row was identified for this cohort.
save_cohort_info('GSE27597', './preprocessed/von_Willebrand_Disease/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
# Runs only when the trait is present in the sample characteristics.
if trait_row is not None:
    import os  # needed only on this branch, to prepare the output directory

    selected_clinical_data = geo_select_clinical_features(clinical_data, 'von_Willebrand_Disease', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/von_Willebrand_Disease/trait_data/GSE27597.csv'
    # to_csv does not create missing parent directories (assuming a pandas
    # DataFrame here), so make sure trait_data/ exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/von_Willebrand_Disease/cohort_info.json
