In [1]:

import sys
# Extend the module search path so that packages under this directory can be imported.
# NOTE(review): presumably this is where the `utils` package imported in the next cell
# lives — confirm. The absolute path is machine-specific and will not resolve elsewhere.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1 - locate the soft file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Sickle_Cell_Anemia/GSE17078'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 2 - read the matrix file to obtain the background information and the
# sample characteristics data, using the two prefix lists below.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 3 - build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 4 - show both pieces of information, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Cell Adhesion Molecule 1 (CADM1): A Novel Risk Factor for Venous Thrombosis"
!Series_summary	"Protein C (PC) deficiency increases the risk of venous thrombosis (VT) among members of Kindred Vermont II, but fails to fully account for the inheritance pattern. A genome scan of the pedigree supported the presence of a prothrombotic gene on chromosome 11q23 with weaker support on chromosomes 10p12 and 18p11.2-q11."
!Series_summary	"Preliminary data from Affimetrix microarray expression analysis of Blood Outgrowth Endothelial Cells of 3 members of Kindred Vermont II compared to a well established normal control group indicated that IgsF4 was decreased in patients versus controls. In addition, both statistical and pathway analysis results suggested that these genes are associated protein C.  Further studies indicated that Cell Adhesion Molecule 1 (CADM1), a member of the IgsF4 superfamily, may be associated with VT."
!Series_overall_design	"We obtained B

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The converter names start out as None; the functions defined further down in
# this cell rebind them when applicable.
convert_trait, convert_age, convert_gender = None, None, None

# Gene expression data availability: based on the background info, it looks
# like gene expression data is available for this series.
is_gene_available = True

# Unique sample characteristic values, keyed by row index of the clinical data.
sample_characteristics = {
    0: ['cell type: blood outgrowth endothelial cell'],
    1: ['status: normal', 'vt sample: 1', 'vt sample: 2', 'vt sample: 3'],
    2: ['race: African American', 'race: Caucasian', 'status: Protein C Deficiency'],
    3: [
        'age: 50', 'age: 60', 'age: 53', 'age: 40', 'age: 41', 'age: 33', 'age: 23',
        'age: 46', 'age: 31', 'age: 63', 'age: 57', 'age: 29', 'age: 44', 'age: 27',
        'age: 55', 'age: 35', 'age: 56', 'age: 61', 'age: 74',
    ],
    4: ['sex: F', 'sex: M', 'Sex: F'],
}

# Row indices for 'Sickle_Cell_Anemia', 'age', and 'gender':
#   trait  -> None (no row above records the trait, so it is unavailable)
#   age    -> row 3
#   gender -> row 4
trait_row, age_row, gender_row = None, 3, 4

# Data type conversion functions
def extract_value(value):
    """Return the text after the first ':' of a 'key: value' string.

    Args:
        value: A sample-characteristics cell such as 'age: 50'. Anything that
            is not a string (None, NaN, numbers) is accepted and rejected.

    Returns:
        The stripped text following the first colon, or None when `value` is
        not a string or contains no colon.
    """
    if not isinstance(value, str) or ':' not in value:
        return None
    # Split on the first colon only, so a value that itself contains ':'
    # (e.g. 'time: 12:30') is returned whole instead of being truncated.
    return value.split(':', 1)[1].strip()

def convert_trait(value):
    """Placeholder trait converter.

    The trait data is unavailable for this cohort, so every input maps to None.
    """
    return None

def convert_age(value):
    """Convert an 'age: N' characteristics cell to a float.

    Args:
        value: A sample-characteristics cell such as 'age: 50'.

    Returns:
        The age as a float, or None when no value can be extracted or the
        extracted text is not numeric.
    """
    val = extract_value(value)
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError: extract_value returned None; ValueError: non-numeric text.
        # Catching only these keeps KeyboardInterrupt/SystemExit propagating,
        # which the previous bare `except:` swallowed.
        return None

def convert_gender(value):
    """Convert a 'sex: X' characteristics cell to a binary code.

    Returns 0 for 'f'/'female' and 1 for 'm'/'male' (case-insensitive);
    returns None when no value can be extracted or the label is not recognised.
    """
    label = extract_value(value)
    if label is None:
        return None
    gender_codes = {'f': 0, 'female': 0, 'm': 1, 'male': 1}
    return gender_codes.get(label.lower())

# Pass this cohort's gene-data and trait availability flags to save_cohort_info,
# together with the cohort id and the cohort_info.json path.
is_trait_available = trait_row is not None
save_cohort_info('GSE17078', './preprocessed/Sickle_Cell_Anemia/cohort_info.json', is_gene_available, is_trait_available)

# Clinical features are selected, written to CSV and previewed only when a
# trait row was identified.
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Sickle_Cell_Anemia',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Sickle_Cell_Anemia/trait_data/GSE17078.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
