In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Sickle_Cell_Anemia/GSE84632'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes to pull from the matrix file: the '!Series_*' lines give the
# background text, the '!Sample_*' lines give the per-sample characteristics.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces of information; the following step chooses the trait, age
# and gender rows by reading this output.
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"Gene expression of peripheral blood mononuclear cells from adults with sickle cell disease (UIC cohort)"
!Series_summary	"Sickle cell disease is associated with systemic complications, many associated with either severity of disease or increased risk of mortality. We sought to identify a circulating gene expression profile whose predictive capacity spanned the spectrum of these poor outcomes in sickle cell disease."
!Series_summary	"The Training cohort consisted of patients with SCD who were prospectively recruited from the University of Illinois. The Testing cohort consisted of a combination of patients prospectively seen at two separate institutions including the University of Chicago and Howard University"
!Series_overall_design	"The gene expression of PBMC from 172 sickle cell disease patients at UIC were analyzed"
Sample Characteristics Dictionary:
{0: ['tissue: peripheral blood'], 1: ['cell type: mononuclear cells'], 2: ['disease: Sickle cel

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Cohort flags and the sample-characteristics rows that hold each clinical
# variable. Each name is assigned exactly once, with its final value; the
# converter functions are defined directly below in this cell.

# From the Series title and overall design, this is a gene expression dataset.
is_gene_available = True

# The following details are inferred from the Sample Characteristics Dictionary.
# Row 2 holds the 'disease: Sickle cell disease' field.
# NOTE(review): the overall design describes 172 sickle cell disease patients,
# so this field may carry the same value for every sample — confirm that it
# can serve as a trait before relying on downstream results.
trait_row = 2

# No explicit mention of age or gender in the Sample Characteristics Dictionary.
age_row = None
gender_row = None

def convert_trait(value):
    """Encode a 'disease: ...' sample characteristic as a binary trait label.

    Args:
        value: Raw cell content, e.g. 'disease: Sickle cell disease'. Only the
            text after the last colon is inspected, case-insensitively.

    Returns:
        1 for 'sickle cell disease', 0 for 'control' or 'healthy', and None
        for any other text or for a cell that is not a string.
    """
    # A missing cell may arrive as NaN/None instead of text; report it as
    # unknown rather than failing on the string methods below.
    if not isinstance(value, str):
        return None

    # rpartition keeps the whole string when there is no colon at all.
    label = value.rpartition(":")[2].strip().lower()
    if label == "sickle cell disease":
        return 1
    if label in ("control", "healthy"):
        return 0
    return None

def convert_age(value):
    """Parse an 'age: <number>' sample characteristic into an integer.

    Args:
        value: Raw cell content, e.g. 'age: 45'. Only the text after the last
            colon is parsed.

    Returns:
        The age as an int, or None when the cell is not a string or the text
        after the colon is not an integer literal.
    """
    # A missing cell may arrive as NaN/None instead of text; there is nothing
    # to split in that case.
    if not isinstance(value, str):
        return None

    try:
        return int(value.rpartition(":")[2].strip())
    except ValueError:
        # Non-numeric text such as 'unknown', or a decimal such as '45.5'.
        return None

def convert_gender(value):
    """Encode a 'gender: ...' sample characteristic as a binary label.

    Args:
        value: Raw cell content, e.g. 'gender: Female'. Only the text after
            the last colon is inspected, case-insensitively.

    Returns:
        1 for 'male', 0 for 'female', and None for any other text or for a
        cell that is not a string.
    """
    # A missing cell may arrive as NaN/None instead of text; report it as
    # unknown rather than failing on the string methods below.
    if not isinstance(value, str):
        return None

    label = value.rpartition(":")[2].strip().lower()
    # dict.get yields None for every label outside the table.
    return {"male": 1, "female": 0}.get(label)

# Record the cohort together with its gene-data and trait availability flags.
is_trait_available = trait_row is not None
save_cohort_info(
    'GSE84632',
    './preprocessed/Sickle_Cell_Anemia/cohort_info.json',
    is_gene_available,
    is_trait_available,
)

# Clinical features are extracted only when a trait row was identified.
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Sickle_Cell_Anemia',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Save the selected features, then print a preview of them.
    csv_path = './preprocessed/Sickle_Cell_Anemia/trait_data/GSE84632.csv'
    selected_clinical_data.to_csv(csv_path)
    clinical_preview = preview_df(selected_clinical_data)
    print(clinical_preview)


{'GSM2243130': [1], 'GSM2243131': [1], 'GSM2243132': [1], 'GSM2243133': [1], 'GSM2243134': [1], 'GSM2243135': [1], 'GSM2243136': [1], 'GSM2243137': [1], 'GSM2243138': [1], 'GSM2243139': [1], 'GSM2243140': [1], 'GSM2243141': [1], 'GSM2243142': [1], 'GSM2243143': [1], 'GSM2243144': [1], 'GSM2243145': [1], 'GSM2243146': [1], 'GSM2243147': [1], 'GSM2243148': [1], 'GSM2243149': [1], 'GSM2243150': [1], 'GSM2243151': [1], 'GSM2243152': [1], 'GSM2243153': [1], 'GSM2243154': [1], 'GSM2243155': [1], 'GSM2243156': [1], 'GSM2243157': [1], 'GSM2243158': [1], 'GSM2243159': [1], 'GSM2243160': [1], 'GSM2243161': [1], 'GSM2243162': [1], 'GSM2243163': [1], 'GSM2243164': [1], 'GSM2243165': [1], 'GSM2243166': [1], 'GSM2243167': [1], 'GSM2243168': [1], 'GSM2243169': [1], 'GSM2243170': [1], 'GSM2243171': [1], 'GSM2243172': [1], 'GSM2243173': [1], 'GSM2243174': [1], 'GSM2243175': [1], 'GSM2243176': [1], 'GSM2243177': [1], 'GSM2243178': [1], 'GSM2243179': [1], 'GSM2243180': [1], 'GSM2243181': [1], 'GSM2243182

### Step 3: Gene Data Extraction

In [4]:
# Obtain gene_data from the matrix file located in Step 1, using the library's
# get_genetic_data function.
gene_data = get_genetic_data(matrix_file)

# Print the first 20 row ids; the following step reviews them to decide
# whether the identifiers need to be mapped to gene symbols.
leading_row_ids = gene_data.index[:20]
print(leading_row_ids)


Index(['16650001', '16650003', '16650005', '16650007', '16650009', '16650011',
       '16650013', '16650015', '16650017', '16650019', '16650021', '16650023',
       '16650025', '16650027', '16650029', '16650031', '16650033', '16650035',
       '16650037', '16650041'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in the previous step are numeric strings such as
# '16650001' rather than gene symbols, so they have to be mapped to genes.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Get the gene annotation data from the SOFT file with the library's
# get_gene_annotation function.
gene_annotation = get_gene_annotation(soft_file)

# Preview the annotation so that the identifier column and the gene column
# can be picked out in the mapping step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['16657436', '16657440', '16657445', '16657447', '16657450'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': [12190.0, 29554.0, 69091.0, 160446.0, 317811.0], 'RANGE_END': [13639.0, 31109.0, 70008.0, 161525.0, 328581.0], 'total_probes': [25.0, 28.0, 8.0, 13.0, 36.0], 'GB_ACC': ['NR_046018', nan, nan, nan, 'NR_024368'], 'SPOT_ID': ['chr1:12190-13639', 'chr1:29554-31109', 'chr1:69091-70008', 'chr1:160446-161525', 'chr1:317811-328581'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10']}


### Step 6: Gene Identifier Mapping

In [7]:

# 1. Identify the keys for identifiers and gene symbols
# The previewed 'ID' values ('16657436', ...) are numeric strings of the same
# style as the expression matrix index.
identifier_key = 'ID'
# NOTE(review): the annotation preview lists no gene-symbol column, and the
# previewed GB_ACC values ('NR_046018', 'NR_024368', NaN) look like sequence
# accessions rather than gene symbols. This is presumably why the final step
# prints "No gene data in the dataframe" — confirm, and if so translate the
# accessions to symbols with an external resource before mapping.
gene_symbol_key = 'GB_ACC'

# 2. Get the dataframe storing the mapping between probe IDs and genes
probe_gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to get the gene expression data
# gene_data is rebound here, so re-running this cell alone would pass the
# already-mapped frame back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, probe_gene_mapping)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index'
#    function from the library, and save the result.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Sickle_Cell_Anemia/gene_data/GSE84632.csv'
normalized_gene_data.to_csv(gene_csv_path)

# Mapping and normalization can leave no rows at all. Derive the availability
# flag from the data instead of assuming True, so that cohort_info.json does
# not advertise gene data that does not exist.
gene_data_available = not normalized_gene_data.empty

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data'
#    function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data
#    are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Sickle_Cell_Anemia')

# 4. Save the cohort information. The positional flags are, in order, gene
#    availability and trait availability (same order as the earlier call in
#    Step 2); the trait is available here because selected_clinical_data
#    only exists when a trait row was found.
save_cohort_info(
    'GSE84632',
    './preprocessed/Sickle_Cell_Anemia/cohort_info.json',
    gene_data_available,
    True,
    trait_biased,
    unbiased_merged_data if not trait_biased else merged_data
)

# 5. If the trait is not severely biased, save the merged data to a CSV file.
if not trait_biased:
    csv_path = './preprocessed/Sickle_Cell_Anemia/GSE84632.csv'
    unbiased_merged_data.to_csv(csv_path)


No gene data in the dataframe
A new JSON file was created at: ./preprocessed/Sickle_Cell_Anemia/cohort_info.json
