In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Cystic_Fibrosis/GSE107846'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the background information and the sample characteristics out of the
#    matrix file, selected by the line prefixes listed below.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both results; each label is followed by its value on the next line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Secondhand smoke alters arachidonic acid metabolism in infants and children with cystic fibrosis"
!Series_summary	"Children ages 0-10 years old with CF were recruited from 2012-2015 at the outpatient CF clinic, and classified according to age (infants <1 year old, vs. children 1-10 years old). The diagnosis of CF was defined as two disease-causing mutations or a sweat chloride test ≥ 60 mmol/L. Hair and blood samples were collected from each subject. Hair nicotine concentrations were determined and considered as the primary objective measure of SHSe.  Hair nicotine provides a long-term measure of SHSe as nicotine is integrated into the growing hair shaft over multiple months. (15)  For each subject, 30-40 shafts of hair of approximately 2-3 cm in length were cut at the hair root from the occipital skull.  Hair samples were refrigerated at 4° for storage, washed before analyses to remove ambient nicotine (15) and batch-tested at a contract research

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data availability: inferred as available from the dataset context.
is_gene_available = True

# Placeholders for the converters; the actual functions are defined further down in this cell.
convert_trait = convert_age = convert_gender = None

# Keys of the sample characteristics dictionary that carry each variable:
#   - key 5: the `state` field (e.g. "state: Healthy"), used for the 'Cystic_Fibrosis' trait.
#            It is preferred over more generic health-status keywords such as "condition".
#   - key 1: age
#   - key 2: gender
trait_row, age_row, gender_row = 5, 1, 2

def convert_trait(value):
    """Convert a 'state: <status>' characteristic cell into a binary trait label.

    Args:
        value: Raw cell content, e.g. 'state: CF'. Non-string cells
            (such as None or NaN) are treated as missing instead of raising.

    Returns:
        1 when the status is 'cf', 0 when it is 'healthy' (case-insensitive,
        surrounding whitespace ignored), otherwise None.
    """
    if not isinstance(value, str):
        return None  # missing cell: nothing to parse
    _, delimiter, status = value.partition(":")
    if not delimiter:
        return None  # no 'key: value' structure
    # Any unexpected status (including one containing extra colons) maps to None.
    return {'cf': 1, 'healthy': 0}.get(status.strip().lower())

def convert_age(value):
    """Convert an 'age: <number>' characteristic cell into a float.

    Args:
        value: Raw cell content, e.g. 'age: 7.25'. Non-string cells
            (such as None or NaN) are treated as missing instead of raising.

    Returns:
        The number after the colon as a float, or None when the cell is
        missing, has no 'key: value' structure, or the value is not numeric.
    """
    if not isinstance(value, str):
        return None  # missing cell: nothing to parse
    _, delimiter, raw_age = value.partition(":")
    if not delimiter:
        return None  # no 'key: value' structure
    try:
        return float(raw_age.strip())
    except ValueError:
        # Non-numeric text (e.g. 'NA', or a value containing extra colons).
        return None

def convert_gender(value):
    """Convert a 'gender: <F|M>' characteristic cell into a binary label.

    Args:
        value: Raw cell content, e.g. 'gender: F'. Non-string cells
            (such as None or NaN) are treated as missing instead of raising.

    Returns:
        0 for 'F', 1 for 'M' (case-insensitive, surrounding whitespace
        ignored), otherwise None.
    """
    if not isinstance(value, str):
        return None  # missing cell: nothing to parse
    _, delimiter, gender = value.partition(":")
    if not delimiter:
        return None  # no 'key: value' structure
    # Any unexpected code (including one containing extra colons) maps to None.
    return {'F': 0, 'M': 1}.get(gender.strip().upper())

# Save Metadata
# Create the output directory up front so a fresh checkout can run this cell.
# NOTE(review): save_cohort_info may already create it — harmless either way (exist_ok=True).
os.makedirs('./preprocessed/Cystic_Fibrosis', exist_ok=True)
save_cohort_info('GSE107846', './preprocessed/Cystic_Fibrosis/cohort_info.json', is_gene_available, trait_row is not None)

if trait_row is not None:
    # Extract the trait / age / gender rows and convert them with the functions defined above.
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Cystic_Fibrosis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Cystic_Fibrosis/trait_data/GSE107846.csv'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2881538': [0.0, 9.0, 0.0], 'GSM2881539': [0.0, 3.8, 0.0], 'GSM2881540': [0.0, 5.1, 1.0], 'GSM2881541': [0.0, 3.4, 0.0], 'GSM2881542': [0.0, 7.0, 1.0], 'GSM2881543': [0.0, 2.8, 1.0], 'GSM2881544': [0.0, 4.3, 0.0], 'GSM2881545': [0.0, 2.3, 0.0], 'GSM2881546': [0.0, 9.9, 0.0], 'GSM2881547': [0.0, 7.8, 1.0], 'GSM2881548': [0.0, 2.3, 1.0], 'GSM2881549': [0.0, 3.4, 0.0], 'GSM2881550': [1.0, 7.25, 0.0], 'GSM2881551': [1.0, 4.0, 1.0], 'GSM2881552': [1.0, 2.333, 1.0], 'GSM2881553': [1.0, 1.917, 0.0], 'GSM2881554': [1.0, 8.583, 1.0], 'GSM2881555': [1.0, 6.8, 0.0], 'GSM2881556': [1.0, 2.667, 0.0], 'GSM2881557': [1.0, 9.917, 0.0], 'GSM2881558': [1.0, 1.083, 0.0], 'GSM2881559': [1.0, 2.25, 1.0], 'GSM2881560': [1.0, 7.75, 1.0], 'GSM2881561': [1.0, 6.833, 1.0], 'GSM2881562': [1.0, 4.583, 1.0], 'GSM2881563': [1.0, 6.417, 0.0], 'GSM2881564': [1.0, 7.0, 0.0], 'GSM2881565': [1.0, 4.75, 1.0], 'GSM2881566': [1.0, 4.333, 0.0], 'GSM2881567': [1.0, 5.25, 1.0], 'GSM2881568': [1.0, 4.25, 1.0], 'GSM2881569'

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
#    The resulting dataframe is indexed by the platform's row identifiers (index name 'ID' in the output below).
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
#    They are inspected in Step 4 to decide whether the identifiers still need mapping to gene symbols.
print(gene_data.index[:20])


Index(['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651199', 'ILMN_1651209',
       'ILMN_1651210', 'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229',
       'ILMN_1651230', 'ILMN_1651232', 'ILMN_1651235', 'ILMN_1651236',
       'ILMN_1651237', 'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253',
       'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651262'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 are 'ILMN_'-prefixed probe identifiers rather than gene symbols,
# so they have to be mapped to gene symbols (Steps 5 and 6) before the data can be used.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
#    The soft file path was resolved in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
#    The preview is used in Step 6 to pick the identifier column and the gene symbol column.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['ILMN_1762337', 'ILMN_2055271', 'ILMN_1736007', 'ILMN_2383229', 'ILMN_1806310'], 'SPECIES': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'SOURCE': ['RefSeq', 'RefSeq', 'RefSeq', 'RefSeq', 'RefSeq'], 'SEARCH_KEY': ['NM_182762.2', 'NM_130786.2', 'NM_130786.2', 'NM_138932.1', 'NM_138933.1'], 'TRANSCRIPT': ['ILMN_183371', 'ILMN_175569', 'ILMN_18893', 'ILMN_18532', 'ILMN_7300'], 'ILMN_GENE': ['7A5', 'A1BG', 'A1BG', 'A1CF', 'A1CF'], 'SOURCE_REFERENCE_ID': ['NM_182762.2', 'NM_130786.2', 'NM_130786.2', 'NM_138932.1', 'NM_014576.2'], 'REFSEQ_ID': ['NM_182762.2', 'NM_130786.2', 'NM_130786.2', 'NM_138932.1', 'NM_014576.2'], 'UNIGENE_ID': [nan, nan, nan, nan, nan], 'ENTREZ_GENE_ID': [346389.0, 1.0, 1.0, 29974.0, 29974.0], 'GI': [47271497.0, 21071029.0, 21071029.0, 20357574.0, 20357571.0], 'ACCESSION': ['NM_182762.2', 'NM_130786.2', 'NM_130786.2', 'NM_138932.1', 'NM_014576.2'], 'SYMBOL': ['7A5', 'A1BG', 'A1BG', 'A1CF', 'A1CF'], '

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Mapping identifiers to gene symbols
#    In the annotation preview, 'ID' holds the 'ILMN_' identifiers that match the expression
#    matrix index, and 'SYMBOL' holds the gene symbols (e.g. 'A1BG').
identifier_key = 'ID'
gene_symbol_key = 'SYMBOL'

# 2. Get the dataframe storing the mapping between probe IDs and genes using the 'get_gene_mapping' function from the library.
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library, and name the resulting gene expression dataframe "gene_data".
#    NOTE: this rebinds `gene_data` from the probe-level matrix to the mapped result, so the cell
#    is not idempotent — re-run Step 3 before re-running this cell.
gene_data = apply_gene_mapping(gene_data, gene_mapping)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Cystic_Fibrosis/gene_data/GSE107846.csv'
# DataFrame.to_csv does not create missing parent directories; create them so a fresh run works.
# This also creates './preprocessed/Cystic_Fibrosis', which the merged CSV below is written into.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Cystic_Fibrosis')

# 4. Save the cohort information. This happens unconditionally, so the bias verdict is
#    recorded even when the merged data itself is not saved.
# NOTE(review): the record is built from `merged_data`, not `unbiased_merged_data` —
# confirm this is intended.
save_cohort_info('GSE107846', './preprocessed/Cystic_Fibrosis/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Cystic_Fibrosis/GSE107846.csv'
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Cystic_Fibrosis', the least common label is '0.0' with 12 occurrences. This represents 30.00% of the dataset.
The distribution of the feature 'Cystic_Fibrosis' in this dataset is fine.

Quartiles for 'Age':
  25%: 3.25
  50% (Median): 4.7085
  75%: 6.80825
Min: 1.083
Max: 9.917
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 20 occurrences. This represents 50.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

A new JSON file was created at: ./preprocessed/Cystic_Fibrosis/cohort_info.json
