In [1]:

import sys

# Make the project checkout importable so that `utils.preprocess` (imported in
# the next cell) can be resolved -- presumably the `utils` package lives here.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): this is a machine-specific absolute path; adjust it to your
# own checkout before running.
PROJECT_ROOT = '/home/techt/Desktop/a4s'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Crohns_Disease/GSE207022'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Pull the series-level background text and the per-sample clinical
# annotations out of the matrix file, using the prefix lists below.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# Summarise the clinical table as {row key: list of unique values}.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces so the next step can pick the trait / age / gender rows.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Efficacy and safety of ustekinumab treatment in patients with Crohn's disease"
!Series_summary	"UNITI-2 was a phase 3 clinical trial (ClinicalTrials.gov Identifier: NCT01369342) comparing the effects (both positive and negative) of an initial treatment with ustekinumab to a placebo over 8 weeks in patients with moderately to severely active Crohn's disease."
!Series_overall_design	"A gene expression profiling study was conducted in which rectum biopsy samples were collected for RNA extraction and hybridization to microarrays from patients (n=125) with moderate-to-severe Crohn's disease and from non-IBD subjects (n=23)."
Sample Characteristics Dictionary:
{0: ['tissue: rectum'], 1: ['donor id: CNTO1275CRD3002-20554', 'donor id: CNTO1275CRD3002-20667', 'donor id: CNTO1275CRD3002-20449', 'donor id: CNTO1275CRD3002-20927', 'donor id: CNTO1275CRD3002-20270', 'donor id: CNTO1275CRD3002-20072', 'donor id: CNTO1275CRD3002-20109', 'donor id: CNTO1275CRD300

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data is available: the series design describes RNA from
# rectum biopsies hybridized to microarrays.
is_gene_available = True

# Row keys in `sample_characteristics_dict` for each variable; None means the
# variable is not recorded for this cohort.
trait_row = age_row = gender_row = None

# Inspect the dictionary computed from the matrix file in Step 1 (not a
# hand-copied literal), so the decision always reflects the actual data.
# Row 3 holds the diagnosis; it is only usable as the trait when both classes
# that `convert_trait` can label (cases and healthy controls) are present.
diagnosis_values = [
    v for v in sample_characteristics_dict.get(3, []) if isinstance(v, str)
]
has_cases = any("Crohn's disease" in v for v in diagnosis_values)
has_controls = any("healthy control" in v for v in diagnosis_values)
if has_cases and has_controls:
    trait_row = 3

# None of the sample-characteristics rows (tissue, donor id, visit, diagnosis,
# treatment, inflamed area, mucosal healing) reports age or gender, so
# `age_row` and `gender_row` stay None.

# Define conversion functions
def convert_trait(value):
    """Map a diagnosis cell to a binary Crohn's disease label.

    Args:
        value: A sample-characteristics cell, e.g.
            "diagnosis: Crohn's disease" or "diagnosis: healthy control".

    Returns:
        1 if the text mentions "Crohn's disease", 0 if it mentions
        "healthy control", and None otherwise (including non-string cells
        such as None or NaN).
    """
    # Missing cells are not strings; a substring test on them would raise
    # TypeError, so treat them as "unknown" instead.
    if not isinstance(value, str):
        return None
    if "Crohn's disease" in value:
        return 1
    if "healthy control" in value:
        return 0
    return None

def convert_age(value):
    """Parse the numeric age out of a "key: value" cell.

    Args:
        value: A sample-characteristics cell such as "age: 45".

    Returns:
        The text between the first and second colon converted to float, or
        None when the cell is not a string, contains no colon, or the text
        is not numeric.
    """
    try:
        return float(value.split(":")[1].strip())
    # Catch only the parsing failures; a bare `except` would also swallow
    # KeyboardInterrupt / SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Map a "key: value" gender cell to a binary code.

    Args:
        value: A sample-characteristics cell such as "gender: Male".

    Returns:
        1 for "male", 0 for "female" (case-insensitive, surrounding
        whitespace ignored), and None for any other text, for cells without
        a colon, and for non-string cells such as None or NaN.
    """
    try:
        gender_str = value.split(":")[1].strip().lower()
    # Malformed or missing cells yield None, mirroring convert_age, instead
    # of aborting the whole feature extraction.
    except (AttributeError, IndexError, TypeError):
        return None
    if gender_str == 'male':
        return 1
    if gender_str == 'female':
        return 0
    return None

# Record whether this cohort offers gene expression data and a usable trait.
save_cohort_info(
    'GSE207022',
    './preprocessed/Crohns_Disease/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted only when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Crohns_Disease',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Persist the selected clinical table, then show a short preview of it.
    csv_path = './preprocessed/Crohns_Disease/trait_data/GSE207022.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6268367': [1], 'GSM6268368': [1], 'GSM6268369': [1], 'GSM6268370': [1], 'GSM6268371': [1], 'GSM6268372': [1], 'GSM6268373': [1], 'GSM6268374': [1], 'GSM6268375': [0], 'GSM6268376': [0], 'GSM6268377': [0], 'GSM6268378': [0], 'GSM6268379': [0], 'GSM6268380': [0], 'GSM6268381': [0], 'GSM6268382': [0], 'GSM6268383': [0], 'GSM6268384': [0], 'GSM6268385': [0], 'GSM6268386': [0], 'GSM6268387': [0], 'GSM6268388': [0], 'GSM6268389': [0], 'GSM6268390': [0], 'GSM6268391': [0], 'GSM6268392': [0], 'GSM6268393': [0], 'GSM6268394': [0], 'GSM6268395': [0], 'GSM6268396': [0], 'GSM6268397': [0], 'GSM6268398': [1], 'GSM6268399': [1], 'GSM6268400': [1], 'GSM6268401': [1], 'GSM6268402': [1], 'GSM6268403': [1], 'GSM6268404': [1], 'GSM6268405': [1], 'GSM6268406': [1], 'GSM6268407': [1], 'GSM6268408': [1], 'GSM6268409': [1], 'GSM6268410': [1], 'GSM6268411': [1], 'GSM6268412': [1], 'GSM6268413': [1], 'GSM6268414': [1], 'GSM6268415': [1], 'GSM6268416': [1], 'GSM6268417': [1], 'GSM6268418': [1], 'GSM6268419

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the expression matrix from the `matrix_file` located in Step 1,
#    using the library's get_genetic_data helper.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs so the next step can judge whether they are
#    already gene symbols or still need to be mapped.
print(gene_data.index[:20])


Index(['1007_PM_s_at', '1053_PM_at', '117_PM_at', '121_PM_at', '1255_PM_g_at',
       '1294_PM_at', '1316_PM_at', '1320_PM_at', '1405_PM_i_at', '1431_PM_at',
       '1438_PM_at', '1487_PM_at', '1494_PM_f_at', '1552256_PM_a_at',
       '1552257_PM_a_at', '1552258_PM_at', '1552261_PM_at', '1552263_PM_at',
       '1552264_PM_a_at', '1552266_PM_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '1007_PM_s_at', '1053_PM_at') look like
# microarray probe IDs rather than gene symbols, so a probe-to-gene mapping
# step is required before the data can be used.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Read the annotation table from the SOFT file located in Step 1, using
#    the library's get_gene_annotation helper.
gene_annotation = get_gene_annotation(soft_file)

# 2. Preview the annotation columns so the identifier column and the gene
#    symbol column can be chosen for the mapping step.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1007_PM_s_at', '1053_PM_at', '117_PM_at', '121_PM_at', '1255_PM_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Aug 20, 2010', 'Aug 20, 2010', 'Aug 20, 2010', 'Aug 20, 2010', 'Aug 20, 2010'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X

### Step 6: Gene Identifier Mapping

In [7]:
# Runs only when Step 4 decided that the row IDs are not gene symbols.
if requires_gene_mapping:
    # 1. Annotation columns used for the mapping: 'ID' matches the row IDs of
    #    gene_data (both start with '1007_PM_s_at').
    #    NOTE(review): 'Gene Symbol' is not visible in the truncated annotation
    #    preview -- assumed to be a column of gene_annotation; confirm.
    identifier_key = 'ID'
    gene_symbol_key = 'Gene Symbol'

    # 2. Get the dataframe storing the mapping between probe IDs and genes
    gene_mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

    # 3. Apply the mapping to get the gene expression dataframe.
    #    NOTE(review): gene_data is overwritten in place, so re-running this
    #    cell would pass the already-mapped frame back into the mapping;
    #    re-run Step 3 first if this cell has to be executed again.
    gene_data = apply_gene_mapping(gene_data, gene_mapping_df)

    # Print a preview of the mapped gene_data
    print("Gene data preview:")
    print(preview_df(gene_data))


Gene data preview:
{'GSM6268367': [4.53, 6.22, 4.37, 2.83, 3.13], 'GSM6268368': [7.35, 6.5, 4.86, 2.6950000000000003, 3.0949999999999998], 'GSM6268369': [5.81, 6.46, 4.89, 2.7800000000000002, 3.0999999999999996], 'GSM6268370': [8.19, 6.98, 5.43, 3.01, 3.1799999999999997], 'GSM6268371': [8.16, 7.16, 5.34, 2.865, 3.2750000000000004], 'GSM6268372': [5.62, 6.57, 5.18, 2.89, 3.355], 'GSM6268373': [7.05, 7.33, 4.85, 3.035, 3.3200000000000003], 'GSM6268374': [6.7, 7.02, 4.58, 3.04, 3.4050000000000002], 'GSM6268375': [7.69, 6.78, 5.08, 2.75, 3.24], 'GSM6268376': [7.82, 6.23, 5.54, 3.105, 3.3], 'GSM6268377': [7.63, 6.74, 5.17, 3.0650000000000004, 3.2750000000000004], 'GSM6268378': [8.18, 7.02, 4.77, 3.085, 3.51], 'GSM6268379': [7.3, 7.19, 5.38, 3.0, 3.12], 'GSM6268380': [7.62, 6.61, 5.23, 2.9, 3.205], 'GSM6268381': [7.47, 7.1, 5.2, 3.1550000000000002, 3.38], 'GSM6268382': [7.01, 6.13, 4.79, 3.17, 3.4050000000000002], 'GSM6268383': [7.13, 6.41, 5.23, 3.135, 3.195], 'GSM6268384': [7.31, 6.7, 5.02

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index with the library's
#    'normalize_gene_symbols_in_index' function and save the result to CSV.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Crohns_Disease/gene_data/GSE207022.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
#    NOTE(review): selected_clinical_data is only assigned in Step 2 when a
#    trait row was found; without it this line raises NameError.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and any demographic attributes in the data are severely biased, and remove the biased attributes.
#    The helper returns a flag for the trait plus a second dataframe, stored
#    here as unbiased_merged_data.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Crohns_Disease')

# 4. Save the cohort information.
#    NOTE(review): this passes merged_data (not unbiased_merged_data) and
#    hard-coded True/True availability flags -- confirm that is what
#    save_cohort_info expects.
save_cohort_info('GSE207022', './preprocessed/Crohns_Disease/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Crohns_Disease/GSE207022.csv'
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Crohns_Disease', the least common label is '0.0' with 23 occurrences. This represents 15.54% of the dataset.
The distribution of the feature 'Crohns_Disease' in this dataset is fine.

