In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Irritable_bowel_syndrome_(IBS)/GSE20881'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Colon biopsies from Crohns patients and healthy controls"
!Series_summary	"The aim of this study was to investigate differential intestinal gene expression in patients with Crohn's disease (CD) and controls.  172 biopsies from CD and control subjects were studied.  Endoscopic biopsies were taken at ileocolonoscopy from five specific anatomical locations including the terminal ileum for RNA extraction."
!Series_overall_design	"Biopsies from one of four anatomic locations, from healthy controls and treated (with non-biologic standard of care) or untreated CD patients."
Sample Characteristics Dictionary:
{0: ['patient: 101', 'patient: 102', 'patient: 103', 'patient: 105', 'patient: 106', 'patient: 108', 'patient: 104', 'patient: 107', 'patient: 109', 'patient: 110', 'patient: 111', 'patient: 112', 'patient: 113', 'patient: 114', 'patient: 115', 'patient: 116', 'patient: 117', 'patient: 119', 'patient: 120', 'patient: 121', 'patient: 122', 'patient: 1

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check if gene expression data is available
is_gene_available = True  # Assuming from context that gene expression data is present

# Determine the keys for each variable
trait_row = 57  # Based on the "other illnesses" which lists IBS in some samples
age_row = 2    # Based on "birth date"
gender_row = None  # No explicit gender information provided

# Define the conversion functions
def extract_value(cell):
    try:
        return cell.split(": ")[1]
    except IndexError:
        return None

def convert_trait(value):
    value = extract_value(value)
    if value and "Irritable bowel syndrome" in value:
        return 1
    return None

def convert_age(value):
    value = extract_value(value)
    try:
        birth_year = int(value.split("/")[-1])
        age = 2005 - birth_year  # Assuming procedure or data collection happened in 2005
        return age
    except (ValueError, IndexError):
        return None

convert_gender = None  # No gender data to convert

# Save cohort information
save_cohort_info('GSE20881', './preprocessed/Irritable_bowel_syndrome_(IBS)/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Irritable_bowel_syndrome_(IBS)', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Irritable_bowel_syndrome_(IBS)/trait_data/GSE20881.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM522094': [None, 1942], 'GSM522095': [None, 1943], 'GSM522096': [None, 1940], 'GSM522097': [None, 1921], 'GSM522098': [None, 1921], 'GSM522099': [None, 1918], 'GSM522100': [None, 1972], 'GSM522101': [None, 1933], 'GSM522102': [None, 945], 'GSM522103': [None, 1948], 'GSM522104': [None, 1951], 'GSM522105': [None, 1923], 'GSM522106': [None, 1924], 'GSM522107': [None, 1924], 'GSM522108': [None, 1969], 'GSM522109': [None, 1977], 'GSM522110': [None, 1949], 'GSM522111': [None, 1938], 'GSM522112': [None, 1942], 'GSM522113': [None, 1933], 'GSM522114': [None, 1951], 'GSM522115': [None, 1942], 'GSM522116': [None, 1940], 'GSM522117': [None, 1933], 'GSM522118': [None, 1921], 'GSM522119': [None, 1918], 'GSM522120': [None, 945], 'GSM522121': [None, 1943], 'GSM522122': [None, 1948], 'GSM522123': [None, 1951], 'GSM522124': [None, 1923], 'GSM522125': [None, 1924], 'GSM522126': [None, 1969], 'GSM522127': [None, 1977], 'GSM522128': [None, 1949], 'GSM522129': [None, 1938], 'GSM522130': [None, 1940], 'G

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])
    

# Based on the row headers provided, it appears they are numerical and not human gene symbols.


Index(['1', '3', '4', '5', '6', '9', '10', '12', '13', '17', '18', '19', '20',
       '23', '24', '25', '26', '27', '29', '31'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1', '2', '3', '4', '5'], 'COL': ['103', '103', '103', '103', '103'], 'ROW': [430.0, 428.0, 426.0, 424.0, 422.0], 'NAME': ['BrightCorner', 'NegativeControl', 'NM_001003689', 'NM_005503', 'NM_004672'], 'SPOT_ID': ['BrightCorner', 'NegativeControl', 'A_23_P80353', 'A_23_P158231', 'A_32_P223017'], 'CONTROL_TYPE': ['pos', 'neg', 'FALSE', 'FALSE', 'FALSE'], 'REFSEQ': [nan, nan, 'NM_001003689', 'NM_005503', 'NM_004672'], 'GB_ACC': [nan, nan, 'NM_001003689', 'NM_005503', 'NM_004672'], 'GENE': [nan, nan, 83746.0, 321.0, 9064.0], 'GENE_SYMBOL': [nan, nan, 'L3MBTL2', 'APBA2', 'MAP3K6'], 'GENE_NAME': [nan, nan, 'l(3)mbt-like 2 (Drosophila)', 'amyloid beta (A4) precursor protein-binding, family A, member 2 (X11-like)', 'mitogen-activated protein kinase kinase kinase 6'], 'UNIGENE_ID': [nan, nan, 'Hs.517641', 'Hs.525718', 'Hs.194694'], 'ENSEMBL_ID': [nan, nan, 'ENST00000216237', 'ENST00000219865', 'ENST00000357582'], 'TIGR_ID': [nan, nan, 'THC2264916', 'THC2241506',

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the key storing the same kind of identifiers as in STEP3 and the key storing the gene symbols.
identifier_key = 'ID'
gene_symbol_key = 'GENE_SYMBOL'

# 2. Get the dataframe storing the mapping between probe IDs and gene symbols using the 'get_gene_mapping' function from the library.
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library, and name the resulting gene expression dataframe 'gene_data'.
gene_data = apply_gene_mapping(gene_data, gene_mapping)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Irritable_bowel_syndrome_(IBS)/gene_data/GSE20881.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Irritable_bowel_syndrome_(IBS)')

# If the trait is not severely biased, save the cohort information and the merged data.

# 4. Save the cohort information.
save_cohort_info('GSE20881', './preprocessed/Irritable_bowel_syndrome_(IBS)/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Irritable_bowel_syndrome_(IBS)/GSE20881.csv'
    unbiased_merged_data.to_csv(csv_path)


Quartiles for 'Irritable_bowel_syndrome_(IBS)':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1.0
Max: 1.0
The distribution of the feature 'Irritable_bowel_syndrome_(IBS)' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 1928.0
  50% (Median): 1928.0
  75%: 1928.0
Min: 1928.0
Max: 1928.0
The distribution of the feature 'Age' in this dataset is severely biased.

