In [1]:

import sys

# Make the local project checkout importable (presumably where the `utils`
# package used by the following cells lives — confirm for your machine).
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path a second time.
PROJECT_ROOT = '/home/techt/Desktop/a4s'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Irritable_bowel_syndrome_(IBS)/GSE63379'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes that select the series-level description and the per-sample
# fields when reading the matrix file.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# Summarise every sample-characteristics row by its distinct values.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both summaries; the following step is decided by reading them.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Genome-wide Expression Profiling in Irritable Bowel Syndrome"
!Series_summary	"Differential gene expression profiling in peripheral blood mononuclear cells (PBMCs) was performed using Human Transcriptome Array 2 (HTA2)"
!Series_overall_design	"Expression profiles of peripheral blood mononuclear cell (PBMCs) from 35 IBS samples and 32 healthy control was assessed."
Sample Characteristics Dictionary:
{0: ['disease status: healthy', 'disease status: IBS'], 1: ['tissue: blood'], 2: ['cell type: peripheral blood mononuclear cells']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability decisions for this cohort, read off the Step 1 output.

# The series summary describes expression profiling on the HTA2 array,
# so gene expression data is treated as available.
is_gene_available = True

# Row 0 of the sample characteristics holds 'disease status: healthy' and
# 'disease status: IBS', i.e. the trait itself with two distinct values.
trait_row = 0

# The sample characteristics contain no age row and no gender row.
age_row = None
gender_row = None

# Define conversion function for trait
def convert_trait(value):
    """Convert a 'disease status' characteristics cell into a binary trait.

    Args:
        value: A cell such as 'disease status: IBS'. The label is the text
            after the first colon; a string without a colon is treated as
            the label itself.

    Returns:
        1 for 'IBS', 0 for 'healthy' (case-insensitive, surrounding
        whitespace ignored), and None for anything else, including
        non-string cells such as None or NaN.
    """
    # Missing cells arrive as None/NaN rather than str; they carry no label.
    if not isinstance(value, str):
        return None

    # Keep the original "second field" rule when a colon is present, but do
    # not raise IndexError when the prefix is absent.
    fields = value.split(':')
    label = (fields[1] if len(fields) > 1 else fields[0]).strip().lower()

    if label == "ibs":
        return 1
    if label == "healthy":
        return 0
    return None

# There is no age or gender row in this cohort, so no converters are supplied.
convert_age = None
convert_gender = None

# Record this cohort's gene/trait availability via save_cohort_info.
save_cohort_info(
    'GSE63379',
    './preprocessed/Irritable_bowel_syndrome_(IBS)/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Extract the clinical features only when a trait row was identified,
# then write them to CSV and print a preview.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Irritable_bowel_syndrome_(IBS)',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Irritable_bowel_syndrome_(IBS)/trait_data/GSE63379.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1547708': [0], 'GSM1547709': [0], 'GSM1547710': [0], 'GSM1547711': [0], 'GSM1547712': [0], 'GSM1547713': [0], 'GSM1547714': [0], 'GSM1547715': [0], 'GSM1547716': [0], 'GSM1547717': [0], 'GSM1547718': [0], 'GSM1547719': [0], 'GSM1547720': [0], 'GSM1547721': [0], 'GSM1547722': [0], 'GSM1547723': [0], 'GSM1547724': [0], 'GSM1547725': [0], 'GSM1547726': [0], 'GSM1547727': [0], 'GSM1547728': [0], 'GSM1547729': [0], 'GSM1547730': [0], 'GSM1547731': [0], 'GSM1547732': [0], 'GSM1547733': [0], 'GSM1547734': [0], 'GSM1547735': [0], 'GSM1547736': [0], 'GSM1547737': [0], 'GSM1547738': [0], 'GSM1547739': [0], 'GSM1547740': [1], 'GSM1547741': [1], 'GSM1547742': [1], 'GSM1547743': [1], 'GSM1547744': [1], 'GSM1547745': [1], 'GSM1547746': [1], 'GSM1547747': [1], 'GSM1547748': [1], 'GSM1547749': [1], 'GSM1547750': [1], 'GSM1547751': [1], 'GSM1547752': [1], 'GSM1547753': [1], 'GSM1547754': [1], 'GSM1547755': [1], 'GSM1547756': [1], 'GSM1547757': [1], 'GSM1547758': [1], 'GSM1547759': [1], 'GSM1547760

### Step 3: Gene Data Extraction

In [4]:
# Load the gene data from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the leading 20 row identifiers; the next step inspects them.
leading_row_ids = gene_data.index[:20]
print(leading_row_ids)


Index(['2824546_st', '2824549_st', '2824551_st', '2824554_st', '2827992_st',
       '2827995_st', '2827996_st', '2828010_st', '2828012_st', '2835442_st',
       '2835447_st', '2835453_st', '2835456_st', '2835459_st', '2835461_st',
       '2839509_st', '2839511_st', '2839513_st', '2839515_st', '2839517_st'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '2824546_st') are not gene symbols —
# they look like platform probe-set identifiers — so a mapping to genes is
# required before the data can be used at gene level.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the gene annotation from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Print a labelled preview so the identifier and gene columns can be chosen
# in the next step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'probeset_id': ['TC01000001.hg.1', 'TC01000002.hg.1', 'TC01000003.hg.1', 'TC01000004.hg.1', 'TC01000005.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['11869', '29554', '69091', '160446', '317811'], 'stop': ['14409', '31109', '70008', '161525', '328581'], 'total_probes': [49.0, 60.0, 30.0, 30.0, 191.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000456328 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102', 'ENST00000408384 // MIR1302-11 // microRNA 1302-11 // --- // 100422919 /// ENST00000408384 // MIR1302-10 // microRNA 1302-10 // --- // 100422834 /// ENST0000040838

### Step 6: Gene Identifier Mapping

In [7]:
# Annotation columns used for the mapping, both visible in the Step 5 preview:
# 'ID' is the identifier column and 'gene_assignment' carries the gene text.
identifier_key, gene_symbol_key = 'ID', 'gene_assignment'

# NOTE(review): the Step 5 preview shows IDs like 'TC01000001.hg.1', whereas
# the Step 3 row IDs look like '2824546_st'. Only the first annotation rows
# were previewed — confirm that the 'ID' column also contains the expression
# row IDs, otherwise the mapping may match nothing.

# Build the identifier-to-gene mapping table from the annotation.
gene_mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# Apply the mapping to the expression data.
# NOTE(review): gene_data is rebound to the mapped result, so re-running this
# cell passes already-mapped data back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, gene_mapping_df)
