In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Irritable_bowel_syndrome_(IBS)/GSE25220'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Transcription levels in human colon biopsies in IBS and IBD patients before and after participating in a high red-meat dietary intervention"
!Series_summary	"Study 1: Transcriptomic profiles in colon tissue from inflammatory bowel diseases patients in relation to N-nitroso compound exposure and colorectal cancer risk"
!Series_summary	""
!Series_summary	"Study 1: N-nitroso compounds (NOC) have been suggested to play a role in human cancer development but definitive evidence is still lacking. In this study we investigated gene expression modifications induced in human colon tissue in relation to NOC exposure to gain insight in the relevance of these compounds in human colorectal cancer (CRC) development. Since there are indications that inflammation stimulates endogenous NOC formation, the study population consisted of patients with inflammatory bowel disease (IBD) and irritable bowel syndrome patients as controls without inflammation. Strong transc

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_age = None  # define the functions when applicable

# Step 1: Check if gene expression data is available
# Based on the series title and summaries, it seems like transcriptional (gene expression) data is being studied.
is_gene_available = True

# Step 2: Determine the availability of variables and data type conversion

# 2.1 Data Availability
# The keys in the sample characteristics dictionary give us disease status, gender, and age.
# Let's map the variables:

# For 'Irritable_bowel_syndrome_(IBS)': Check "disease status" which seems to be at key 1.
# Since key 1 holds IBS information, it is suitable to track the trait.
if 'IBS' in sample_characteristics_dict[1]:
    trait_row = 1

# For 'age': Check "age" which appears to be key 3.
if len(set(sample_characteristics_dict[3])) > 1:
    age_row = 3

# For 'gender': Check "gender" which seems to be at key 2.
if len(set(sample_characteristics_dict[2])) > 1:
    gender_row = 2

# 2.3 Data Type Conversion
# Conversion functions

# For 'Irritable_bowel_syndrome_(IBS)' (binary: IBS or IBD)
def convert_trait(value):
    if "IBS" in value:
        return 1
    elif "IBD" in value:
        return 0
    return None

# For 'age' (continuous)
def convert_age(value):
    try:
        return float(value.split(":")[1].strip())
    except ValueError:
        return None

# For 'gender' (binary: male or female)
def convert_gender(value):
    gender = value.split(":")[1].strip().lower()
    if gender == 'male':
        return 1
    elif gender == 'female':
        return 0
    return None

# Save Metadata
save_cohort_info('GSE25220', './preprocessed/Irritable_bowel_syndrome_(IBS)/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Irritable_bowel_syndrome_(IBS)', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Irritable_bowel_syndrome_(IBS)/trait_data/GSE25220.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Irritable_bowel_syndrome_(IBS)/cohort_info.json
