In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Celiac_Disease/GSE87629'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull background information and sample characteristics out of the matrix file,
#    selecting lines by the prefixes listed below
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background information and the sample characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Genome-wide analysis of B and T cell gene expression during a six-week gluten challenge in patients with celiac disease"
!Series_summary	"Dietary gluten proteins (prolamins) from wheat, rye, and barley are the driving forces behind celiac disease, an organ-specific autoimmune disorder that targets both the small intestine and organs outside the gut. In the small intestine, gluten induces inflammation and a typical morphological change of villous atrophy and crypt hyperplasia. Gut lesions improve and heal when gluten is excluded from the diet and the disease relapses when patients consume gluten. Oral immune tolerance towards gluten may be kept for years or decades before breaking tolerance in genetically susceptible individuals. Celiac disease provides a unique opportunity to study autoimmunity and the transition in immune cells as gluten breaks oral tolerance. Seventy-three celiac disease patients on a long-term gluten-free diet ingested a known 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; the real converters are defined further down in this cell.
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data.
# Only string entries are inspected: a non-string entry (e.g. a NaN for a missing
# value, if present) has no .lower() and would otherwise raise AttributeError.
# NOTE(review): this scans the sample characteristics only; background_info
# (series title/summary) is not consulted -- confirm that is intended.
is_gene_available = any(
    'gene expression' in info.lower() or 'microarray' in info.lower()
    for info_list in sample_characteristics_dict.values()
    for info in info_list
    if isinstance(info, str)
)

# Identify the available rows for each variable.
# .get(1, []) keeps a dictionary without row 1 from raising KeyError; in that
# case the trait is simply reported as unavailable (trait_row = None).
trait_row = 1 if any(
    isinstance(info, str) and 'celiac disease' in info.lower()
    for info in sample_characteristics_dict.get(1, [])
) else None
age_row = None  # No age information found in given data
gender_row = None  # No gender information found in given data
# Define the conversion functions
def convert_trait(value):
    """Map a raw sample-characteristics value to a binary trait label.

    Matching is a case-insensitive substring search; 'control' is checked
    before 'gluten challenge'.

    Args:
        value: Raw cell value. Anything that is not a string (None, a float
            NaN for a missing cell, a number, ...) is treated as missing.

    Returns:
        0 if the text contains 'control', 1 if it contains 'gluten challenge',
        otherwise None.
    """
    # A truthiness test is not enough here: NaN is truthy but has no .lower(),
    # so the type is checked explicitly.
    if not isinstance(value, str):
        return None
    text = value.lower()
    if 'control' in text:
        return 0
    if 'gluten challenge' in text:
        return 1
    return None

# Age and gender rows are not available (age_row / gender_row are None),
# so their converters map every input to None.
def convert_age(value):
    """Placeholder age converter: always returns None."""
    return None


def convert_gender(value):
    """Placeholder gender converter: always returns None."""
    return None


# Save metadata; the last argument is whether a trait row was found
save_cohort_info(
    'GSE87629',
    './preprocessed/Celiac_Disease/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction (only attempted when a trait row was found).
# Any failure is reported on stdout instead of aborting the cell.
if trait_row is not None:
    try:
        selected_clinical_data = geo_select_clinical_features(
            clinical_data, 'Celiac_Disease',
            trait_row, convert_trait,
            age_row, convert_age,
            gender_row, convert_gender,
        )
        csv_path = './preprocessed/Celiac_Disease/trait_data/GSE87629.csv'
        selected_clinical_data.to_csv(csv_path)
        print(preview_df(selected_clinical_data))
    except Exception as err:
        print(f"Clinical feature extraction error: {err}")


{'GSM2335776': [None], 'GSM2335777': [None], 'GSM2335778': [None], 'GSM2335779': [None], 'GSM2335780': [None], 'GSM2335781': [None], 'GSM2335782': [None], 'GSM2335783': [None], 'GSM2335784': [None], 'GSM2335785': [None], 'GSM2335786': [None], 'GSM2335787': [None], 'GSM2335788': [None], 'GSM2335789': [None], 'GSM2335790': [None], 'GSM2335791': [None], 'GSM2335792': [None], 'GSM2335793': [None], 'GSM2335794': [None], 'GSM2335795': [None], 'GSM2335796': [None], 'GSM2335797': [None], 'GSM2335798': [None], 'GSM2335799': [None], 'GSM2335800': [None], 'GSM2335801': [None], 'GSM2335802': [None], 'GSM2335803': [None], 'GSM2335804': [None], 'GSM2335805': [None], 'GSM2335806': [None], 'GSM2335807': [None], 'GSM2335808': [None], 'GSM2335809': [None], 'GSM2335810': [None], 'GSM2335811': [None], 'GSM2335812': [None], 'GSM2335813': [None], 'GSM2335814': [None], 'GSM2335815': [None], 'GSM2335816': [None], 'GSM2335817': [None], 'GSM2335820': [None], 'GSM2335821': [None], 'GSM2335822': [None], 'GSM23358