In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Peptic_ulcer_disease/GSE60427'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Read the matrix file. The two prefix lists are handed to the helper as-is;
# presumably it keeps the matrix-file lines starting with these prefixes
# (verify against utils.preprocess).
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information; sep="\n" puts each heading on its own line,
# exactly as separate print() calls would.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Toll-like receptor 10 in Helicobacter pylori infection"
!Series_summary	"Helicobacter pylori is a highly successful and important human pathogen that causes chronic gastritis, peptic ulcer diseases and gastric cancer. Innate immunity plays an important role of the primary defense against pathogens and epidemiological studies have suggested a role of toll-like receptor 1 (TLR1) in the risk of H. pylori acquisition. We performed microarray analysis of gastric mucosal biopsy specimens from H. pylori-positive and uninfected subjects; infection was associated with an ~15-fold up-regulation of TLR10 (p <0.001). Quantitative RT-PCR confirmed  TLR10 mRNA levels were increased 3-fold in H. pylori-infection (p <0.001) and immunohistochemistory using anti-TLR10 polyclonal antibodies showed increased TLR10 expression in gastric epithelial cells of infected individuals. In vitro experiments where H. pylori was co-cultured with NCI-N87 gastric cells showed sign

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders: convert_trait stays None because no trait row was
# identified; convert_age / convert_gender start as None and are expected to be
# rebound to real functions later in this cell.
convert_trait = convert_age = convert_gender = None

# Gene expression data: marked available because the series summary describes
# microarray analysis of gastric mucosal biopsy specimens.
is_gene_available = True

# Peptic_ulcer_disease trait: no explicit variable for it was found in the
# sample characteristics, so there is no row to point at.
trait_row = None

# Row keys in the sample characteristics dictionary
# (as read from the printed dictionary; re-check if the dataset changes).
gender_row = 1  # gender values live under key 1
age_row = 2     # age values live under key 2

# Define convert_age function
def convert_age(value):
    """Parse the age out of a ``"<label>: <number>"`` characteristic string.

    Args:
        value: Raw cell value, expected to look like ``"age: 45"``.

    Returns:
        The text after the first colon converted to ``float``, or ``None``
        when the value is not a string, has no colon, or the text after the
        colon is not numeric.
    """
    try:
        return float(value.split(':')[1])
    # Only swallow the failures that parsing can legitimately produce; a bare
    # ``except`` would also hide KeyboardInterrupt / SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

# Define convert_gender function
def convert_gender(value):
    """Map a ``"<label>: <gender>"`` characteristic string to a binary code.

    Args:
        value: Raw cell value, expected to look like ``"gender: M"``.

    Returns:
        ``1`` for male (``m`` / ``male``), ``0`` for female (``f`` /
        ``female``), case-insensitively; ``None`` for anything else, including
        non-string values and strings without a colon.
    """
    try:
        gender = value.split(':')[1].strip().lower()
    # Mirror convert_age: malformed or missing cells yield None instead of
    # raising and aborting the whole conversion.
    except (AttributeError, IndexError, TypeError):
        return None

    if gender in ('m', 'male'):
        return 1  # Male
    if gender in ('f', 'female'):
        return 0  # Female
    return None

# Record this cohort's status via the project helper: cohort ID, target JSON
# path, the gene-availability flag, and whether a trait row was identified.
save_cohort_info(
    'GSE60427',
    './preprocessed/Peptic_ulcer_disease/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)
