In [1]:

# Extend the module search path with the project checkout directory.
# NOTE(review): presumably this is what lets `utils.preprocess` (imported in the
# next cell) resolve — confirm. The path is absolute and machine-specific, so it
# must be adjusted when the notebook is run elsewhere.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Peptic_ulcer_disease/GSE32174'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file: lines with the background prefixes give the series
#    background information, lines with the clinical prefixes give the
#    sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the sample characteristics
#    dictionary, each preceded by its heading on a separate line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"microRNA profiling in duodenal ulcer disease caused by Helicobacter pylori infection in a Western population."
!Series_summary	"The aim of this study was to identify and assess the utility of miRNAs as diagnostic surrogate markers for H.pylori infection. For this purpose, we analyzed the miRNA expression profile by microarrays in the antral mucosa of well characterized dyspeptic patients and then applied the most significant set of miRNAs to an independent validation group. Our results shows that a set of miRNAs are deregulated during chronic gastric inflammation and that this set may be may be useful as a surrogate marker for determining the presence of H.pylori."
!Series_overall_design	"Total RNA from antral biopsies from patients with Helicobacter pylory (Hp) infection (with or without virulence factor caG) and healthy volunteers were isolated. MiRNA expression profiles were analyzed using miRNA microarray platform."
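Sample Characteristics Dictionary:
{0: ['gender: Male', 'gender: Female'], 1: ['age: 47', 'age: 43', 'age: 42', 'age: 33', 'age: 36', 'age: 35', 'age: 67', 'age: 38', 'age: 39', 'age: 56', 'age: 63', 'age: 58', 'age: 41', 'age: 64', 'age: 53', 'age: 21', 'age: 25', 'age: 55', 'age: 49', 'age: 45', 'age: 20', 'age: 68', 'age: 37', 'age: 65', 'age: 19', 'age: 80', 'age: 57', 'age: 66', 'age: 50', 'age: 27'], 2: ['hp_presence: yes', 'hp_presence: no'], 3: ['caga presence: yes', 'caga presence: no']}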

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: per the background information, this study
# profiles miRNA expression rather than gene expression, so the flag stays False.
is_gene_available = False

# Row indices and converters start out unset; they are filled in only when applicable.
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Sample characteristics dictionary copied from STEP 1
sample_characteristics = {
    0: ['gender: Male', 'gender: Female'],
    1: [
        f'age: {years}'
        for years in (
            47, 43, 42, 33, 36, 35, 67, 38, 39, 56,
            63, 58, 41, 64, 53, 21, 25, 55, 49, 45,
            20, 68, 37, 65, 19, 80, 57, 66, 50, 27,
        )
    ],
    2: ['hp_presence: yes', 'hp_presence: no'],
    3: ['caga presence: yes', 'caga presence: no'],
}

# Variable availability:
#   * none of the fields directly mentions 'Peptic_ulcer_disease', so trait_row stays None
#   * gender is taken from row 0 and age from row 1 when the expected entries are present
gender_row = 0 if 'gender: Male' in sample_characteristics[0] else gender_row
age_row = 1 if 'age: 47' in sample_characteristics[1] else age_row

# Define data type conversion functions
def convert_trait(value):
    """Placeholder trait converter that maps every input to None.

    No trait row is available for this cohort, so there is nothing to convert;
    `value` is accepted only to keep the converter call shape.
    """
    return None

def convert_age(value):
    """Parse the age out of a 'key: value' sample-characteristics string.

    Args:
        value: A string such as 'age: 47'. Non-string inputs are tolerated.

    Returns:
        The age as a float, or None when `value` is not a string, has no ':'
        separator, holds a non-numeric value, or holds a non-finite one
        (e.g. 'nan', 'inf').
    """
    import math  # local import so the function does not depend on another cell

    try:
        age = float(value.split(':')[1].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only the expected parse failures mean "missing". A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit, which must propagate.
        return None

    # float() happily parses 'nan' and 'inf'; neither is a usable age.
    return age if math.isfinite(age) else None

def convert_gender(value):
    """Encode the gender from a 'key: value' sample-characteristics string.

    Args:
        value: A string such as 'gender: Male'. Matching is case-insensitive
            and ignores surrounding whitespace. Non-string inputs are tolerated.

    Returns:
        1 for male, 0 for female, and None for any other label, for a string
        without a ':' separator, or for a non-string input.
    """
    try:
        gender = value.split(':')[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # Only the expected parse failures mean "missing". A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit, which must propagate.
        return None

    # Unrecognised labels fall through to None via dict.get.
    return {'male': 1, 'female': 0}.get(gender)

# Save cohort information.
# Arguments passed, in order: the cohort accession, the path of the cohort-info
# JSON file, the gene-availability flag, and whether a trait row was identified
# (`trait_row is not None`).
# NOTE(review): save_cohort_info is not defined in this notebook (presumably it
# comes from the `utils.preprocess` star import); what it writes to the JSON
# path should be confirmed against that helper.
save_cohort_info('GSE32174', './preprocessed/Peptic_ulcer_disease/cohort_info.json', is_gene_available, trait_row is not None)

# Since trait_row is None, no need to extract and save clinical feature data
