In [1]:

import sys

# Make the project checkout importable from this notebook (presumably where the
# `utils` package used by later cells lives -- confirm).
# Guarded so that re-running the cell does not keep appending duplicate entries
# to sys.path.
_PROJECT_ROOT = '/home/techt/Desktop/a4s'
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's SOFT file and series-matrix file ---
cohort_dir = '/media/techt/DATA/GEO/Underweight/GSE163902'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Pull background text and per-sample characteristics out of the matrix file ---
# Lines are selected by the prefixes listed below.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Collect the unique values found in each row of the clinical dataframe ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Show everything so the dataset can be inspected by eye ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Multipathogen analysis of IgA and IgG antigen specificity for selected pathogens in milk produced by women from diverse geographical regions: The INSPIRE Study [IgG]"
!Series_summary	"Breastfeeding provides defense against infectious disease during early life. The mechanisms underlying this protection are complex but likely include the vast array of immune cells and components, such as immunoglobulins, in milk. Simply characterizing the concentrations of these bioactives, however, provides only limited information regarding their potential relationships with disease risk in the recipient infant. Rather, understanding pathogen and antigen specificity profiles of milk-borne immunoglobulins might lead to a more complete understanding of how maternal immunity impacts infant health and wellbeing. Milk produced by women living in 11 geographically dispersed populations was applied to a protein microarray containing antigens from 16 pathogens, including 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: every row index and converter starts out unset.
trait_row = age_row = gender_row = None  # set to different values when applicable
# Fixed typo: the third target used to be `convert_age` again, which left
# `convert_gender` uninitialised at this point.
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
# Based on the description, it appears to be a protein microarray for IgG
# binding. No indication of gene expression data.
is_gene_available = False

# Check for data availability in variables
trait_row = None  # No explicit indication of 'Underweight' data
age_row = 1  # Age data is available at key 1
# NOTE(review): key 2 is labelled "infant's sex"; confirm that this is the
# gender that should be recorded for the sample.
gender_row = 2  # Gender data is inferred to be at key 2 ('infant's sex')

# Define data conversion functions

def convert_trait(value):
    """Placeholder trait converter for a dataset without underweight data.

    Args:
        value: Raw sample-characteristics cell; it is ignored.

    Returns:
        Always None.
    """
    return

def convert_age(value):
    """Parse an age from a ``"<label>: <number>"`` sample-characteristics string.

    Args:
        value: Raw cell value. The text between the first and second ':' is
            taken as the age.

    Returns:
        The age as a float, or None when ``value`` is not a string, contains
        no ':' separator, holds the literal "NA", or is not numeric.
    """
    if not isinstance(value, str):
        # Missing cells may be None/NaN rather than text; treat them as no data
        # instead of failing on `.split`.
        return None
    try:
        age = value.split(':')[1].strip()
        return float(age) if age != "NA" else None
    except (IndexError, ValueError):
        # IndexError: no ':' in the string. ValueError: the age is not numeric.
        return None

def convert_gender(value):
    """Map a ``"<label>: <code>"`` sample-characteristics string to a binary gender.

    Args:
        value: Raw cell value. The text between the first and second ':' is
            taken as the code.

    Returns:
        0 for code "0" (female), 1 for code "1" (male), and None when
        ``value`` is not a string, has no ':' separator, or carries any other
        code.
    """
    if not isinstance(value, str):
        # Missing cells may be None/NaN rather than text; treat them as no data
        # instead of failing on `.split`.
        return None
    parts = value.split(':')
    if len(parts) < 2:
        return None
    # "0" -> female, "1" -> male; every other code means no data.
    return {"0": 0, "1": 1}.get(parts[1].strip())

# Persist the cohort's availability flags: gene data flag first, then whether
# a trait row was identified.
save_cohort_info(
    'GSE163902',
    './preprocessed/Underweight/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are only selected and written out when a trait row exists.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Underweight',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Underweight/trait_data/GSE163902.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
