In [1]:

import sys

# Make the project checkout importable (presumably where the `utils` package
# used by the next cell lives -- confirm). The membership guard keeps the cell
# idempotent: re-running it no longer piles duplicate entries onto sys.path.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1 -- resolve the SOFT file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Rheumatoid_Arthritis/GSE236924'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 2 -- read the matrix file, keeping the lines selected by the two prefix
# lists: series-level background and per-sample clinical characteristics
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 3 -- build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 4 -- print both pieces of information, each under its own heading
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"SIRPa agonist antibody treatment ameliorates experimental arthritis and colitis [array]"
!Series_summary	"The innate immune system is finely tuned to enable. rapid response to pathogenic stimuli but keep quiescent during tissue homeostasis. Balance of activating and inhibitory signaling sets a threshold for immune activation. Signal regulatory protein (SIRPa) is an immune inhibitory receptor expressed by myeloid cells and interacts with CD47 to inhibit immune cell phagocytosis, migration, and activation. Despite the progress of SIRPa and CD47 antagonist antibodies to promote anti-cancer immunity, it is not yet known whether therapeutic SIRPa receptor agonism could restrain excessive autoimmune inflammation in the context of autoimmunity. Here, we reported that increased neutrophil- and monocyte-associated genes including SIRPA in inflamed tissues biopsies of rheumatoid arthritis and inflammatory bowel diseases, and elevated SIRPA in colonic biopsi

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Based on the background information and Sample Characteristics Dictionary, we analyze gene availability and sample variables.

# Step 1: Check if gene expression data is available.
# The check must look at the series metadata loaded into `background_info`;
# testing the prefix literals themselves ("array" in "!Series_title") is
# always False and would leave is_gene_available permanently unset.
# str() is used because only the printed form of background_info is known here.
background_lines = str(background_info).splitlines()
series_title = ' '.join(
    line for line in background_lines if line.startswith('!Series_title')
).lower()
series_summary = ' '.join(
    line for line in background_lines if line.startswith('!Series_summary')
).lower()
if "array" in series_title or "gene expression" in series_summary:
    is_gene_available = True

# Step 2: Analyzing the Sample Characteristics Dictionary
# Rheumatoid_Arthritis data: the trait is expected in row 0 as 'disease: ...'
# values (e.g. 'disease: OA', 'disease: Control', 'disease: RA').
# Check the dictionary built from the real data instead of a pasted literal,
# which made the condition trivially true.
# NOTE(review): assumes sample_characteristics_dict maps row index -> list of
# unique values -- confirm against get_unique_values_by_row.
if 0 in sample_characteristics_dict and any(
    str(value).startswith('disease:') for value in sample_characteristics_dict[0]
):
    trait_row = 0

# Defining data conversion functions
def convert_trait(value: str):
    """Convert a Rheumatoid Arthritis characteristic value to a binary label.

    Accepts either a raw 'key: label' characteristic string such as
    'disease: RA' or a bare label such as 'RA'.

    Args:
        value: The sample characteristic cell to convert.

    Returns:
        1 when the label is 'RA', 0 when it is 'Control' or 'OA', and None
        for anything else (including non-string cells such as None or NaN).
    """
    # Missing cells can arrive as None/NaN; treat them as unknown instead of
    # raising TypeError on the membership test.
    if not isinstance(value, str):
        return None

    # Keep only the text after the first colon so the label is compared
    # exactly. Plain substring matching would flag any value that merely
    # contains "RA" (e.g. 'tissue: BRAIN') as a case.
    label = value.split(':', 1)[-1].strip().lower()

    if label == 'ra':
        return 1
    if label in ('control', 'oa'):
        return 0
    return None

# Since age and gender are not explicitly present in the characteristics dictionary, we cannot convert them.
# Hence, age_row and gender_row remain None and no conversion functions are defined for them.

# Record this cohort's flags: whether gene data is available and whether a
# trait row was identified
save_cohort_info(
    'GSE236924',
    './preprocessed/Rheumatoid_Arthritis/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction runs only when a trait row was found
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Rheumatoid_Arthritis',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Write the selected features to CSV, then print a preview of them
    csv_path = './preprocessed/Rheumatoid_Arthritis/trait_data/GSE236924.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM7585682': [0], 'GSM7585683': [0], 'GSM7585684': [0], 'GSM7585685': [0], 'GSM7585686': [1], 'GSM7585687': [0], 'GSM7585688': [0], 'GSM7585689': [0], 'GSM7585690': [0], 'GSM7585691': [1], 'GSM7585692': [0], 'GSM7585693': [1], 'GSM7585694': [0], 'GSM7585695': [0], 'GSM7585696': [0], 'GSM7585697': [1], 'GSM7585698': [1], 'GSM7585699': [0], 'GSM7585700': [0], 'GSM7585701': [1], 'GSM7585702': [0], 'GSM7585703': [0], 'GSM7585704': [0], 'GSM7585705': [0], 'GSM7585706': [0], 'GSM7585707': [0], 'GSM7585708': [1], 'GSM7585709': [1], 'GSM7585710': [0], 'GSM7585711': [0], 'GSM7585712': [0], 'GSM7585713': [1], 'GSM7585714': [0], 'GSM7585715': [0], 'GSM7585716': [1], 'GSM7585717': [0], 'GSM7585718': [0], 'GSM7585719': [0], 'GSM7585720': [0], 'GSM7585721': [0], 'GSM7585722': [1], 'GSM7585723': [1], 'GSM7585724': [0], 'GSM7585725': [0], 'GSM7585726': [0], 'GSM7585727': [0], 'GSM7585728': [0], 'GSM7585729': [1], 'GSM7585730': [0], 'GSM7585731': [0], 'GSM7585732': [0], 'GSM7585733': [0], 'GSM7585734