In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Psoriatic_Arthritis/GSE69371'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Autoantibodies and nucleic acids skew complement consumption in systemic lupus erythematosus [IgM]"
!Series_summary	"Systemic lupus erythematosus is a chronic autoimmune disease with multifactorial ethiopathogenesis. The complement system is involved in both the early and late stages of disease development and organ damage. To better understand autoantibody mediated complement consumption the GAPAID consortium examined ex vivo immune complex formation on autoantigen arrays.  We recruited patients with SLE (n=211), with other systemic autoimmune diseases (n=65) and non-autoimmune control subjects (n=149) in two rheumatology tertiary care centers. Standard clinical and laboratory data were collected from all subjects and serum complement levels were determined in SLE patients. The genotype of SNP rs1143679 in the ITGAM gene was also determined. On-chip formation of immune complexes was examined using a functional immunoassay on autoantigen microarra

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Placeholder for dataset type (assuming 'gene_expression' for this example)
dataset_type = 'gene_expression'

# Check gene expression data availability
if dataset_type == 'gene_expression':  # Assuming 'dataset_type' provides dataset type information
    is_gene_available = True

# Inspect Sample Characteristics Dictionary for variable availability
sample_char_dict = {
    0: ['gender: Female', 'gender: Male'],
    1: ["disease group (nhs-normal human serum; sle-systemic lupus erythematosus ;uctd-undifferentiated connective tissue disease; sjs-sjörgen's syndrome;ssc-systemic sclerosis; psa-psoriatic arthritis): SLE", 
        "disease group (nhs-normal human serum; sle-systemic lupus erythematosus ;uctd-undifferentiated connective tissue disease; sjs-sjörgen's syndrome;ssc-systemic sclerosis; psa-psoriatic arthritis): UCTD", 
        "disease group (nhs-normal human serum; sle-systemic lupus erythematosus ;uctd-undifferentiated connective tissue disease; sjs-sjörgen's syndrome;ssc-systemic sclerosis; psa-psoriatic arthritis): SjS", 
        "disease group (nhs-normal human serum; sle-systemic lupus erythematosus ;uctd-undifferentiated connective tissue disease; sjs-sjörgen's syndrome;ssc-systemic sclerosis; psa-psoriatic arthritis): NHS", 
        "disease group (nhs-normal human serum; sle-systemic lupus erythematosus ;uctd-undifferentiated connective tissue disease; sjs-sjörgen's syndrome;ssc-systemic sclerosis; psa-psoriatic arthritis): PsA", 
        "disease group (nhs-normal human serum; sle-systemic lupus erythematosus ;uctd-undifferentiated connective tissue disease; sjs-sjörgen's syndrome;ssc-systemic sclerosis; psa-psoriatic arthritis): SSc"],
    2: ['age in years: 34', 'age in years: 31', 'age in years: 38', 'age in years: 30', 'age in years: 23', 'age in years: 43', 'age in years: 48', 'age in years: 28', 
        'age in years: 25', 'age in years: 68', 'age in years: 39', 'age in years: 49', 'age in years: 55', 'age in years: 61', 'age in years: 47', 'age in years: 70', 
        'age in years: 19', 'age in years: 53', 'age in years: 75', 'age in years: 35', 'age in years: 67', 'age in years: 42', 'age in years: 27', 'age in years: 57', 
        'age in years: 73', 'age in years: 80', 'age in years: 71', 'age in years: 51', 'age in years: 74', 'age in years: 24']
}

# Analyze 'Psoriatic_Arthritis' availability
if any('psa-psoriatic arthritis' in characteristic.lower() for characteristic in sample_char_dict[1]):
    trait_row = 1

# Ensure that "Psoriatic Arthritis" has distinct entries for study
if trait_row is not None:
    psa_samples_count = sum('psa-psoriatic arthritis' in characteristic.lower() for characteristic in sample_char_dict[1])
    if psa_samples_count == len(sample_char_dict[1]):
        trait_row = None  # Reset if no distinct entries

# Analyze 'age' availability
if any('age in years' in characteristic.lower() for characteristic in sample_char_dict[2]):
    age_row = 2

# Analyze 'gender' availability
if any('gender' in characteristic.lower() for characteristic in sample_char_dict[0]):
    gender_row = 0

# Define conversion functions
def convert_trait(value):
    val = value.split(':')[-1].strip().lower()
    if 'psa-psoriatic arthritis' in val:
        return 1
    elif 'nhs' in val or 'sle' in val or 'uctd' in val or 'sjs' in val or 'ssc' in val:
        return 0
    return None

def convert_age(value):
    try:
        return int(value.split(':')[-1].strip())
    except ValueError:
        return None

def convert_gender(value):
    val = value.split(':')[-1].strip().lower()
    if val == 'female':
        return 0
    elif val == 'male':
        return 1
    return None

# Save cohort information
save_cohort_info('GSE69371', './preprocessed/Psoriatic_Arthritis/cohort_info.json', is_gene_available, trait_row is not None)

# If clinical data for 'Psoriatic_Arthritis' is available, proceed with clinical features extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Psoriatic_Arthritis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Psoriatic_Arthritis/trait_data/GSE69371.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Psoriatic_Arthritis/cohort_info.json
