In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Osteoarthritis/GSE141934'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Split the matrix file into series-level background text and the per-sample
# clinical table, selecting lines by their leading prefix.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Transcriptional data of inflamatory arthritis T cells."
!Series_summary	"With a focus on rheumatoid arthritis (RA), we sought new insight into genetic mechanisms of adaptive immune dysregulation to help prioritise molecular pathways for targeting in this and related immune pathologies. Whole genome methylation and transcriptional data from isolated CD4+ T cells and B cells of >100 genotyped and phenotyped inflammatory arthritis patients, all of whom were naïve to immunomodulatory treatments, were obtained. Analysis integrated these comprehensive data with GWAS findings across IMDs and other publically available resources."
!Series_overall_design	"Suspected inflammatory arthritis patients of Northern European ancestry were recruited prior to treatment with immunomodulatory drugs. RA patients were classified using current, internationally accepted criteria, and matched with disease controls in respect of demographic and clinical characteristics. CD4

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: a row index stays None unless the matching characteristic is found below.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# 1. Gene Expression Data Availability
# Search the background text loaded in Step 1 for the phrase "transcriptional data",
# which suggests mRNA (gene expression). The comparison is case-insensitive because
# the series title spells it "Transcriptional data". The previous check tested a
# hard-coded excerpt of the summary that did not contain the phrase, so it always
# evaluated to False.
is_gene_available = "transcriptional data" in str(background_info).lower()

# 2. Variable Availability and Data Type Conversion
# Snapshot of the sample-characteristics rows of interest, keyed by row index.
# NOTE(review): presumably transcribed from sample_characteristics_dict printed in
# Step 1 -- confirm it still matches if the input data changes.
observed_characteristics = {
    1: ['gender: F', 'gender: M'],
    2: ['age: 50', 'age: 43', 'age: 66', 'age: 55', 'age: 52', 'age: 54', 'age: 63', 'age: 61', 'age: 58', 'age: 79', 'age: 69', 'age: 57', 'age: 46', 'age: 44', 'age: 59', 'age: 81', 'age: 60', 'age: 92', 'age: 45', 'age: 47', 'age: 27', 'age: 38', 'age: 51', 'age: 70', 'age: 56', 'age: 53', 'age: 74', 'age: 49', 'age: 31', 'age: 65'],
    6: ['working_diagnosis: Rheumatoid Arthritis', 'working_diagnosis: Psoriatic Arthritis', 'working_diagnosis: Reactive Arthritis', 'working_diagnosis: Crystal Arthritis', 'working_diagnosis: Osteoarthritis', 'working_diagnosis: Non-Inflammatory', 'working_diagnosis: Undifferentiated Inflammatory Arthritis', 'working_diagnosis: Other Inflammatory Arthritis', 'working_diagnosis: Enteropathic Arthritis', 'working_diagnosis: Undifferentiated Spondylo-Arthropathy', 'working_diagnosis: Unknown'],
}

# A row is only usable as a variable when it takes more than one distinct value.
# Define age_row by checking the dictionary
if len(observed_characteristics.get(2, [])) > 1:
    age_row = 2

# Define gender_row by checking the dictionary
if len(observed_characteristics.get(1, [])) > 1:
    gender_row = 1

# Define trait_row by checking the dictionary: the diagnosis row must vary and
# must actually contain Osteoarthritis cases.
diagnosis_values = observed_characteristics.get(6, [])
if len(diagnosis_values) > 1 and 'working_diagnosis: Osteoarthritis' in diagnosis_values:
    trait_row = 6

# 2.3 Data Type Conversion

# Define convert_trait function
def convert_trait(value):
    """Convert a 'working_diagnosis: <label>' cell into a binary Osteoarthritis flag.

    Args:
        value: Raw cell content, expected to look like 'working_diagnosis: Osteoarthritis'.

    Returns:
        1 when the label is 'Osteoarthritis', 0 when it is one of the other
        known diagnosis labels, and None when the cell is not a string, has no
        ':' separator, or carries an unrecognised label.
    """
    # Guard against missing cells (None/NaN) and malformed text instead of raising.
    if not isinstance(value, str) or ":" not in value:
        return None
    # Split on the first colon only so a label containing ':' is kept intact.
    val = value.split(":", 1)[1].strip()
    if val == 'Osteoarthritis':
        return 1
    # NOTE(review): 'Unknown' is counted as a non-Osteoarthritis sample here, as in
    # the original mapping -- confirm this is intended rather than treated as missing.
    other_diagnoses = (
        'Rheumatoid Arthritis',
        'Psoriatic Arthritis',
        'Reactive Arthritis',
        'Crystal Arthritis',
        'Non-Inflammatory',
        'Undifferentiated Inflammatory Arthritis',
        'Other Inflammatory Arthritis',
        'Enteropathic Arthritis',
        'Undifferentiated Spondylo-Arthropathy',
        'Unknown',
    )
    if val in other_diagnoses:
        return 0
    return None

# Define convert_age function
def convert_age(value):
    """Convert an 'age: <number>' cell into a float.

    Args:
        value: Raw cell content, expected to look like 'age: 50'.

    Returns:
        The age as a float, or None when the cell is not a string, has no ':'
        separator, or the text after the separator is not a number.
    """
    # Guard against missing cells (None/NaN) and malformed text instead of raising.
    if not isinstance(value, str) or ":" not in value:
        return None
    try:
        return float(value.split(":", 1)[1].strip())
    except ValueError:
        # Non-numeric payload such as 'age: NA'.
        return None

# Define convert_gender function
def convert_gender(value):
    """Convert a 'gender: <label>' cell into a binary code.

    Args:
        value: Raw cell content, expected to look like 'gender: F' or 'gender: M'.

    Returns:
        0 for female ('F' or 'female', any letter case), 1 for male ('M' or
        'male', any letter case), and None when the cell is not a string, has
        no ':' separator, or carries any other label.
    """
    # Guard against missing cells (None/NaN) and malformed text instead of raising.
    if not isinstance(value, str) or ":" not in value:
        return None
    # Normalise case so 'f'/'Female' are handled the same way as 'F'.
    val = value.split(":", 1)[1].strip().lower()
    if val in ('f', 'female'):
        return 0
    if val in ('m', 'male'):
        return 1
    return None

# 3. Save Metadata
save_cohort_info('GSE141934', './preprocessed/Osteoarthritis/cohort_info.json', is_gene_available, trait_row is not None)

# 4. Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Osteoarthritis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Osteoarthritis/trait_data/GSE141934.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4216498': [0.0, 50.0, 0.0], 'GSM4216499': [0.0, 43.0, 0.0], 'GSM4216500': [0.0, 66.0, 0.0], 'GSM4216501': [0.0, 55.0, 1.0], 'GSM4216502': [0.0, 52.0, 0.0], 'GSM4216503': [0.0, 54.0, 0.0], 'GSM4216504': [0.0, 63.0, 0.0], 'GSM4216505': [0.0, 61.0, 1.0], 'GSM4216506': [0.0, 58.0, 1.0], 'GSM4216507': [0.0, 79.0, 1.0], 'GSM4216508': [0.0, 69.0, 0.0], 'GSM4216509': [1.0, 57.0, 0.0], 'GSM4216510': [0.0, 46.0, 0.0], 'GSM4216511': [0.0, 44.0, 0.0], 'GSM4216512': [0.0, 46.0, 0.0], 'GSM4216513': [0.0, 63.0, 0.0], 'GSM4216514': [0.0, 59.0, 1.0], 'GSM4216515': [0.0, 81.0, 1.0], 'GSM4216516': [0.0, 60.0, 1.0], 'GSM4216517': [0.0, 92.0, 1.0], 'GSM4216518': [0.0, 45.0, 0.0], 'GSM4216519': [0.0, 47.0, 0.0], 'GSM4216520': [0.0, 27.0, 0.0], 'GSM4216521': [0.0, 58.0, 0.0], 'GSM4216522': [0.0, 57.0, 0.0], 'GSM4216523': [1.0, 38.0, 0.0], 'GSM4216524': [0.0, 45.0, 1.0], 'GSM4216525': [0.0, 51.0, 1.0], 'GSM4216526': [0.0, 70.0, 1.0], 'GSM4216527': [0.0, 57.0, 1.0], 'GSM4216528': [0.0, 56.0, 0.0], 'GSM421