In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Hypothyroidism/GSE224330'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Read the matrix file: lines starting with the "!Series_*" prefixes feed the
# background information, lines starting with the "!Sample_*" prefixes feed
# the clinical dataframe.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background information and the characteristics dictionary so they
# can be inspected before choosing the clinical rows.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression of monocytes from rheumatoid arthritis patients treated with bDMARDs and methotrexate."
!Series_summary	"It is well documented that patients affected by rheumatoid arthritis (RA) have distinct susceptibility to the different biologic Disease-Modifying AntiRheumatic Drugs (bDMARDs) available on the market, probably because of the many facets of the disease. Monocytes are deeply involved in the pathogenesis of RA and we therefore evaluated and compared the transcriptomic profile of monocytes isolated from patients on treatment with methotrexate alone or in combination with tocilizumab, anti-TNFalpha or abatacept, and from healthy donors. Differential expression analysis of whole-genome transcriptomics yielded a list of regulated genes suitable for functional annotation enrichment analysis. Specifically, abatacept, tocilizumab and anti-TNFalpha cohorts were separately compared with methotrexate using a rank-product-based statistical a

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Sample characteristics of this series, keyed by characteristics-row index.
# NOTE(review): presumably copied from the dictionary printed in Step 1 - confirm
# it still matches `sample_characteristics_dict` if the input files change.
sample_characteristics = {0: ['tissue: Isolated monocytes'], 1: ['age: 63y', 'age: 64y', 'age: 48y', 'age: 70y', 'age: 62y', 'age: 58y', 'age: 57y', 'age: 60y', 'age: 52y', 'age: 51y', 'age: 53y', 'age: 56y', 'age: 54y', 'age: 61y', 'age: 55y', 'age: 65y', 'age: 84y', 'age: 76y', 'age: 73y', 'age: 71y', 'age: 59y', 'age: 47y'], 2: ['gender: female', 'gender: male'], 3: ['comorbidity: hypothyroidism', 'comorbidity: none', 'comorbidity: osteoporosis', None, 'comorbidity: schizoaffective disorder', 'comorbidity: arthrosis']}


def _characteristic_values(entries):
    """Return the text after the 'key:' prefix of every string entry.

    Non-string entries (row 3 contains ``None``) are skipped, so callers never
    run a string operation on a missing value.
    """
    return [entry.split(":", 1)[-1].strip() for entry in entries if isinstance(entry, str)]


def _age_in_years(text):
    """Parse an age such as '63y' or '63' into an int; return None if it is not numeric."""
    if text[-1:].lower() == 'y':
        text = text[:-1]
    try:
        return int(text)
    except ValueError:
        return None


# The series title describes gene expression profiling of monocytes, so gene
# expression data is treated as available.
is_gene_available = True

# Trait: row 3 (comorbidity) is usable when at least one sample lists hypothyroidism.
comorbidity_values = _characteristic_values(sample_characteristics[3])
trait_row = 3 if any("hypothyroidism" in item.lower() for item in comorbidity_values) else None

# Age: row 1 is usable when it holds more than one distinct parseable age.
distinct_ages = {_age_in_years(item) for item in _characteristic_values(sample_characteristics[1])}
distinct_ages.discard(None)  # unparseable entries must not count as an extra age
age_row = 1 if len(distinct_ages) > 1 else None

# Gender: row 2 is usable when more than one distinct value is present.
distinct_genders = set(_characteristic_values(sample_characteristics[2]))
gender_row = 2 if len(distinct_genders) > 1 else None
# Define data conversion functions
def convert_trait(value):
    """Map a comorbidity characteristic to a binary hypothyroidism label.

    Args:
        value: Raw characteristic such as ``'comorbidity: hypothyroidism'``.
            Non-string inputs (e.g. ``None`` for samples without the field)
            are accepted.

    Returns:
        1 for hypothyroidism, 0 for ``'none'``, and ``None`` for anything
        else (non-string input or any other comorbidity).
    """
    if not isinstance(value, str):
        return None
    # Split on the first colon only and normalise case/whitespace so that
    # 'comorbidity:Hypothyroidism ' is recognised as well.
    label = value.split(":", 1)[-1].strip().lower()
    if label == 'hypothyroidism':
        return 1
    if label == 'none':
        return 0
    # NOTE(review): samples with a different comorbidity are treated as unknown
    # rather than as controls - confirm this is the intended definition.
    return None

def convert_age(value):
    """Convert an age characteristic such as ``'age: 63y'`` to an integer.

    Args:
        value: Raw characteristic string. The ``'age:'`` prefix and the
            trailing ``y``/``Y`` unit are both optional. Non-string inputs
            are accepted.

    Returns:
        The age in years as an ``int``, or ``None`` when the input is not a
        string or does not contain an integer.
    """
    if not isinstance(value, str):
        return None
    # Drop the 'key:' prefix whether or not a unit suffix follows, so that
    # 'age: 63' parses just like 'age: 63y'.
    text = value.split(":", 1)[-1].strip()
    # Slicing (rather than indexing) keeps an empty value from raising IndexError.
    if text[-1:].lower() == 'y':
        text = text[:-1].strip()
    try:
        return int(text)
    except ValueError:
        return None

def convert_gender(value):
    """Map a gender characteristic to a binary code.

    Args:
        value: Raw characteristic such as ``'gender: female'``. Non-string
            inputs are accepted.

    Returns:
        0 for female, 1 for male, and ``None`` for non-string input or any
        other value.
    """
    if not isinstance(value, str):
        return None
    # Split on the first colon only and normalise case/whitespace so that
    # 'gender:Male ' is recognised as well.
    label = value.split(":", 1)[-1].strip().lower()
    if label == 'female':
        return 0
    if label == 'male':
        return 1
    return None

# Save Metadata
# The output directory is created first so the write does not depend on a
# pre-existing folder layout (e.g. on a fresh checkout).
cohort_info_path = './preprocessed/Hypothyroidism/cohort_info.json'
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE224330', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical Feature Extraction if trait_row is not None
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Hypothyroidism', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Hypothyroidism/trait_data/GSE224330.csv'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM7019507': [1, 63, 0], 'GSM7019508': [0, 64, 1], 'GSM7019509': [0, 63, 0], 'GSM7019510': [0, 48, 0], 'GSM7019511': [None, 70, 1], 'GSM7019512': [None, 62, 1], 'GSM7019513': [None, 58, 1], 'GSM7019514': [None, 57, 1], 'GSM7019515': [None, 60, 0], 'GSM7019516': [None, 57, 0], 'GSM7019517': [None, 52, 0], 'GSM7019518': [None, 51, 0], 'GSM7019519': [None, 53, 0], 'GSM7019520': [None, 56, 0], 'GSM7019521': [None, 62, 1], 'GSM7019522': [0, 54, 0], 'GSM7019523': [0, 61, 0], 'GSM7019524': [None, 54, 0], 'GSM7019525': [0, 55, 1], 'GSM7019526': [0, 65, 0], 'GSM7019527': [0, 84, 0], 'GSM7019528': [None, 70, 0], 'GSM7019529': [None, 76, 0], 'GSM7019530': [None, 62, 0], 'GSM7019531': [0, 73, 1], 'GSM7019532': [0, 71, 0], 'GSM7019533': [0, 59, 0], 'GSM7019534': [0, 62, 1], 'GSM7019535': [0, 47, 0], 'GSM7019536': [None, 76, 0], 'GSM7019537': [0, 54, 0]}


### Step 3: Gene Data Extraction

In [4]:
# Load the gene data from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row IDs; the next step reviews what kind of identifier they are.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['A_19_P00315452', 'A_19_P00315492', 'A_19_P00315493', 'A_19_P00315502',
       'A_19_P00315506', 'A_19_P00315518', 'A_19_P00315519', 'A_19_P00315529',
       'A_19_P00315541', 'A_19_P00315543', 'A_19_P00315551', 'A_19_P00315581',
       'A_19_P00315584', 'A_19_P00315593', 'A_19_P00315603', 'A_19_P00315625',
       'A_19_P00315627', 'A_19_P00315631', 'A_19_P00315641', 'A_19_P00315647'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. 'A_19_P00315452') are not human gene
# symbols - they look like microarray probe identifiers - so they have to be
# mapped to gene symbols before the expression data can be used.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Get the gene annotation data from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Preview the annotation so the identifier and gene-symbol columns can be
# picked in the mapping step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['GE_BrightCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872', 'A_33_P3267760'], 'CONTROL_TYPE': ['pos', 'pos', 'FALSE', 'FALSE', 'FALSE'], 'REFSEQ': [nan, nan, nan, 'NM_001105533', nan], 'GB_ACC': [nan, nan, nan, 'NM_001105533', nan], 'LOCUSLINK_ID': [nan, nan, nan, 79974.0, 54880.0], 'GENE_SYMBOL': [nan, nan, nan, 'CPED1', 'BCOR'], 'GENE_NAME': [nan, nan, nan, 'cadherin-like and PC-esterase domain containing 1', 'BCL6 corepressor'], 'UNIGENE_ID': [nan, nan, nan, 'Hs.189652', nan], 'ENSEMBL_ID': [nan, nan, nan, nan, 'ENST00000378463'], 'ACCESSION_STRING': [nan, nan, nan, 'ref|NM_001105533|gb|AK025639|gb|BC030538|tc|THC2601673', 'ens|ENST00000378463'], 'CHROMOSOMAL_LOCATION': [nan, nan, 'unmapped', 'chr7:120901888-120901947', 'chrX:39909128-39909069'], 'CYTOBAND': [nan, nan, nan, 'hs|7q31.31', 'hs|Xp11.4'], 'DESCRIPTION': [nan, nan, nan, 'Homo sapiens cadherin-like and PC-esterase domain containing 1 (CPED1), transcript variant 2, mRNA [NM_001105533

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identifying the keys for identifiers and gene symbols
#    In the annotation preview, 'ID' holds probe identifiers of the same style as
#    the expression-matrix row IDs (e.g. 'A_21_P0014386'), and 'GENE_SYMBOL'
#    holds the gene symbols (e.g. 'CPED1', 'BCOR').
identifier_key = 'ID'
gene_symbol_key = 'GENE_SYMBOL'

# 2. Get the dataframe storing the mapping between probe IDs and genes using the 'get_gene_mapping' function from the library.
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library, and name the resulting gene expression dataframe "gene_data".
#    NOTE(review): gene_data is rebound to the mapped result, so re-running this
#    cell without first re-running Step 3 would apply the mapping to data that
#    has already been mapped.
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library,
#    then save it. The target directory is created first because DataFrame.to_csv
#    does not create missing parent directories.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Hypothyroidism/gene_data/GSE224330.csv'
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# `selected_clinical_data` is only defined in Step 2 when a trait row was found,
# so the merge and everything that depends on it is guarded by the same condition.
if trait_row is not None:
    # 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
    merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

    # 3. Determine whether the trait and any demographic attributes in the data are severely biased, and remove biased attributes.
    trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Hypothyroidism')

    # 4. Save the cohort information.
    json_path = './preprocessed/Hypothyroidism/cohort_info.json'
    save_cohort_info('GSE224330', json_path, True, True, trait_biased, merged_data)

    # 5. If the trait is not severely biased, save the merged data to a csv file.
    #    Its parent directory is an ancestor of the gene_data directory created above.
    if not trait_biased:
        csv_path = './preprocessed/Hypothyroidism/GSE224330.csv'
        unbiased_merged_data.to_csv(csv_path)


For the feature 'Hypothyroidism', the least common label is '1.0' with 1 occurrences. This represents 6.67% of the dataset.
The distribution of the feature 'Hypothyroidism' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 54.5
  50% (Median): 62.0
  75%: 64.5
Min: 47.0
Max: 84.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 4 occurrences. This represents 26.67% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

