In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Set your preferred name
USER = "Arthur"
# Set the data and output directories
DATA_ROOT = 'C:/Users/arthu/Downloads'
OUTPUT_ROOT = 'C:/Users/arthu/OneDrive/Documents/GitHub/output2'
TRAIT = 'Werner-Syndrome'

OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Gene symbol normalization may take 1-2 minutes. You may set it to False for debugging.
NORMALIZE_GENE = True

utils.py has been loaded


2. Data preprocessing and selection

The GEO dataset

In [2]:
dataset = 'GEO'
trait_subdir = "Werner-Syndrome"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE48761', 'GSE62877']

In [23]:
cohort = accession_num = "GSE62877"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('C:/Users/arthu/Downloads\\GEO\\Werner-Syndrome\\GSE62877\\GSE62877_family.soft.gz',
 'C:/Users/arthu/Downloads\\GEO\\Werner-Syndrome\\GSE62877\\GSE62877-GPL14592_series_matrix.txt.gz')

In [24]:
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']

background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"Werner syndrome WRN helicase alters gene expression in a G-quadruplex DNA-dependent manner to antagonize a pro-senescence gene expression program"
!Series_summary	"Werner syndrome (WS) is a human adult progeroid syndrome caused by loss-of-function mutations in the WRN RECQ helicase gene. We analyzed mRNA and miRNA expression in fibroblasts from WS patients and in fibroblasts depleted of WRN protein in order to determine the role of WRN in transcription regulation, and to identify genes and miRNAs that might drive WS disease pathogenesis. Genes altered in WS cells participate in cellular growth, proliferation and survival; in tRNA charging and in oncogenic signaling; and in connective tissue and developmental networks. Genes down-regulated in WS cells were highly enriched in Gquadruplex (G4) DNA motifs, indicating G4 motifs are physiologic substrates for WRN. In contrast, there was a remarkable, coordinate up-regulation of nearly all of the cytoplasmic tRNA synthetases an

In [45]:
clinical_data.head()

Unnamed: 0,!Sample_geo_accession,GSM1317006,GSM1317010,GSM1317011,GSM1317013,GSM1317016,GSM1317018,GSM1317021,GSM1317022,GSM1317024,GSM1535467,GSM1535468,GSM1535469,GSM1535470,GSM1535471,GSM1535472
0,!Sample_characteristics_ch1,cell line: GM00730,cell line: GM01651,cell line: GM01948,cell line: GM02185,cell line: GM02674,cell line: GM03377,cell line: GM03651,cell line: GM04260,cell line: GM07532,tissue: primary fibroblast,tissue: primary fibroblast,tissue: primary fibroblast,tissue: primary fibroblast,tissue: primary fibroblast,tissue: primary fibroblast
1,!Sample_characteristics_ch1,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,cell type: primary fibroblast,age: 60,age: 13,age: 37,age: 30,age: 36,age: 25
2,!Sample_characteristics_ch1,blm mutation: Wildtype,blm mutation: homozygous (1544insA of RECQL3 g...,blm mutation: Wildtype,blm mutation: Wildtype,blm mutation: homozygous (6-bp del/7-bp ins] a...,blm mutation: homozyguous (2293delC of RECQL3 ...,blm mutation: compound heterozygous (3261delT ...,blm mutation: compound heterozygous ([2015A>G]...,blm mutation: Q700X missen mutation in BLM pro...,gender: M,gender: F,gender: M,gender: F,gender: M,gender: M
3,!Sample_characteristics_ch1,clinical features: Not clinically affected,clinical features: Bloom syndrome,clinical features: Not clinically affected,clinical features: Not clinically affected,clinical features: Bloom syndrome,clinical features: Bloom syndrome,clinical features: Bloom syndrome,clinical features: Bloom syndrome,clinical features: Bloom syndrome,,,,,,
4,!Sample_characteristics_ch1,age: 45,age: 13,age: 27,age: 36,age: 29,age: 19,age: 25,age: 60,age: 16,,,,,,


In [26]:
clinical_data_unique = get_unique_values_by_row(clinical_data)
clinical_data_unique

{0: ['cell line: GM00730',
  'cell line: GM01651',
  'cell line: GM01948',
  'cell line: GM02185',
  'cell line: GM02674',
  'cell line: GM03377',
  'cell line: GM03651',
  'cell line: GM04260',
  'cell line: GM07532',
  'tissue: primary fibroblast'],
 1: ['cell type: primary fibroblast',
  'age: 60',
  'age: 13',
  'age: 37',
  'age: 30',
  'age: 36',
  'age: 25'],
 2: ['blm mutation: Wildtype',
  'blm mutation: homozygous (1544insA of RECQL3 gene)',
  'blm mutation: homozygous (6-bp del/7-bp ins] at nucleotide 2,281 of RECQL3 gene)',
  'blm mutation: homozyguous (2293delC of RECQL3 gene)',
  'blm mutation: compound heterozygous (3261delT and 2281delT of RECQL3 gene',
  'blm mutation: compound heterozygous ([2015A>G] and [IVS5-2A>G] of RECQL3 gene)',
  'blm mutation: Q700X missen mutation in BLM protein',
  'gender: M',
  'gender: F'],
 3: ['clinical features: Not clinically affected',
  'clinical features: Bloom syndrome',
  nan],
 4: ['age: 45',
  'age: 13',
  'age: 27',
  'age: 36'

In [27]:

f'''As a biomedical research team, we are selecting datasets to study the association between the human trait \'{TRAIT}\' and genetic factors, optionally considering the influence of age and gender. After searching the GEO database and parsing the matrix file of a series, we obtained background information and sample characteristics data. We will provide textual information about the dataset background, and a Python dictionary storing a list of unique values for each field of the sample characteristics data. Please carefully review the provided information and answer the following questions about this dataset:
1. Does this dataset contain gene expression data? (Note: Pure miRNA data is not suitable.)
2. For each of the traits \'{TRAIT}\', 'age', and 'gender', please address these points:
   (1) Is there human data available for this trait?
   (2) If so, identify the key in the sample characteristics dictionary where unique values of this trait is recorded. The key is an integer. The trait information might be explicitly recorded, or can be inferred from the field with some biomedical knowledge or understanding about the data collection process.
   (3) Choose an appropriate data type (either 'continuous' or 'binary') for each trait. Write a Python function to convert any given value of the trait to this data type. The function should handle inference about the trait value and convert unknown values to None.
   Name the functions 'convert_trait', 'convert_age', and 'convert_gender', respectively.

Background information about the dataset:
{background_info}

Sample characteristics dictionary (from "!Sample_characteristics_ch1", converted to a Python dictionary that stores the unique values for each field):
{clinical_data_unique}
'''
     

'As a biomedical research team, we are selecting datasets to study the association between the human trait \'Werner-Syndrome\' and genetic factors, optionally considering the influence of age and gender. After searching the GEO database and parsing the matrix file of a series, we obtained background information and sample characteristics data. We will provide textual information about the dataset background, and a Python dictionary storing a list of unique values for each field of the sample characteristics data. Please carefully review the provided information and answer the following questions about this dataset:\n1. Does this dataset contain gene expression data? (Note: Pure miRNA data is not suitable.)\n2. For each of the traits \'Werner-Syndrome\', \'age\', and \'gender\', please address these points:\n   (1) Is there human data available for this trait?\n   (2) If so, identify the key in the sample characteristics dictionary where unique values of this trait is recorded. The key 

In [47]:
is_gene_availabe = True
trait_row = 3
age_row =1
gender_row = 2

trait_type = 'binary'

In [48]:
is_available = is_gene_availabe and (trait_row is not None)
if not is_available:
    save_cohort_info(cohort, JSON_PATH, is_available)
    print("This cohort is not usable. Please skip the following steps and jump to the next accession number.")

In [55]:
# Verify and use the functions generated by GPT

def convert_trait(value):
    if isinstance(value, str):
        if 'Werner syndrome' in value:
            return True
        elif 'Bloom syndrome' in value:
            return True
    return False




def convert_age(value):
    try:
        age = int(value.split(': ')[1])  # Extracting the age value
        return age
    except (IndexError, ValueError):
        return None


def convert_gender(value):
    if value == 'gender: M':
        return 'Male'
    elif value == 'gender: F':
        return 'Female'
    else:
        return None  # Convert unknown values to None



    
     

In [56]:
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()

Unnamed: 0,GSM1317006,GSM1317010,GSM1317011,GSM1317013,GSM1317016,GSM1317018,GSM1317021,GSM1317022,GSM1317024,GSM1535467,GSM1535468,GSM1535469,GSM1535470,GSM1535471,GSM1535472
Werner-Syndrome,False,True,False,False,True,True,True,True,True,False,False,False,False,False,False
Age,,,,,,,,,,60,13,37,30,36,25
Gender,,,,,,,,,,Male,Female,Male,Female,Male,Male


In [57]:
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM1317006,GSM1317010,GSM1317011,GSM1317013,GSM1317016,GSM1317018,GSM1317021,GSM1317022,GSM1317024,GSM1535467,GSM1535468,GSM1535469,GSM1535470,GSM1535471,GSM1535472
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,25518,31913,19889,43757,17311,28011,26908,29231,19883,10938,35604,40101,36023,27038,24668
2,25467,29581,20771,42873,19908,26325,25309,26724,21199,10949,37349,43451,34213,27103,25537
3,26,215,334,195,86,74,123,65,141,73,392,314,303,255,271
4,94,239,318,175,79,72,138,132,157,144,245,287,326,124,177
5,389,128,273,243,215,363,319,106,117,93,312,495,303,191,161


In [58]:
gene_row_ids = genetic_data.index[:20].tolist()
gene_row_ids

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']

In [59]:

f'''
Below are the row headers of a gene expression dataset in GEO. Based on your biomedical knowledge, are they human gene symbols, or are they some other identifiers that need to be mapped to gene symbols? Your answer should be concluded by starting a new line and strictly following this format:
requires_gene_mapping = (True or False)

Row headers:
{gene_row_ids}
'''

"\nBelow are the row headers of a gene expression dataset in GEO. Based on your biomedical knowledge, are they human gene symbols, or are they some other identifiers that need to be mapped to gene symbols? Your answer should be concluded by starting a new line and strictly following this format:\nrequires_gene_mapping = (True or False)\n\nRow headers:\n['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']\n"

In [60]:
requires_gene_mapping = False

In [61]:
if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

In [62]:
if requires_gene_mapping:
    print(f'''
    As a biomedical research team, we are analyzing a gene expression dataset, and find that its row headers are some identifiers related to genes:
    {gene_row_ids}
    To get the mapping from those identifiers to actual gene symbols, we extracted the gene annotation data from a series in the GEO database, and saved it to a Python dictionary. Please read the dictionary, and decide which key stores the identifiers, and which key stores the gene symbols. Please strictly follow this format in your answer:
    identifier_key = 'key_name1'
    gene_symbol_key = 'key_name2'

    Gene annotation dictionary:
    {gene_annotation_summary}
    ''')

In [20]:
if requires_gene_mapping:
    identifier_key = 'ID'
    gene_symbol_key = 'UCSC_RefGene_Name'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [63]:
if NORMALIZE_GENE:
    genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [64]:
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
# The preprocessing runs through, which means is_available should be True
is_available = True

In [65]:
print(f"The merged dataset contains {len(merged_data)} samples.")

The merged dataset contains 6 samples.


In [66]:
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

For the feature 'Werner-Syndrome', the least common label is 'False' with 6 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Werner-Syndrome' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 26.25
  50% (Median): 33.0
  75%: 36.75
Min: 13
Max: 60
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is 'Female' with 2 occurrences. This represents 33.33% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



True

In [67]:
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)

In [68]:
merged_data.head()
if not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)