In [1]:
import pandas as pd
import os
import re
import json
from typing import List, Optional

In [2]:
def normalize_trait(trait):
    """Turn a free-text trait name into an identifier-style key.

    Runs of whitespace are collapsed into single underscores (leading and
    trailing whitespace is dropped), and apostrophes are removed.
    """
    underscored = '_'.join(trait.split())
    return underscored.replace("'", "")

def normalize_gene_symbols(gene_symbols: List[str]) -> List[Optional[str]]:
    """Map each gene symbol to its normalized form via the synonym table.

    The synonym table is read from ``./metadata/gene_synonym.json`` on every
    call. Symbols that are not keys of that table come back as ``None``; the
    output list has the same length and order as ``gene_symbols``.
    """
    # NOTE(review): other cells in this notebook read from "../metadata/";
    # confirm that the "./metadata/" path here is intentional.
    with open("./metadata/gene_synonym.json", "r") as synonym_file:
        canonical_by_alias = json.loads(synonym_file.read())

    normalized = []
    for symbol in gene_symbols:
        normalized.append(canonical_by_alias.get(symbol))
    return normalized

## Extract genes from NCBI

## Get trait-condition pairs and get gene IoUs

In [3]:
# Load the per-trait task metadata and keep only each trait's gene list.
with open("../metadata/task_info.json", "r") as f:
    info = json.loads(f.read())

# trait name -> list of related genes
t2g = {name: entry['related_genes'] for name, entry in info.items()}
# Alphabetically sorted trait names (the keys of t2g).
traits = sorted(t2g)

In [4]:
# Sanity check: how many traits are currently held in `traits`.
len(traits)

132

In [5]:
# Read the task table and collect the normalized names of every trait whose
# 'Type' is the cancer/oncology category.
trait_df = pd.read_csv('../download/new_task.csv')
all_cancers = list(
    map(
        normalize_trait,
        trait_df.loc[
            trait_df['Type'].eq('1. Cancer and Oncology-Related Disorders'),
            'Trait',
        ].tolist(),
    )
)

In [6]:
"""
all_cancers =
['Breast_Cancer', 'Kidney_Papillary_Cell_Carcinoma', 'Kidney_Clear_Cell_Carcinoma', 'Head_and_Neck_Cancer', 
'Glioblastoma', 'Esophageal_Cancer', 'Endometrioid_Cancer', 'Bladder_Cancer', 'Retinoblastoma', 'Bile_Duct_Cancer', 
'Adrenocortical_Cancer', 'Acute_Myeloid_Leukemia', 'X-Linked_Lymphoproliferative_Syndrome', 'Von_Hippel_Lindau', 
'Liver_Cancer', 'Lower_Grade_Glioma', 'Large_B-cell_Lymphoma', 'Mesothelioma', 'Lung_Cancer', 
'Uterine_Corpus_Endometrial_Carcinoma', 'Uterine_Carcinosarcoma', 'Thymoma', 'Thyroid_Cancer', 'Melanoma', 
'Stomach_Cancer', 'Sarcoma', 'Testicular_Cancer', 'Rectal_Cancer', 'Prostate_Cancer', 'Pheochromocytoma_and_Paraganglioma', 
'Pancreatic_Cancer', 'Ovarian_Cancer', 'Ocular_Melanomas', 'Li-Fraumeni_Syndrome', 'Cervical_Cancer', 
'Colon_and_Rectal_Cancer', 'Kidney_Chromophobe', 'lower_grade_glioma_and_glioblastoma']
len(all_cancers) = 38
"""
# Show the count on the first line and the full list on the second.
print(len(all_cancers), all_cancers, sep="\n")

38
['Breast_Cancer', 'Kidney_Papillary_Cell_Carcinoma', 'Kidney_Clear_Cell_Carcinoma', 'Head_and_Neck_Cancer', 'Glioblastoma', 'Esophageal_Cancer', 'Endometrioid_Cancer', 'Bladder_Cancer', 'Retinoblastoma', 'Bile_Duct_Cancer', 'Adrenocortical_Cancer', 'Acute_Myeloid_Leukemia', 'X-Linked_Lymphoproliferative_Syndrome', 'Von_Hippel_Lindau', 'Liver_Cancer', 'Lower_Grade_Glioma', 'Large_B-cell_Lymphoma', 'Mesothelioma', 'Lung_Cancer', 'Uterine_Corpus_Endometrial_Carcinoma', 'Uterine_Carcinosarcoma', 'Thymoma', 'Thyroid_Cancer', 'Melanoma', 'Stomach_Cancer', 'Sarcoma', 'Testicular_Cancer', 'Rectal_Cancer', 'Prostate_Cancer', 'Pheochromocytoma_and_Paraganglioma', 'Pancreatic_Cancer', 'Ovarian_Cancer', 'Ocular_Melanomas', 'Li-Fraumeni_Syndrome', 'Cervical_Cancer', 'Colon_and_Rectal_Cancer', 'Kidney_Chromophobe', 'lower_grade_glioma_and_glioblastoma']


In [7]:
# Traits (in `traits` order) that are cancer-category entries, leaving out the
# two syndromes that are explicitly excluded here.
cancer_traits = [
    c
    for c in traits
    if c in all_cancers
    and c not in ('Von_Hippel_Lindau', 'X-Linked_Lymphoproliferative_Syndrome')
]
print(f"{len(cancer_traits)}\n{cancer_traits}")

35
['Acute_Myeloid_Leukemia', 'Adrenocortical_Cancer', 'Bile_Duct_Cancer', 'Bladder_Cancer', 'Breast_Cancer', 'Cervical_Cancer', 'Colon_and_Rectal_Cancer', 'Endometrioid_Cancer', 'Esophageal_Cancer', 'Glioblastoma', 'Head_and_Neck_Cancer', 'Kidney_Chromophobe', 'Kidney_Clear_Cell_Carcinoma', 'Kidney_Papillary_Cell_Carcinoma', 'Large_B-cell_Lymphoma', 'Liver_Cancer', 'Lower_Grade_Glioma', 'Lung_Cancer', 'Melanoma', 'Mesothelioma', 'Ocular_Melanomas', 'Ovarian_Cancer', 'Pancreatic_Cancer', 'Pheochromocytoma_and_Paraganglioma', 'Prostate_Cancer', 'Rectal_Cancer', 'Retinoblastoma', 'Sarcoma', 'Stomach_Cancer', 'Testicular_Cancer', 'Thymoma', 'Thyroid_Cancer', 'Uterine_Carcinosarcoma', 'Uterine_Corpus_Endometrial_Carcinoma', 'lower_grade_glioma_and_glioblastoma']


In [8]:
# Build the (Trait, Condition, IoU) table for every ordered pair of distinct
# traits, plus the two demographic conditions 'Age' and 'Gender'.
#
# IoU is |genes(t) & genes(s)| / |genes(t) | genes(s)|. Pairs whose union is
# empty are skipped. 'Age' and 'Gender' have no gene list, so they are given a
# fixed IoU of 1.0.

# Convert each gene list to a set once, instead of rebuilding the same sets
# for every pair inside the O(n^2) loop.
gene_sets = {name: set(genes) for name, genes in t2g.items()}
candidate_conditions = traits + ['Age', 'Gender']

data = []
for t in traits:
    for s in candidate_conditions:
        if t == s:
            continue
        if s in ('Age', 'Gender'):
            iou = 1.0
        else:
            union = gene_sets[t] | gene_sets[s]
            if not union:
                continue
            iou = len(gene_sets[t] & gene_sets[s]) / len(union)
        data.append({'Trait': t, 'Condition': s, 'IoU': iou})
rel = pd.DataFrame(data)

In [9]:
# Display the pairwise Trait/Condition/IoU table built in the previous cell.
rel

Unnamed: 0,Trait,Condition,IoU
0,Acute_Myeloid_Leukemia,Adrenocortical_Cancer,0.018519
1,Acute_Myeloid_Leukemia,Age-Related_Macular_Degeneration,0.146132
2,Acute_Myeloid_Leukemia,Alcohol_Flush_Reaction,0.000000
3,Acute_Myeloid_Leukemia,Allergies,0.038462
4,Acute_Myeloid_Leukemia,Alopecia,0.095890
...,...,...,...
17521,lower_grade_glioma_and_glioblastoma,Von_Willebrand_Disease,0.000000
17522,lower_grade_glioma_and_glioblastoma,Werner_Syndrome,0.000000
17523,lower_grade_glioma_and_glioblastoma,X-Linked_Lymphoproliferative_Syndrome,0.000000
17524,lower_grade_glioma_and_glioblastoma,Age,1.000000


In [10]:
# TODO: revisit and revise these lists later.

# Names that are dropped whenever they appear in the 'Trait' column.
condition_only = [
    'Vitamin_D_Levels',
    'LDL_Cholesterol_Levels',
]

# Sex-specific traits.
male_traits = [
    'Prostate_Cancer',
    'Testicular_Cancer',
]
female_traits = [
    'Cervical_Cancer',
    'Endometriosis',
    'Endometrioid_Cancer',
    'Uterine_Carcinosarcoma',
    'Uterine_Corpus_Endometrial_Carcinoma',
    'Ovarian_Cancer',
    'Polycystic_Ovary_Syndrome',
]
gender_traits = [*male_traits, *female_traits]

# Conditions that are always kept as candidates, regardless of IoU.
condition_all = [
    'Obesity',
    'Hypertension',
    'Type_2_Diabetes',
]

In [11]:
# Every sex-specific trait must be one of the known traits.
assert set(gender_traits) <= set(traits)

In [13]:
# Keep a row unless it pairs a sex-specific trait with the 'Gender' condition,
# then drop rows whose trait is one of the condition-only names.
rel = rel.loc[~rel['Trait'].isin(gender_traits) | rel['Condition'].ne('Gender')]
rel = rel.loc[~rel['Trait'].isin(condition_only)]
rel

Unnamed: 0,Trait,Condition,IoU
0,Acute_Myeloid_Leukemia,Adrenocortical_Cancer,0.018519
1,Acute_Myeloid_Leukemia,Age-Related_Macular_Degeneration,0.146132
2,Acute_Myeloid_Leukemia,Alcohol_Flush_Reaction,0.000000
3,Acute_Myeloid_Leukemia,Allergies,0.038462
4,Acute_Myeloid_Leukemia,Alopecia,0.095890
...,...,...,...
17521,lower_grade_glioma_and_glioblastoma,Von_Willebrand_Disease,0.000000
17522,lower_grade_glioma_and_glioblastoma,Werner_Syndrome,0.000000
17523,lower_grade_glioma_and_glioblastoma,X-Linked_Lymphoproliferative_Syndrome,0.000000
17524,lower_grade_glioma_and_glioblastoma,Age,1.000000


In [14]:
# Remove pairs that mix a male-specific trait with a female-specific condition,
# and vice versa.
rel = rel.loc[~rel['Trait'].isin(male_traits) | ~rel['Condition'].isin(female_traits)]
rel = rel.loc[~rel['Trait'].isin(female_traits) | ~rel['Condition'].isin(male_traits)]
rel

Unnamed: 0,Trait,Condition,IoU
0,Acute_Myeloid_Leukemia,Adrenocortical_Cancer,0.018519
1,Acute_Myeloid_Leukemia,Age-Related_Macular_Degeneration,0.146132
2,Acute_Myeloid_Leukemia,Alcohol_Flush_Reaction,0.000000
3,Acute_Myeloid_Leukemia,Allergies,0.038462
4,Acute_Myeloid_Leukemia,Alopecia,0.095890
...,...,...,...
17521,lower_grade_glioma_and_glioblastoma,Von_Willebrand_Disease,0.000000
17522,lower_grade_glioma_and_glioblastoma,Werner_Syndrome,0.000000
17523,lower_grade_glioma_and_glioblastoma,X-Linked_Lymphoproliferative_Syndrome,0.000000
17524,lower_grade_glioma_and_glioblastoma,Age,1.000000


In [15]:
# Order pairs from highest to lowest IoU and renumber the rows from 0.
rel = rel.sort_values(by='IoU', ascending=False).reset_index(drop=True)
rel

Unnamed: 0,Trait,Condition,IoU
0,Type_2_Diabetes,Gender,1.0
1,Von_Willebrand_Disease,Age,1.0
2,Von_Willebrand_Disease,Gender,1.0
3,Von_Hippel_Lindau,Gender,1.0
4,Von_Hippel_Lindau,Age,1.0
...,...,...,...
17218,Kidney_Clear_Cell_Carcinoma,Rectal_Cancer,0.0
17219,Kidney_Clear_Cell_Carcinoma,Psoriatic_Arthritis,0.0
17220,Post-Traumatic_Stress_Disorder,Aniridia,0.0
17221,Post-Traumatic_Stress_Disorder,Angelman_Syndrome,0.0


In [16]:
# Keep pairs with IoU of at least 0.5, plus every pair whose condition is one
# of the always-included conditions.
selected = rel.loc[rel['IoU'].ge(0.5) | rel['Condition'].isin(condition_all)]
selected

Unnamed: 0,Trait,Condition,IoU
0,Type_2_Diabetes,Gender,1.0
1,Von_Willebrand_Disease,Age,1.0
2,Von_Willebrand_Disease,Gender,1.0
3,Von_Hippel_Lindau,Gender,1.0
4,Von_Hippel_Lindau,Age,1.0
...,...,...,...
16961,Kidney_Chromophobe,Obesity,0.0
17003,Kidney_Chromophobe,Type_2_Diabetes,0.0
17083,Kidney_Clear_Cell_Carcinoma,Type_2_Diabetes,0.0
17099,Multiple_Endocrine_Neoplasia_Type_2,Type_2_Diabetes,0.0


In [17]:
# Selected pairs in which both the trait and the condition are cancer traits.
selected[selected[['Trait', 'Condition']].isin(cancer_traits).all(axis=1)]

Unnamed: 0,Trait,Condition,IoU
251,Lung_Cancer,Breast_Cancer,0.851852
252,Breast_Cancer,Lung_Cancer,0.851852
253,Prostate_Cancer,Lung_Cancer,0.801802
254,Lung_Cancer,Prostate_Cancer,0.801802
255,Melanoma,Ovarian_Cancer,0.785714
...,...,...,...
500,Thyroid_Cancer,Bladder_Cancer,0.503759
501,Acute_Myeloid_Leukemia,Bladder_Cancer,0.503759
502,Acute_Myeloid_Leukemia,Glioblastoma,0.503759
505,Bladder_Cancer,Acute_Myeloid_Leukemia,0.503759


In [18]:
# Please help me write the code to aggregate the conditions corresponding to each trait in the 'selected' DataFrame into a list.
# Originally, t2g is a dictionary with trait names as keys and a list of related genes as values. Now, please turn it
# into a new dictionary with the same traits as keys, where each value is a dictionary with key 'related_genes' storing the original
# list of genes, and key 'conditions' storing the list of conditions corresponding to the trait.
# DO NOT save to any file yet.

In [19]:
# For every trait, bundle its original gene list with the distinct conditions
# paired with it in `selected` (in order of first appearance). A trait with no
# rows in `selected` gets an empty conditions list.
new_task_info = {
    trait: {
        'related_genes': genes,
        'conditions': selected.loc[selected['Trait'].eq(trait), 'Condition'].unique().tolist(),
    }
    for trait, genes in t2g.items()
}

In [20]:
# Spot-check one entry of the combined genes/conditions structure.
new_task_info['Acute_Myeloid_Leukemia']

{'related_genes': ['TP53',
  'EGFR',
  'TNF',
  'IL6',
  'VEGFA',
  'TGFB1',
  'MTHFR',
  'HIF1A',
  'ERBB2',
  'ESR1',
  'IL10',
  'APP',
  'STAT3',
  'BRCA1',
  'ACE',
  'KRAS',
  'VDR',
  'MMP9',
  'CD274',
  'CRP',
  'ADIPOQ',
  'AKT1',
  'ABCB1',
  'NFKB1',
  'IL1B',
  'CTNNB1',
  'PTEN',
  'CDKN2A',
  'TLR4',
  'PTGS2',
  'TERT',
  'MYC',
  'CXCL8',
  'MTOR',
  'PPARG',
  'CDH1',
  'IGF1',
  'HLA-B',
  'LEP',
  'BCL2',
  'BRCA2',
  'CXCR4',
  'NFE2L2',
  'JAK2',
  'MDM2',
  'GSTM1',
  'IL17A',
  'MMP2',
  'SIRT1',
  'CCND1',
  'MIR21',
  'CCL2',
  'PIK3CA',
  'HMGB1',
  'CDKN1A',
  'NPPB',
  'CTLA4',
  'CD44',
  'TLR2',
  'BIRC5',
  'NOTCH1',
  'EZH2',
  'ATM',
  'GSTT1',
  'MET',
  'GSTP1',
  'CXCL12',
  'NR3C1',
  'PDCD1',
  'YAP1',
  'SPP1',
  'NLRP3',
  'HMOX1',
  'KIT',
  'RELA',
  'PARP1',
  'XRCC1',
  'GSK3B',
  'HLA-G',
  'FTO',
  'FAS',
  'IGF1R',
  'MAPK14',
  'ITGB1',
  'MUC1',
  'LCN2',
  'MLH1',
  'NOD2',
  'ITGB3',
  'ABCG2',
  'IDH1',
  'MIR146A',
  'CYP1A1',
  'HB

In [21]:
# Persist the combined structure. This overwrites the same file that the
# notebook loads `info` from, so serialize to a string first: if serialization
# fails, the existing file is left untouched instead of being truncated by
# open(..., "w").
serialized_task_info = json.dumps(new_task_info, indent=4)
with open("../metadata/task_info.json", "w") as f:
    f.write(serialized_task_info)

In [22]:
# Read the file back to verify what was written.
with open("../metadata/task_info.json", "r") as f:
    new_task_info2 = json.loads(f.read())

In [23]:
# Round-trip check: the reloaded JSON should equal the in-memory dictionary.
new_task_info == new_task_info2

True

In [None]:
# Print one full entry (genes and conditions) as plain text.
print(new_task_info['Acute_Myeloid_Leukemia'])

In [None]:
# Alternative selection: keep the first `thres` rows of the IoU-sorted table
# (by index label), plus every pair whose condition is always included.
thres = 795  # tried out
selected = rel.loc[rel['Condition'].isin(condition_all) | (rel.index < thres)]
selected

In [None]:
# Last ten rows of the current selection.
selected.tail(10)

In [None]:
# Selected pairs in which both the trait and the condition are cancer traits.
selected[selected[['Trait', 'Condition']].isin(cancer_traits).all(axis=1)]

In [None]:
# Selected pairs whose condition is 'Age'.
selected.loc[selected['Condition'].eq('Age')]

In [None]:
# Selected pairs whose condition is 'Gender'.
selected.loc[selected['Condition'].eq('Gender')]

In [None]:
# First 1000 rows of `rel` by position.
top1k = rel.head(1000)
top1k

In [None]:
#rel[rel['Trait'] == 'Longevity and Aging' & rel['Condition'] == 'Age']

In [None]:
#rel[(rel['Trait'] == 'Longevity and Aging') & (rel['Condition'] == 'Age')]

In [None]:
# Top-1000 pairs conditioned on 'Obesity'.
top1k.loc[top1k['Condition'].eq('Obesity')]

In [None]:
# Top-1000 pairs conditioned on 'Hypertension'.
top1k.loc[top1k['Condition'].eq('Hypertension')]

In [None]:
# Top-1000 pairs conditioned on 'Bipolar disorder'.
# NOTE(review): other condition names in this notebook use underscores
# (e.g. 'Type_2_Diabetes'); confirm this space-separated spelling matches.
top1k.loc[top1k['Condition'].eq('Bipolar disorder')]

In [None]:
# Top-1000 pairs conditioned on 'Anxiety disorder'.
# NOTE(review): other condition names in this notebook use underscores
# (e.g. 'Type_2_Diabetes'); confirm this space-separated spelling matches.
top1k.loc[top1k['Condition'].eq('Anxiety disorder')]

In [None]:
# Display the top-1000 slice again.
top1k

In [None]:
# Collect the cancer-category trait names from the task table.
# The table is bound to `task_table` rather than `all`, so the builtin `all()`
# (used by the gender-traits assertion cell) is not shadowed in the kernel.
# NOTE(review): unlike the earlier cell that builds `all_cancers`, the names
# here are not passed through normalize_trait and the CSV path is relative to
# the working directory — confirm which version is intended.
task_table = pd.read_csv('new_task.csv')
all_cancers = task_table.loc[task_table["Type"] == "1. Cancer and Oncology-Related Disorders", 'Trait'].tolist()
all_cancers

In [None]:
# Traits that also appear in `all_cancers`, in `traits` order. This version
# applies no exclusions, unlike the earlier `cancer_traits` cell, which skips
# 'Von_Hippel_Lindau' and 'X-Linked_Lymphoproliferative_Syndrome'.
cancer_traits = [c for c in traits if c in all_cancers]
len(cancer_traits)

In [None]:
# Print the full list of cancer traits as plain text.
print(cancer_traits)

In [None]:
# Top-1000 pairs where the trait or the condition (or both) is a cancer trait.
top1k[top1k[['Trait', 'Condition']].isin(cancer_traits).any(axis=1)]

In [None]:
# Top-1000 pairs where both the trait and the condition are cancer traits.
top1k[top1k[['Trait', 'Condition']].isin(cancer_traits).all(axis=1)]

In [None]:
# Disabled snippet, kept as a bare string literal so it does not execute: it
# would list downloaded NCBI gene files containing the failed-download marker.
# A stray `c` before the opening quotes made this cell a SyntaxError.
"""for file in sorted_file_names:
    file_path = os.path.join('downloaded_ncbi_genes', file)
    with open(file_path, 'r') as f:
        if "<!-- download operation failed -->" in f.read():
            print(file)"""

In [None]:
# Load the "trait and condition" sheet of the pairings workbook into `df`.
df = pd.read_excel("Trait_Condition_Pairings_v2.xlsx", sheet_name="trait and condition")

In [None]:
# Inspect the column labels of the loaded sheet.
df.columns

In [None]:
# Rebinds `traits` to the 'Trait' column of latest_task.csv; from here on it
# no longer holds the sorted task_info.json keys assigned earlier.
traits = pd.read_csv("latest_task.csv")['Trait'].tolist()
len(traits)

In [None]:
# Drop 'Vitamin D Levels' from the trait list. The membership guard makes the
# cell safe to re-run: an unguarded list.remove() raises ValueError once the
# entry is already gone.
if 'Vitamin D Levels' in traits:
    traits.remove('Vitamin D Levels')
len(traits)

In [None]:
# Allowed condition names: every trait plus the two demographic conditions.
values = traits + ['Age', 'Gender']

In [None]:
# Rebinds `rel`: keep the Trait/Condition columns of the rows where the column
# labelled 843 is greater than 2, then drop duplicate pairs (keeping the last
# occurrence of each).
# NOTE(review): what column 843 holds is not visible in this notebook — confirm.
rel = df.loc[df[843] > 2, ['Trait', 'Condition']]
rel = rel.drop_duplicates(keep='last')
rel

In [None]:
# Rows where the trait is not in `traits` AND the condition is not in `values`.
rel[~rel['Trait'].isin(traits) & ~rel['Condition'].isin(values)]

In [None]:
# Rebinds `df` to the "clusters" sheet, then shows an element-wise boolean
# frame marking which cells hold a value found in `traits`.
df = pd.read_excel("Trait_Condition_Pairings_v2.xlsx", sheet_name="clusters")
df.isin(traits)