In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
#UTSW Proxies
proxies = {"http": "http://proxy.swmed.edu:3128",
           "https": "https://proxy.swmed.edu:3128"}

In [3]:
#Get Oncotree codes
response = requests.get("http://oncotree.mskcc.org/api/tumorTypes?version=oncotree_latest_stable", proxies=proxies)
r = response.text
df = pd.read_json(r)

#format
oncotree_df = df.rename({'code': 'CODE', 'mainType': 'CANCER_TYPE', 'name': 'CANCER_TYPE_DETAILED', 'parent': 'Parent'},
                        axis='columns')

for i in range(len(df)):
    try:
        oncotree_df.loc[i, 'NCI'] = df['externalReferences'][i]['NCI'][0]
    except:
        oncotree_df.loc[i, 'NCI'] = None

oncotree_df = oncotree_df[['CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'CODE', 'NCI', 'Parent']]
oncotree_df = oncotree_df.set_index('CODE')
oncotree_df

Unnamed: 0_level_0,CANCER_TYPE,CANCER_TYPE_DETAILED,NCI,Parent
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TLGL,Mature T and NK Neoplasms,T-Cell Large Granular Lymphocytic Leukemia,C4664,MTNN
HDCN,Histiocytosis,Histiocytic and Dendritic Cell Neoplasms,C3106,MNM
DNT,Glioma,Dysembryoplastic Neuroepithelial Tumor,C9505,ENCG
SCEMU,Cervical Cancer,Signet Ring Mucinous Carcinoma,C40205,CEMU
DESM,Melanoma,Desmoplastic Melanoma,C37257,MEL


In [62]:
#Drop instances where same NCI code applies to more than one tumor
NCI_df = oncotree_df[pd.notnull(oncotree_df.NCI)]
NCI_df.drop(NCI_df.groupby('NCI').filter(lambda x: len(x)>1).index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [36]:
#Scrape NCI synonyms page
base_url = 'https://ncit.nci.nih.gov/ncitbrowser/pages/concept_details.jsf?dictionary=NCI_Thesaurus&code='
suffix = '&type=synonym'

syn_df = pd.DataFrame()

for idx in NCI_df.index:
    result = requests.get(base_url + NCI_df.loc[idx, 'NCI'] + suffix, proxies=proxies)
    df = pd.read_html(result.content, header=0)[9]
    NCI_Syns = df[df.Source=='NCI']
    final_df = NCI_Syns['Term'].to_frame()
    final_df['Oncotree'] = idx
    syn_df = pd.concat([syn_df, final_df])
    
syn_df = syn_df.reset_index(drop=True)

In [37]:
#Drop duplicates and instances where the synonym is the same as the Oncotree code
trim_df = syn_df.drop_duplicates()
trim_df = trim_df[trim_df.Term !=  trim_df.Oncotree]
trim_df.head()

Unnamed: 0,Term,Oncotree
0,Large Cell Granular Lymphogenous Leukemia,TLGL
1,Large Cell Granular Lymphoid Leukemia,TLGL
2,Large Granular Lymphocytic Leukemia,TLGL
3,Large Granular Lymphocytosis,TLGL
4,LGLL,TLGL


In [78]:
Oncotree_trim = oncotree_df['CANCER_TYPE_DETAILED'].reset_index()
Oncotree_trim.columns = ['Oncotree', 'Term']
Oncotree_trim.head()

Unnamed: 0,Oncotree,Term
0,TLGL,T-Cell Large Granular Lymphocytic Leukemia
1,HDCN,Histiocytic and Dendritic Cell Neoplasms
2,DNT,Dysembryoplastic Neuroepithelial Tumor
3,SCEMU,Signet Ring Mucinous Carcinoma
4,DESM,Desmoplastic Melanoma


In [79]:
Synonyms_df = pd.concat([trim_df, Oncotree_trim], sort=True)
Synonyms_df = Synonyms_df.reset_index(drop=True)
Synonyms_df = Synonyms_df.drop_duplicates()

In [80]:
Synonyms_df.Term.is_unique

False

In [83]:
repeat_df = Synonyms_df.groupby('Term').filter(lambda x: len(x)>1)
repeat_df

Unnamed: 0,Oncotree,Term
647,MCHSCNS,Mesenchymal Chondrosarcoma
879,SFT,Hemangiopericytoma
974,GB,Glioblastoma Multiforme
1236,GS,Malignant Glomus Tumor
1419,HDCS,Histiocytic Sarcoma
1595,VIMT,Immature Teratoma
1596,VIMT,Malignant Teratoma
1649,STMYEC,Myoepithelial Carcinoma
1696,MPC,Hemangiopericytoma
1714,DCS,Histiocytic and Dendritic Cell Neoplasms


In [84]:
#Those examples are almost all too general for the oncotree code, just get rid of them
Synonyms_df.drop(repeat_df.index, inplace=True)

In [None]:
Synonyms_df.to_csv('Cancer_Synonyms_Final.csv')