In [2]:
import requests
import pandas as pd
from pprint import pprint
# Notebook-wide pandas display settings: show every column of wide frames
# and allow up to 999 characters per cell before truncating.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 999)

### Load FA genes

In [5]:
# Fetch the FA gene set straight from GitHub. The file is tab-separated and is
# read with explicit column names: `ncbi` and `hgnc`.
url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_4_all_genes.txt"
fa_genes = pd.read_csv(
    url,
    delimiter='\t',
    names=['ncbi', 'hgnc'],
)
# Preview the first rows as the cell output.
fa_genes.head()

Unnamed: 0,ncbi,hgnc
0,NCBIGene:2175,FANCA
1,NCBIGene:2187,FANCB
2,NCBIGene:2176,FANCC
3,NCBIGene:2178,FANCE
4,NCBIGene:2188,FANCF


In [6]:
# For the sake of completeness, we'll check whether ClinVar has anything interesting on the FA genes anyway.

In [7]:
# The ClinVar download is an XML file that I don't want to parse. Someone has already parsed it into a TSV, available here:
# https://github.com/macarthur-lab/clinvar/blob/master/output/b38/single/clinvar_alleles.single.b38.tsv.gz

In [32]:
# Read the tab-separated ClinVar allele table from the working directory.
# low_memory=False has pandas process the file in one pass rather than in
# chunks, which avoids mixed-dtype guesses on wide files.
clinvar = pd.read_csv(
    "clinvar_alleles.single.b38.tsv",
    delimiter='\t',
    low_memory=False,
)
# Report how many rows were loaded.
print("number of records: {}".format(clinvar.shape[0]))

number of records: 293882


In [33]:
# Look up one example variant by its HGVS coding-sequence name and show the
# value(s) in its "clinical_significance" column.
clinvar.loc[clinvar["hgvs_c"] == "NM_005957.4:c.1286A>C", "clinical_significance"]

1772    drug response
Name: clinical_significance, dtype: object

In [34]:
# Sanity check: count every ClinVar record (regardless of clinical
# significance) whose gene symbol is one of the FA genes.
fa_symbols = set(fa_genes["hgnc"])
in_fa_gene = clinvar["symbol"].isin(fa_symbols)
print(len(clinvar.loc[in_fa_gene]))

17511


In [35]:
# Keep only the ClinVar records whose `clinical_significance` text mentions
# "drug" in any letter case (e.g. "drug response").
# str.contains(..., case=False) replaces the lower()/count() > 0 round trip;
# regex=False treats "drug" as a plain substring, and na=False maps missing
# clinical_significance values to False so the mask is always boolean.
is_drug_related = clinvar.clinical_significance.str.contains(
    "drug", case=False, regex=False, na=False
)
clinvar_drug = clinvar[is_drug_related]
print(len(clinvar_drug))
# Show the first two matching records as the cell output.
clinvar_drug.head(2)

332


Unnamed: 0,chrom,pos,ref,alt,start,stop,strand,variation_type,variation_id,rcv,...,all_pmids,inheritance_modes,age_of_onset,prevalence,disease_mechanism,origin,xrefs,dates_ordered,gold_stars,conflicted
1772,1,11794419,T,G,11794419,11794419,-,Variant,3521,RCV000003698;RCV000003699;RCV000144922;RCV0001...,...,10677336;10958762;11590551;11742092;11752418;1...,Autosomal dominant inheritance;Autosomal unknown,Adolescent;Infancy,1-5 / 10 000,,germline,Genetic Alliance:MTHFR+deficiency%2C+thermolab...,2008-07-01;0000-00-00;2015-07-23;2016-06-14;20...,3,0
1805,1,11796321,G,A,11796321,11796321,-,Variant,3520,RCV000003697;RCV000144921;RCV000153516;RCV0002...,...,10196703;10323741;10440833;10732818;10869114;1...,Autosomal dominant inheritance;Autosomal unknown,Adolescent;Infancy,1-5 / 10 000;Gastric cancer is the 4th most fr...,gain of function,germline;somatic,Genetic Alliance:MTHFR+deficiency%2C+thermolab...,2017-02-28;0000-00-00;2015-05-12;2016-06-14;20...,3,0


In [36]:
# Tally how often each distinct clinical_significance label occurs among the
# drug-related records, most frequent first.
clinvar_drug["clinical_significance"].value_counts()

drug response                                                  261
Pathogenic, drug response                                       26
Uncertain significance, drug response                           16
Likely pathogenic, drug response                                11
Pathogenic/Likely pathogenic, drug response                      5
drug response, other                                             4
Conflicting interpretations of pathogenicity, drug response      4
Benign, drug response                                            1
Benign, drug response, risk factor                               1
Likely benign, drug response                                     1
Benign/Likely benign, drug response, risk factor                 1
Benign/Likely benign, drug response                              1
Name: clinical_significance, dtype: int64

In [37]:
# Narrow the drug-related records down to those whose gene symbol is in the
# FA gene list, then report how many remain.
fa_gene_mask = clinvar_drug["symbol"].isin(set(fa_genes["hgnc"]))
clinvar_drug_fa = clinvar_drug.loc[fa_gene_mask]
print(clinvar_drug_fa.shape[0])

0
