In [None]:
from Bio import Entrez
Entrez.email = "gsyu93@gmail.com"

# Search ClinVar for records matching the term 'TP53' and parse the response.
handle = Entrez.esearch(db="clinvar", term='TP53')
try:
    result = Entrez.read(handle)
finally:
    # Release the handle even when parsing the response fails.
    handle.close()

In [1]:
from genet import database as db

Please enter your email


In [2]:
# Fetch the ClinVar record that the following cells inspect.
# pecv_score is not called here: it is only defined in a later cell, and it is
# invoked there right after its definition, so calling it at this point would
# raise NameError on a fresh "Restart & Run All".
cv_record = db.GetClinVar('VCV000428864.3')

In [3]:
# Check which class db.GetClinVar instantiates.
type(cv_record)

genet.database.functional.GetClinVar

In [4]:
# Retrieve the pair of sequences for the record using seq()'s default arguments.
cv_record.seq()

('GGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTTGAGTTCCAAGGCCTCATTCAGCTCTCGGAACATCTCGAAGCGCTCACGCCCACGGATCTGC',
 'GGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTTGTTCCAAGGCCTCATTCAGCTCTCGGAACATCTCGAAGCGCTCACGCCCACGGATCTGCAG')

In [7]:
# Length of the alteration stored on the record
# (presumably ClinVar's variantLength -- TODO confirm against genet.database).
cv_record.alt_len

2

In [10]:
import genet
def pecv_score(cv_record:genet.database.functional.GetClinVar,
               sID:str       = 'Sample',
               pe_system:str = 'PE2max',
               cell_type:str = 'HEK293T',
               pbs_min:int   = 7,
               pbs_max:int   = 15,
               rtt_max:int   = 40
               ):

    '''
    Print the DeepPrime inputs derived from a ClinVar record.

    The function is meant to take a variant record obtained through
    GetClinVar in the database module and score it with DeepPrime without
    the caller having to supply sequences separately, emitting a message
    when the variant cannot be handled by DeepPrime. The current body is a
    work in progress: it only prints the values it extracts from the record
    and returns None.

    Args:
        cv_record: record object providing seq(), alt_type and alt_len.
        sID: sample identifier (currently unused by this body).
        pe_system: prime-editor system name (currently unused by this body).
        cell_type: cell type name (currently unused by this body).
        pbs_min: lower bound of the PBS length range (currently unused).
        pbs_max: upper bound of the PBS length range (currently unused).
        rtt_max: maximum RTT length (currently unused).

    Returns:
        None. The header line, both sequences, the edit type and the edit
        length are written to stdout, one per line.
    '''
    print('DeepPrime score of ClinVar record')

    Ref_seq, ED_seq = cv_record.seq()

    edit_type = cv_record.alt_type
    edit_len  = int(cv_record.alt_len)

    # TODO: pass these values, together with pbs_min/pbs_max, rtt_max and
    # pe_system, to the DeepPrime scoring step once it is implemented.
    for value in (Ref_seq, ED_seq, edit_type, edit_len):
        print(value)

In [11]:
# Run the function defined in the previous cell on the record fetched earlier.
pecv_score(cv_record)

DeepPrime score of ClinVar record
GGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTTGAGTTCCAAGGCCTCATTCAGCTCTCGGAACATCTCGAAGCGCTCACGCCCACGGATCTGC
GGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTTGTTCCAAGGCCTCATTCAGCTCTCGGAACATCTCGAAGCGCTCACGCCCACGGATCTGCAG
del
2


In [None]:
from genet import database as db
from genet import predict as pred

# Usage through the genet.predict module; this cell has not been executed.
# NOTE(review): assumes genet.predict exposes pecv_score -- confirm before running.
cv_record = db.GetClinVar('VCV000428864.3')
pred.pecv_score(cv_record)

## Test: prototype of the ClinVar lookup class (`_GetClinVar`)

In [7]:
from Bio import Entrez, SeqIO
Entrez.email = "gsyu93@gmail.com"

In [46]:
class _GetClinVar:
    '''
    Look up one NCBI ClinVar record and expose its GRCh38 variant location.

    The record is retrieved with Biopython's Entrez module. After
    construction the instance carries:

        chr_acc  : accession of the GRCh38 sequence the variant lies on
        start    : 'start' coordinate of the variant (int)
        stop     : 'stop' coordinate of the variant (int)
        ref_nt   : reference allele in VCF form
        alt_nt   : alternate allele in VCF form
        alt_len  : ClinVar 'variantLength' (int)
        alt_type : 'sub', 'ins' or 'del', derived from the allele lengths
    '''

    def __init__(self, 
                 record_id:str,
                 ):

        '''
        Fetch the ClinVar record and read its GRCh38 SequenceLocation.

        Args:
            record_id: a VCV accession (an optional '.N' version suffix is
                dropped before the request) or a ClinVar variation ID.

        Raises:
            ValueError: the record has no SequenceLocation for GRCh38.
            KeyError: the GRCh38 location lacks one of the expected
                attributes (for example the VCF alleles).

        example:
        >>> from genet import database as db
        >>> cv_record = db.GetClinVar('VCV000209223')

        '''
        import xml.etree.ElementTree as ET

        self._record_id = record_id

        if self._record_id.startswith('VCV'):
            # Queried by VCV accession: only the part before the version suffix is sent.
            self.handle = Entrez.efetch(db='clinvar', id=self._record_id.split('.')[0], rettype='vcv')
        else:
            # Queried by variation ID. The flag is spelled 'is_variationid';
            # the previous misspelling ('is_varationid') was not the
            # parameter ClinVar documents.
            self.handle = Entrez.efetch(db='clinvar', id=self._record_id, rettype='vcv',
                                        is_variationid='true', from_esearch="true")

        try:
            self.result = ET.parse(self.handle)
        finally:
            # The XML is fully parsed (or parsing failed); either way release the handle.
            self.handle.close()
        self.root = self.result.getroot()

        self.var_loc = self.root.findall('./VariationArchive/InterpretedRecord/SimpleAllele/Location/SequenceLocation')

        # A record may list several assemblies; only the first GRCh38 entry is used.
        self.info = next(
            (loc for loc in self.var_loc if loc.attrib.get('Assembly') == 'GRCh38'),
            None,
        )
        if self.info is None:
            raise ValueError(
                f"ClinVar record {self._record_id!r} has no GRCh38 SequenceLocation"
            )

        location = self.info.attrib
        self.chr_acc = location['Accession']
        self.start   = int(location['start'])
        self.stop    = int(location['stop'])
        self.ref_nt  = location['referenceAlleleVCF']
        self.alt_nt  = location['alternateAlleleVCF']
        self.alt_len = int(location['variantLength'])

        # Classify the edit purely by comparing the VCF allele lengths.
        if len(self.ref_nt) == len(self.alt_nt):
            self.alt_type = 'sub'
        elif len(self.ref_nt) < len(self.alt_nt):
            self.alt_type = 'ins'
        else:
            self.alt_type = 'del'

    # def __init__: End

    def seq(self, context:int = 60, new_alt:str='atc',):
        '''
        Fetch the reference sequence around the variant and build the edited one.

        The region [start - context, stop + context + alt_len] of chr_acc is
        downloaded as FASTA from the nucleotide database, stored in
        self.ref_seq, and the edited counterpart is stored in self.alt_seq.

        Args:
            context: number of bases kept upstream of the variant; the
                returned strings are at most 1 + 2*context bases long.
            new_alt: accepted for interface compatibility; not used.

        Returns:
            (reference, edited): both truncated to 1 + 2*context bases.
        '''
        # alt_len extra bases are requested downstream so the edited sequence
        # is still long enough to fill the returned window after a deletion
        # removes alt_len bases.
        self.chr_seq_fetch = Entrez.efetch(db="nucleotide", 
                                           id=self.chr_acc, 
                                           rettype="fasta", 
                                           strand=1, 
                                           seq_start = self.start-context, 
                                           seq_stop  = self.stop+context+self.alt_len
                                           )
        try:
            self.ref_seq = str(SeqIO.read(self.chr_seq_fetch, "fasta").seq)
        finally:
            # Close the download handle even if the FASTA cannot be parsed.
            self.chr_seq_fetch.close()

        upstream = self.ref_seq[:context]
        if self.alt_type != 'del':
            # Replace the whole reference allele, not just one base, so that
            # multi-base substitutions do not leave stale reference bases behind.
            # NOTE(review): assumes 'start' is the position of ref_nt's first base
            # for substitutions/insertions -- confirm against ClinVar records.
            self.alt_seq = upstream + self.alt_nt + self.ref_seq[context+len(self.ref_nt):]
        else:
            # Index `context` is the first deleted base; skip alt_len bases.
            self.alt_seq = upstream + self.ref_seq[context+self.alt_len:]

        if self.alt_type == 'ins':
            # Drop the first base of both sequences; presumably this shifts the
            # first inserted base to index `context` -- TODO confirm.
            self.ref_seq = self.ref_seq[1:]
            self.alt_seq = self.alt_seq[1:]

        window = 1 + context*2
        return self.ref_seq[:window], self.alt_seq[:window]


In [47]:
# Exercise the prototype class on the same record, with 80 bases of context
# instead of the default 60.
_cv_record = _GetClinVar('VCV000428864.3')
_cv_record.seq(80)

('TAGGGCCAGGAAGGGGCTGAGGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTTGAGTTCCAAGGCCTCATTCAGCTCTCGGAACATCTCGAAGCGCTCACGCCCACGGATCTGCAGCAACAGAGGAGGGGGAGA',
 'TAGGGCCAGGAAGGGGCTGAGGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTTGTTCCAAGGCCTCATTCAGCTCTCGGAACATCTCGAAGCGCTCACGCCCACGGATCTGCAGCAACAGAGGAGGGGGAGAAG')

In [45]:
# First 80 bases of the stored reference sequence, i.e. the part that seq(80)
# keeps unchanged in front of the variant.
_cv_record.ref_seq[:80]

'TAGGGCCAGGAAGGGGCTGAGGTCACTCACCTGGAGTGAGCCCTGCTCCCCCCTGGCTCCTTCCCAGCCTGGGCATCCTT'

In [39]:
# Reference allele as read from the 'referenceAlleleVCF' attribute.
_cv_record.ref_nt

'TGA'

In [40]:
# Alternate allele as read from the 'alternateAlleleVCF' attribute.
_cv_record.alt_nt

'T'

In [41]:
# Variant length as read from the 'variantLength' attribute.
_cv_record.alt_len

2