# Stage 1
****

In [1]:
# This script is the first stage of the project CSV2VCF-converter

# In this first stage, we pretend to write a code to be able to identify 4 types of ID variants from a CSV input.

# The 4 types of ID variants are Transcript(:c.), RefSeq Gene (:g.),Protein1 (:p.) and Protein (:p.) 

# We have created 5 CSV files. One per type of ID plus a 5th that contain all types of ID.


In [1]:
import sys
import os
import requests
import chardet
import json
import pandas as pd
import myvariant

### To read input

In [2]:
# To read input

# This code is able to recognise the 4 type of ID variant mention above contained in a CSV file.

CSV_input = '/Users/manolodominguez/Desktop/git-repos/STP_mini_projects/Igor-Manuel/Igor-Manuel/different_input/NM_.csv'


# input to pd data frame

# This is to avoid UnicodeDecodeError: 'utf-8'
with open(CSV_input, 'rb') as f:
    result = chardet.detect(f.read())


df = pd.read_csv(CSV_input, encoding=result['encoding'])

# We will need a extra copy to avoid repetitions
df_draft = df

df

Unnamed: 0,Gene,GRCh38 coordinates,Transcript,Quality,Filter
0,IMPG2,3:101232831,NM_016247.3:c.3183C>G,34/64,PASS
1,BRCA1,17:43067616,NM_007294.3:c.5066T>C,49/131,PASS
2,FBN1,15:48600225,NM_000138.4:c.356G>A,46/99,PASS
3,MYH7,14:23426833,NM_000257.3:c.1988G>A,8/102,NO PASS
4,BORIS,9:133256042,rs56116432,1/200,NO PASS
5,BRCA1,77:43071077,rs1799966,14/85,PASS


### Here we read and found what we want from the CSV


In [3]:
##### Reading CSV file values and looking for variants IDs ######

# Our programme recognise hgvs_notation (e.g AGT:c.803T>C and ENST00000003084:c.1431_1433delTTC) 
# and also dbSNP ID (e.g rs17289390)


# Find Variant Transcript ':c.' in CSV
Transcript = df_draft[df_draft.apply(lambda x:x.str.contains(":c."))].dropna(how='all').dropna(axis=1, how='all')

# Now, we save the results found in a dict key=index and value=variand ID
if Transcript.empty == False:
    ind = Transcript.index.to_list()
    vals = list(Transcript.stack().values)
    row2Transcript = dict(zip(ind, vals))
    # We need to remove the row where rs has been found to avoid repetitions
    # In case in same row more than one kind of ID Variant is stored ('e.g :c.' and '(:p.)')   
    for index, Transcript  in row2Transcript.items(): 
        # This will be done in df_draft
        df_draft = df_draft.drop(index)
        
# Same but now with dbSNP variant ID
rs = df[df.apply(lambda x:x.str.contains("rs\d+"))].dropna(how='all').dropna(axis=1, how='all')

#rs = df_draft[df_draft.apply(lambda x:x.str.contains("rs"))].dropna(how='all').dropna(axis=1, how='all')

# Now, we save the results found in a dict key=index and value=variand ID
if rs.empty == False:
    ind = rs.index.to_list()
    vals = list(rs.stack().values)
    row2rs = dict(zip(ind, vals))
    # We need to remove the row where rs has been found to avoid repetitions
    # In case in same row more than one kind of ID Variant is stored ('e.g :c.' and '(:p.)')   
    for index, rs  in row2rs.items(): 
        # This will be done in df_draft
        df_draft = df_draft.drop(index)



### Let's see if last code worked

In [7]:
row2rs

{4: 'rs56116432', 5: 'rs1799966'}

In [8]:
row2Transcript

{0: 'NM_016247.3:c.3183C>G',
 1: 'NM_007294.3:c.5066T>C',
 2: 'NM_000138.4:c.356G>A',
 3: 'NM_000257.3:c.1988G>A'}

### Now, the request API

In [27]:
row2gene_symbol = dict()
row2start = dict()
server = 'https://rest.ensembl.org/vep/human/hgvs/'

for row, transcript in row2Transcript.items():

    r = requests.get(server+transcript, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        r.raise_for_status()
        sys.exit()
    decoded = r.json()[0]
    index2gene_symbol[row]=decoded['end']
    index2start[row]=decoded['start']


In [29]:
server = 'https://rest.ensembl.org/vep/human/hgvs/'

for row, transcript in row2Transcript.items():

    r = requests.get(server+transcript, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        r.raise_for_status()
        sys.exit()
    decoded = r.json()[0]
    s=str(decoded)
    s2=s[s.find("'gene_symbol'")+13:] #13 is the length of "'gene_symbol'"
    s3=s2[s2.find("'")+1:]
    res=s3[:s3.find("',")]
    index2gene_symbol[row]= res

In [30]:
index2gene_symbol

{0: 'IMPG2', 1: 'BRCA1', 2: 'FBN1', 3: 'MYH7'}

In [42]:
from operator import itemgetter
res = list(map(itemgetter('gene_symbol'), decoded))

KeyError: 'gene_symbol'

In [12]:
decoded

{'start': 23426833,
 'strand': -1,
 'input': 'NM_000257.3:c.1988G>A',
 'colocated_variants': [{'strand': 1,
   'start': 23426833,
   'phenotype_or_disease': 1,
   'seq_region_name': '14',
   'id': 'CM993620',
   'allele_string': 'HGMD_MUTATION',
   'end': 23426833},
  {'strand': 1,
   'start': 23426833,
   'clin_sig_allele': 'T:pathogenic/likely_pathogenic;T:pathogenic',
   'allele_string': 'C/T',
   'end': 23426833,
   'phenotype_or_disease': 1,
   'var_synonyms': 'ClinVar::RCV000158822,RCV000162333,RCV000168409,VCV000042875,RCV000035758,RCV000253409,RCV000477919,RCV000762924,RCV000678721,RCV001170503--PhenCode::FHC0137--Uniprot::VAR_019855',
   'id': 'rs371898076',
   'frequencies': {'A': {'gnomad_sas': 0,
     'gnomad_nfe': 1.758e-05,
     'aa': 0,
     'gnomad_eas': 0,
     'gnomad_oth': 0,
     'gnomad': 7.953e-06,
     'gnomad_afr': 0,
     'gnomad_amr': 0,
     'ea': 0.0001163,
     'gnomad_fin': 0,
     'gnomad_asj': 0}},
   'seq_region_name': '14',
   'pubmed': [24033266,
    

In [19]:

decoded['start']

101232831

In [43]:
len(decoded)

1