# Stage 1
****

In [36]:
# This script is the first stage of the project CSV2VCF-converter

# In this first stage, we pretend to write a code to be able to identify 
# different types of ID variants from a CSV input.

# Then, obtain data using this ID in a Request API

# Finally, generate an df with the data neccesary to write a VCF file.



In [1]:
import sys
import os
import requests
import chardet
import json
import pandas as pd


### To read input

In [145]:
# To read input

# This code is able to recognise the 4 type of ID variant mention above contained in a CSV file.

CSV_input = '/Users/manolodominguez/Desktop/git-repos/STP_mini_projects/Igor-Manuel/Igor-Manuel/different_input/NM_.csv'

## The row 4 and 5th contain errors

# input to pd data frame

# This is to avoid UnicodeDecodeError: 'utf-8'
with open(CSV_input, 'rb') as f:
    result = chardet.detect(f.read())

# This not work 100% of the times. To avoid UnicodeDecodeError: 'utf-8'
# We strongly recommend that the input be .csv

df = pd.read_csv(CSV_input, encoding=result['encoding'])

# We will need a extra copy to avoid repetitions
df_draft = df

print(' The row 4 and 5th contain errors')

df

 The row 4 and 5th contain errors


Unnamed: 0,Gene,GRCh38 coordinates,Transcript,Quality,Filter
0,IMPG2,3:101232831,NM_016247.3:c.3183C>G,34/64,PASS
1,BRCA1,17:43067616,NM_007294.3:c.5066T>C,49/131,PASS
2,FBN1,15:48600225,NM_000138.4:c.356G>A,46/99,PASS
3,MYH7,14:23426833,NM_000257.3:c.1988G>A,8/102,NO PASS
4,BORIS,9:133256042,rs56116432,1/200,NO PASS
5,BRCA1,77:43071077,rs1799966,14/85,PASS


### Here we read and found what we want from the CSV


In [3]:
##### Reading CSV file values and looking for variants IDs ######

# Our programme recognise hgvs_notation (e.g AGT:c.803T>C and ENST00000003084:c.1431_1433delTTC) 
# and also dbSNP ID (e.g rs17289390)


# Find Variant Transcript ':c.' in CSV
Transcript = df_draft[df_draft.apply(lambda x:x.str.contains(":c."))].dropna(how='all').dropna(axis=1, how='all')

# Now, we save the results found in a dict key=index and value=variand ID
if Transcript.empty == False:
    ind = Transcript.index.to_list()
    vals = list(Transcript.stack().values)
    row2Transcript = dict(zip(ind, vals))
    # We need to remove the row where rs has been found to avoid repetitions
    # In case in same row more than one kind of ID Variant is stored ('e.g :c.' and '(:p.)')   
    for index, Transcript  in row2Transcript.items(): 
        # This will be done in df_draft
        df_draft = df_draft.drop(index)
        
# Same but now with dbSNP variant ID
rs = df[df.apply(lambda x:x.str.contains("rs\d+"))].dropna(how='all').dropna(axis=1, how='all')

#rs = df_draft[df_draft.apply(lambda x:x.str.contains("rs"))].dropna(how='all').dropna(axis=1, how='all')

# Now, we save the results found in a dict key=index and value=variand ID
if rs.empty == False:
    ind = rs.index.to_list()
    vals = list(rs.stack().values)
    row2rs = dict(zip(ind, vals))
    # We need to remove the row where rs has been found to avoid repetitions
    # In case in same row more than one kind of ID Variant is stored ('e.g :c.' and '(:p.)')   
    for index, rs  in row2rs.items(): 
        # This will be done in df_draft
        df_draft = df_draft.drop(index)



### Let's see if last code worked

In [4]:
row2rs

{4: 'rs56116432', 5: 'rs1799966'}

In [5]:
row2Transcript

{0: 'NM_016247.3:c.3183C>G',
 1: 'NM_007294.3:c.5066T>C',
 2: 'NM_000138.4:c.356G>A',
 3: 'NM_000257.3:c.1988G>A'}

### Now, the request API

In [97]:
row2gene_symbol = dict()
row2CHROM = dict()
row2POS = dict()
row2ALT = dict()
row2REF = dict()


server = 'https://rest.ensembl.org/vep/human/hgvs/'

for row, transcript in row2Transcript.items():

    r = requests.get(server+transcript, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        r.raise_for_status()
        sys.exit()
    decoded = r.json()[0]
    
# For row2gene_symbol
    row2gene_symbol[row]= decoded["transcript_consequences"][0]['gene_symbol']

# For row2CHROM
    row2CHROM[row] = decoded["seq_region_name"]
    
# For row2POS 
    row2POS[row] = decoded["start"]
    
# For row2ALT 
    row2ALT[row] = decoded["transcript_consequences"][0]['variant_allele']

# For row2REF
    row2REF[row] = decoded['allele_string'][0]

#### Let's see if the ditc work

In [104]:
row2gene_symbol

{0: 'IMPG2', 1: 'BRCA1', 2: 'FBN1', 3: 'MYH7', 4: 'MYH7', 5: 'MYH7'}

In [105]:
row2CHROM

{0: '3', 1: '17', 2: '15', 3: '14', 4: '14', 5: '14'}

In [106]:
row2POS

{0: 101232831, 1: 43067616, 2: 48600225, 3: 23426833, 4: 23426833, 5: 23426833}

In [107]:
row2ALT

{0: 'G', 1: 'C', 2: 'A', 3: 'A', 4: 'A', 5: 'A'}

In [108]:
row2REF

{0: 'C', 1: 'T', 2: 'G', 3: 'G', 4: 'G', 5: 'G'}

In [129]:
# Now, we need to do the same with row2rs

server = 'https://rest.ensembl.org/vep/human/hgvs/'

for row, rs in row2rs.items():

    r = requests.get(server+transcript, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        r.raise_for_status()
        sys.exit()
    decoded = r.json()[0]
    
# For row2gene_symbol
    row2gene_symbol[row]= decoded["transcript_consequences"][0]['gene_symbol']

# For row2CHROM
    row2CHROM[row] = decoded["seq_region_name"]
    
# For row2POS 
    row2POS[row] = decoded["start"]
    
# For row2ALT 
    row2ALT[row] = decoded["transcript_consequences"][0]['variant_allele']

# For row2REF
    row2REF[row] = decoded['allele_string'][0]

In [130]:
row2gene_symbol

{0: 'IMPG2', 1: 'BRCA1', 2: 'FBN1', 3: 'MYH7', 4: 'MYH7', 5: 'MYH7'}

In [131]:
row2CHROM

{0: '3', 1: '17', 2: '15', 3: '14', 4: '14', 5: '14'}

In [132]:
row2POS

{0: 101232831, 1: 43067616, 2: 48600225, 3: 23426833, 4: 23426833, 5: 23426833}

In [133]:
row2ALT

{0: 'G', 1: 'C', 2: 'A', 3: 'A', 4: 'A', 5: 'A'}

In [134]:
row2REF

{0: 'C', 1: 'T', 2: 'G', 3: 'G', 4: 'G', 5: 'G'}

#### This looks ok

In [139]:
# To finish this stage, 
# Let's save our dicts in a df

column_names = ["Gene", 'CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO']
df_stage1 = pd.DataFrame(columns = column_names)

In [148]:
df_stage1 = pd.DataFrame.from_dict(dict(zip(column_names, [row2gene_symbol, row2CHROM, row2POS,df["Transcript"] ,row2ALT,row2REF, df["Quality"], df["Filter"]  ])))
df_stage1

Unnamed: 0,Gene,CHROM,POS,ID,REF,ALT,QUAL,FILTER
0,IMPG2,3,101232831,NM_016247.3:c.3183C>G,G,C,34/64,PASS
1,BRCA1,17,43067616,NM_007294.3:c.5066T>C,C,T,49/131,PASS
2,FBN1,15,48600225,NM_000138.4:c.356G>A,A,G,46/99,PASS
3,MYH7,14,23426833,NM_000257.3:c.1988G>A,A,G,8/102,NO PASS
4,MYH7,14,23426833,rs56116432,A,G,1/200,NO PASS
5,MYH7,14,23426833,rs1799966,A,G,14/85,PASS


# End of stage 1