# Stage 1
****

In [36]:
##################################################################

# Welcome to the script CSV2VCF-converter                        #
# An assessment for the Introduction to Programming module       #
# By Igor Malashchuk and Manuel Dominguez                        #

##################################################################

# This programme is split into several stages, which are outlined here.
# The code is commented line by line to explain what each step does;
# this header gives a general overview
# followed by details stage by stage.



# This script converts variants stored in a Comma Separated Values (CSV) file
# into Variant Call Format (VCF)

# STAGE 1

## I) Identify 3 (if time maybe more) different ID Variants from a CSV file

### Because different clinical scientists use slightly different CSV layouts,
### we have tried to make the code as flexible as possible,
### but we have established some minimum requirements, listed here.
### The gene symbol and the location of your variant
### must be the first and second columns, respectively, of your CSV file.
### This is easy to do using Microsoft Excel.
### Our code recognises 3 different variant ID formats:

### AGT:c.803T>C
### rs17289390
### ENST00000003084:c.1431_1433delTTC (We need to check this)


### We have tested our script, and it supports single nucleotide variants,
### insertion variants and deletion variants.

### If your CSV has more than one of the mentioned variant IDs in the same row,
### the script will select the first one found, starting from the left.
### After the first, any following variant IDs will be ignored.

### In Stage 1 we have considered the possibility of errors in your variant ID.
### If this happens, the API request will return an error.
### We have solved this issue with one exception


# STAGE 2

### We have noticed that CSV files that have been in use for years are not up to date.
### We check for you whether your gene symbol and location are correct:
### this script compares your gene symbol and variant location with the equivalents provided by the API
### and creates a txt file informing you of any errors or mismatches found.
### The generated VCF uses the gene symbol and the location from the API directly.

# STAGE 3 

### This stage creates the VCF file according to the documentation (VCFv4.3 format)
### https://samtools.github.io/hts-specs/VCFv4.3.pdf


# STAGE 4

### because this is a collaborative project between two people
### we have used Jupyter Notebooks and Github.
### This stage consists of creating a script that performs the same function using the terminal.


# Stage 5

### The last stage consists of creating an interface that performs the same task as stage 4.


In [1]:
# Let's import all modules used in this script:
import json
import os
import os.path
import re
import sys
from datetime import date

import chardet
import numpy as np
import pandas as pd
import requests

In [10]:
# Sub-folder holding the input files; it is appended to `your_path` by plain
# string concatenation, hence the leading slash.
folder = '/input'
# Current working directory of the notebook.
your_path = os.getcwd()

In [11]:
# List the files found in the input folder (<working directory> + '/input').
input_dir = "".join([your_path, folder])
file_list = os.listdir(input_dir)
file_list

['NM_.csv']

In [200]:
# Read the input CSV into a pandas DataFrame.
#
# The later cells search this table for variant IDs written either in HGVS
# notation (e.g. NM_016247.3:c.3183C>G) or as dbSNP IDs (e.g. rs56116432).

# NOTE(review): absolute, machine-specific path - point it at your own input file before running.
CSV_input = '/Users/manolodominguez/Desktop/git-repos/STP_mini_projects/Igor-Manuel/Inputs/NM_.csv'

# Guess the text encoding from the raw bytes so that pd.read_csv does not stop
# with "UnicodeDecodeError: 'utf-8'" on files saved in another encoding.
with open(CSV_input, 'rb') as f:
    result = chardet.detect(f.read())

# The detection is a heuristic and does not work 100% of the time,
# so we strongly recommend that the input be a plain .csv file.
df = pd.read_csv(CSV_input, encoding=result['encoding'])

# Working copy from which rows are removed once a variant ID has been found in them.
# .copy() gives a really independent table; a plain `df_draft = df` would only
# create a second name for the same DataFrame.
df_draft = df.copy()


df

Unnamed: 0,Gene,GRCh38 coordinates,Transcript,Quality,Filter
0,IMPG2,3:101232831,NM_016247.3:c.3183C>G,34/64,PASS
1,BRCA1,17:43067616,NM_007294.3:c.5066T>C,49/131,PASS
2,FBN1,15:48600225,NM_000138.4:c.356G>A,46/99,PASS
3,MYH7,14:23426833,NM_000257.3:c.1988G>A,8/102,NO PASS
4,BORIS,9:133256042,rs56116432,1/200,NO PASS
5,BRCA1,77:43071077,rs1799966,14/85,PASS
6,CFTR,7:117199558,ENST00000003084:c.1431_1433delTTC,99/100,PASS
7,CFTR,7:117559502,ENST00000003084:c.1431_1433delTTC,99/100,PASS
8,HLA-C,6:31271394,NM_002117.5:c.344-46_344-45insT,34/64,PASS
9,HLA-C,6:31271394,rs9281300,34/64,PASS


### Here we read the CSV and find what we need in it


In [201]:
##### Reading CSV file values and looking for variant IDs ######

# Two kinds of variant ID are recognised:
#   * HGVS notation, e.g. AGT:c.803T>C or ENST00000003084:c.1431_1433delTTC
#   * dbSNP IDs,     e.g. rs17289390


def _first_match_per_row(frame, pattern):
    """Map each row label of ``frame`` to its first cell matching ``pattern``.

    Cells are scanned from left to right and only text cells are tested, so
    numeric or missing cells are skipped instead of raising. Rows without a
    matching cell are left out of the result, and an empty ``frame`` simply
    gives an empty dict.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to scan.
    pattern : str
        Regular expression searched for anywhere in the cell text.

    Returns
    -------
    dict
        ``{row label: text of the first matching cell}``.
    """
    regex = re.compile(pattern)
    found = {}
    for row_label, row in frame.iterrows():
        for cell in row:
            if isinstance(cell, str) and regex.search(cell):
                found[row_label] = cell
                # Only the left-most ID of a row is kept.
                break
    return found


# HGVS IDs. The dot is escaped so that only a literal ':c.' counts as a hit.
# The dict always exists (possibly empty), so the API cells can loop over it safely.
row2Transcript = _first_match_per_row(df_draft, r":c\.")

# Remove the rows just handled, so that a row holding two kinds of ID
# (e.g. ':c.' and 'rs') is not picked up a second time below.
df_draft = df_draft.drop(index=list(row2Transcript))

# Same with dbSNP IDs, searched only in the rows that are still left.
row2rs = _first_match_per_row(df_draft, r"rs\d+")
df_draft = df_draft.drop(index=list(row2rs))


# df_draft empty means every row contained a rs or :c. ID
print('Is the DataFrame empty? ', df_draft.empty)

Is the DataFrame empty?  True


### Let's see if last code worked

In [202]:
row2rs

{4: 'rs56116432', 5: 'rs1799966', 9: 'rs9281300', 11: 'rs1799758'}

In [203]:
row2Transcript

{0: 'NM_016247.3:c.3183C>G',
 1: 'NM_007294.3:c.5066T>C',
 2: 'NM_000138.4:c.356G>A',
 3: 'NM_000257.3:c.1988G>A',
 6: 'ENST00000003084:c.1431_1433delTTC',
 7: 'ENST00000003084:c.1431_1433delTTC',
 8: 'NM_002117.5:c.344-46_344-45insT',
 10: 'NM_000237.2:c.953A>G'}

### Now, the request API

In [318]:
# Query the Ensembl VEP REST API once per HGVS variant found in the CSV and
# store, per CSV row, the values needed later on: gene symbol, chromosome,
# position, reference allele and alternative allele.
row2gene_symbol = dict()
row2CHROM = dict()
row2POS = dict()
row2ALT = dict()
row2REF = dict()


server = 'https://rest.ensembl.org/vep/human/hgvs/'

for row, transcript in row2Transcript.items():

    r = requests.get(server+transcript, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        # The API answers with an error status when it cannot resolve the
        # variant ID. Stop with the same exception type that
        # raise_for_status() uses, but say which CSV row and variant failed.
        raise requests.HTTPError(
            "Ensembl VEP request failed for variant {!r} (CSV row {}): HTTP {} - {}".format(
                transcript, row, r.status_code, r.text),
            response=r)
    # Only the first element of the JSON answer is used.
    decoded = r.json()[0]

    # Gene symbol of the first transcript consequence reported.
    row2gene_symbol[row] = decoded["transcript_consequences"][0]['gene_symbol']
    row2CHROM[row] = decoded["seq_region_name"]
    row2POS[row] = decoded["start"]

    # allele_string is split on '/': first item -> REF, second item -> ALT.
    # (decoded["transcript_consequences"][0]['variant_allele'] also works for ALT.)
    alleles = decoded['allele_string'].split('/')
    row2REF[row] = alleles[0]
    row2ALT[row] = alleles[1]

#### Let's see if the dicts work

In [319]:
row2gene_symbol

{0: 'IMPG2',
 1: 'BRCA1',
 2: 'FBN1',
 3: 'MYH7',
 6: 'CFTR',
 7: 'CFTR',
 8: 'HLA-C',
 10: 'LPL'}

In [320]:
row2CHROM

{0: '3', 1: '17', 2: '15', 3: '14', 6: '7', 7: '7', 8: '6', 10: '8'}

In [321]:
row2POS

{0: 101232831,
 1: 43067616,
 2: 48600225,
 3: 23426833,
 6: 117559502,
 7: 117559502,
 8: 31271394,
 10: 19956018}

In [322]:
row2ALT

{0: 'G', 1: 'C', 2: 'A', 3: 'A', 6: '-', 7: '-', 8: 'T', 10: 'G'}

In [323]:
row2REF

{0: 'C', 1: 'T', 2: 'G', 3: 'G', 6: 'TTC', 7: 'TTC', 8: '-', 10: 'A'}

In [324]:
# Now the same lookup for the dbSNP IDs in row2rs: one request per ID to the
# Ensembl VEP REST API, adding the results to the same per-row dictionaries.

server = "https://rest.ensembl.org/vep/human/id/"

# Appended after the ID in the request URL.
interrogation = '?'

for row, rs in row2rs.items():

    r = requests.get(server+rs+interrogation, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        # The API answers with an error status when it cannot resolve the
        # variant ID. Stop with the same exception type that
        # raise_for_status() uses, but say which CSV row and variant failed.
        raise requests.HTTPError(
            "Ensembl VEP request failed for variant {!r} (CSV row {}): HTTP {} - {}".format(
                rs, row, r.status_code, r.text),
            response=r)
    # Only the first element of the JSON answer is used.
    decoded = r.json()[0]

    # Gene symbol of the first transcript consequence reported.
    row2gene_symbol[row] = decoded['transcript_consequences'][0]['gene_symbol']
    row2CHROM[row] = decoded['seq_region_name']
    row2POS[row] = decoded['start']

    # allele_string is split on '/': first item -> REF, second item -> ALT.
    alleles = decoded['allele_string'].split('/')
    row2REF[row] = alleles[0]
    row2ALT[row] = alleles[1]

In [325]:
row2gene_symbol

{0: 'IMPG2',
 1: 'BRCA1',
 2: 'FBN1',
 3: 'MYH7',
 6: 'CFTR',
 7: 'CFTR',
 8: 'HLA-C',
 10: 'LPL',
 4: 'ABO',
 5: 'BRCA1',
 9: 'HLA-C',
 11: 'TSC2'}

In [326]:
row2CHROM

{0: '3',
 1: '17',
 2: '15',
 3: '14',
 6: '7',
 7: '7',
 8: '6',
 10: '8',
 4: '9',
 5: '17',
 9: '6',
 11: '16'}

In [327]:
row2POS

{0: 101232831,
 1: 43067616,
 2: 48600225,
 3: 23426833,
 6: 117559502,
 7: 117559502,
 8: 31271394,
 10: 19956018,
 4: 133256042,
 5: 43071077,
 9: 31271394,
 11: 2088197}

In [328]:
row2ALT

{0: 'G',
 1: 'C',
 2: 'A',
 3: 'A',
 6: '-',
 7: '-',
 8: 'T',
 10: 'G',
 4: 'T',
 5: 'A',
 9: 'A',
 11: 'AG'}

In [329]:
row2REF

{0: 'C',
 1: 'T',
 2: 'G',
 3: 'G',
 6: 'TTC',
 7: 'TTC',
 8: '-',
 10: 'A',
 4: 'C',
 5: 'T',
 9: '-',
 11: 'AGTGAG'}

#### This looks ok

In [330]:
# Last step of stage 1: the dictionaries are going to be stored in a DataFrame.
# Declare its column layout and create an empty frame that uses it.
column_names = [
    "Gene_API",
    'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
]
df_stage1 = pd.DataFrame(data=None, columns=column_names)

In [331]:
# Data source of each column, listed in the order of `column_names`.
# Only eight sources are given, so zip() leaves the last name ('INFO') unused here.
column_sources = [
    row2gene_symbol,   # Gene_API
    row2CHROM,         # CHROM
    row2POS,           # POS
    df["Transcript"],  # ID
    row2REF,           # REF
    row2ALT,           # ALT
    df["Quality"],     # QUAL
    df["Filter"],      # FILTER
]
df_stage1 = pd.DataFrame.from_dict(dict(zip(column_names, column_sources)))
df_stage1

Unnamed: 0,Gene_API,CHROM,POS,ID,REF,ALT,QUAL,FILTER
0,IMPG2,3,101232831,NM_016247.3:c.3183C>G,C,G,34/64,PASS
1,BRCA1,17,43067616,NM_007294.3:c.5066T>C,T,C,49/131,PASS
2,FBN1,15,48600225,NM_000138.4:c.356G>A,G,A,46/99,PASS
3,MYH7,14,23426833,NM_000257.3:c.1988G>A,G,A,8/102,NO PASS
4,ABO,9,133256042,rs56116432,C,T,1/200,NO PASS
5,BRCA1,17,43071077,rs1799966,T,A,14/85,PASS
6,CFTR,7,117559502,ENST00000003084:c.1431_1433delTTC,TTC,-,99/100,PASS
7,CFTR,7,117559502,ENST00000003084:c.1431_1433delTTC,TTC,-,99/100,PASS
8,HLA-C,6,31271394,NM_002117.5:c.344-46_344-45insT,-,T,34/64,PASS
9,HLA-C,6,31271394,rs9281300,-,A,34/64,PASS


# End of stage 1

# Stage 2
****

In [332]:
# In Stage 2 we compare the gene and location in the input with those returned by the Ensembl API,
# and we create a txt file reporting the errors/mismatches found.


In [333]:
# Where the text report with the mismatches is going to be written.

# Name of the report file, without extension.
name_of_file= "inform_error"

# Directory in which the report file is created.
save_path = '/Users/manolodominguez/Desktop/git-repos/STP_mini_projects/Igor-Manuel/Outputs'

# Full path of the report: <save_path>/<name_of_file>.txt
completeName = os.path.join(save_path, "{}.txt".format(name_of_file))

In [334]:
# The input stores the location as one 'chromosome:position' text, so the API
# result gets an equivalent 'Location' column built from CHROM and POS.

# Every value becomes text first, which makes CHROM and POS concatenable.
df_stage1 = df_stage1.applymap(str)
df_stage1['Location'] = df_stage1['CHROM'] + ':' + df_stage1['POS']
df_stage1

Unnamed: 0,Gene_API,CHROM,POS,ID,REF,ALT,QUAL,FILTER,Location
0,IMPG2,3,101232831,NM_016247.3:c.3183C>G,C,G,34/64,PASS,3:101232831
1,BRCA1,17,43067616,NM_007294.3:c.5066T>C,T,C,49/131,PASS,17:43067616
2,FBN1,15,48600225,NM_000138.4:c.356G>A,G,A,46/99,PASS,15:48600225
3,MYH7,14,23426833,NM_000257.3:c.1988G>A,G,A,8/102,NO PASS,14:23426833
4,ABO,9,133256042,rs56116432,C,T,1/200,NO PASS,9:133256042
5,BRCA1,17,43071077,rs1799966,T,A,14/85,PASS,17:43071077
6,CFTR,7,117559502,ENST00000003084:c.1431_1433delTTC,TTC,-,99/100,PASS,7:117559502
7,CFTR,7,117559502,ENST00000003084:c.1431_1433delTTC,TTC,-,99/100,PASS,7:117559502
8,HLA-C,6,31271394,NM_002117.5:c.344-46_344-45insT,-,T,34/64,PASS,6:31271394
9,HLA-C,6,31271394,rs9281300,-,A,34/64,PASS,6:31271394


In [335]:
# Comparison table: the first three columns of the input CSV placed side by side
# with the gene symbol and location obtained from the API.
api_columns = df_stage1[['Gene_API', 'Location']]
csv_columns = df.iloc[:, :3]
df_comparation = pd.concat([csv_columns, api_columns], axis="columns")

In [336]:
df_comparation

Unnamed: 0,Gene,GRCh38 coordinates,Transcript,Gene_API,Location
0,IMPG2,3:101232831,NM_016247.3:c.3183C>G,IMPG2,3:101232831
1,BRCA1,17:43067616,NM_007294.3:c.5066T>C,BRCA1,17:43067616
2,FBN1,15:48600225,NM_000138.4:c.356G>A,FBN1,15:48600225
3,MYH7,14:23426833,NM_000257.3:c.1988G>A,MYH7,14:23426833
4,BORIS,9:133256042,rs56116432,ABO,9:133256042
5,BRCA1,77:43071077,rs1799966,BRCA1,17:43071077
6,CFTR,7:117199558,ENST00000003084:c.1431_1433delTTC,CFTR,7:117559502
7,CFTR,7:117559502,ENST00000003084:c.1431_1433delTTC,CFTR,7:117559502
8,HLA-C,6:31271394,NM_002117.5:c.344-46_344-45insT,HLA-C,6:31271394
9,HLA-C,6:31271394,rs9281300,HLA-C,6:31271394


In [337]:
# First result column: does the gene symbol of the input (column 0)
# equal the gene symbol from the API (column 3)?
gene_matches = df_comparation.iloc[:, 0] == df_comparation.iloc[:, 3]
df_comparation['result1'] = np.where(gene_matches, 'OK', 'ERROR')

In [338]:
# Second result column: does the location of the input (column 1)
# equal the location from the API (column 4)?
location_matches = df_comparation.iloc[:, 1] == df_comparation.iloc[:, 4]
df_comparation['result2'] = np.where(location_matches, 'OK', 'ERROR')

In [339]:
df_comparation

Unnamed: 0,Gene,GRCh38 coordinates,Transcript,Gene_API,Location,result1,result2
0,IMPG2,3:101232831,NM_016247.3:c.3183C>G,IMPG2,3:101232831,OK,OK
1,BRCA1,17:43067616,NM_007294.3:c.5066T>C,BRCA1,17:43067616,OK,OK
2,FBN1,15:48600225,NM_000138.4:c.356G>A,FBN1,15:48600225,OK,OK
3,MYH7,14:23426833,NM_000257.3:c.1988G>A,MYH7,14:23426833,OK,OK
4,BORIS,9:133256042,rs56116432,ABO,9:133256042,ERROR,OK
5,BRCA1,77:43071077,rs1799966,BRCA1,17:43071077,OK,ERROR
6,CFTR,7:117199558,ENST00000003084:c.1431_1433delTTC,CFTR,7:117559502,OK,ERROR
7,CFTR,7:117559502,ENST00000003084:c.1431_1433delTTC,CFTR,7:117559502,OK,OK
8,HLA-C,6:31271394,NM_002117.5:c.344-46_344-45insT,HLA-C,6:31271394,OK,OK
9,HLA-C,6:31271394,rs9281300,HLA-C,6:31271394,OK,OK


In [340]:
# Rows whose gene symbol comparison was flagged 'ERROR'.
has_gene_error = df_comparation['result1'].str.match('ERROR')
gene_error = df_comparation.loc[has_gene_error]
# Drop the columns at positions 1, 4, 5 and 6; positions 0, 2 and 3 remain.
gene_error = gene_error.drop(columns=gene_error.columns[[1, 4, 5, 6]])

In [341]:
gene_error

Unnamed: 0,Gene,Transcript,Gene_API
4,BORIS,rs56116432,ABO


In [342]:
# Store the gene mismatches in a dict: the key is the variant in which the
# mismatch was found, the value is a list with the remaining columns of
# gene_error for that row.
gene_error_by_variant = gene_error.set_index('Transcript')
gene2gene_API = gene_error_by_variant.transpose().to_dict(orient='list')
gene2gene_API

{'rs56116432': ['BORIS', 'ABO']}

In [343]:
# Same operation for location

In [344]:
# Rows whose location comparison was flagged 'ERROR'.
has_location_error = df_comparation['result2'].str.match('ERROR')
location_error = df_comparation.loc[has_location_error]
# Drop the columns at positions 0, 3, 5 and 6; positions 1, 2 and 4 remain.
location_error = location_error.drop(columns=location_error.columns[[0, 3, 5, 6]])
location_error

Unnamed: 0,GRCh38 coordinates,Transcript,Location
5,77:43071077,rs1799966,17:43071077
6,7:117199558,ENST00000003084:c.1431_1433delTTC,7:117559502


In [345]:
# Store the location mismatches in a dict: the key is the variant, the value is
# a list with the remaining columns of location_error for that row.
location_error_by_variant = location_error.set_index('Transcript')
variant2locations = location_error_by_variant.transpose().to_dict(orient='list')
variant2locations

{'rs1799966': ['77:43071077', '17:43071077'],
 'ENST00000003084:c.1431_1433delTTC': ['7:117199558', '7:117559502']}

In [346]:
# Write inform_error.txt from the two mismatch dicts.
#   * both dicts empty        -> a short message saying that no error was found
#   * at least one has data   -> an introduction followed by the location section
#                                and the gene symbol section (count + mismatching rows)


# Text written when no mismatch is found
No_error_found = 'Congratulation, no errors have been found in your CSV with regard to gene symbol and location\n'


# Introduction written when mismatches are found; the mismatches follow it
Errors_found = 'If you are reading this document is because some error(s) or mismatch(s) has been found between the Gene Symbol and/or the location of your variants\nWhen these has been compared with API Emsembl.\nWe show you the differences found in this document.\n\n'


# Location section: number of mismatches and column order of the rows
how_many_location_error = ' '.join(['Number of location mismachts found:', str(len(location_error))])

header_location = 'Your location     variant    API location\n'

# Gene symbol section: number of mismatches and column order of the rows
how_many_GS_error = ' '.join(['Number of Gene Symbol mismachts found:', str(len(gene_error))])

header_gene_symbol = 'Your Gene_symbol,     Variant,       API Gene_symbol\n'
separator = '\n'

location_title = "############# LOCATION'S MISMACHTS ############## \n\n"
gene_title = "\n\n############# GENE_SYMBOL'S MISMACHTS #############\n\n"

# The "no error" message is only right when BOTH dicts are empty. Any other
# combination must produce the full report; testing the two dicts in nested
# ifs would write no file at all when only gene symbol mismatches exist.
if not variant2locations and not gene2gene_API:
    with open(completeName,'w') as out:
        out.writelines([No_error_found])
else:
    # Location section ('w' starts a fresh file)
    with open(completeName,'w') as out:
        out.writelines([Errors_found, location_title, how_many_location_error, separator, header_location])
    # The rows are appended after the file is closed, so the order is kept
    location_error.to_csv(completeName,
          header=None, index=None, sep='\t', mode='a')

    # Gene symbol section, appended to the same file
    with open(completeName,'a') as out:
        out.writelines([gene_title, how_many_GS_error, separator,header_gene_symbol])
    gene_error.to_csv(completeName,
          header=None, index=None, sep='\t', mode='a')

# End of stage 2

The output generated

If you are reading this document is because some error(s) or mismatch(s) has been found between the Gene Symbol and/or the location of your variants
When these has been compared with API Emsembl.
We show you the differences found in this document.

############# LOCATION'S MISMACHTS ############## 

Number of location mismachts found: 1
Your location     variant    API location
77:43071077	rs1799966	17:43071077


############# GENE_SYMBOL'S MISMACHTS #############

Number of Gene Symbol mismachts found: 1
Your Gene_symbol,     Variant,       API Gene_symbol
BORIS	rs56116432	ABO


# Stage 3
****

In [347]:
# In this stage we create the VCF file

In [348]:
# The VCF follows the structure explained in https://samtools.github.io/hts-specs/VCFv4.3.pdf
# We have created the minimum structure that a VCF document must have
# according to the documentation (VCFv4.3 format)


# An example here

#      ##fileformat=VCFv4.3
#      #CHROM POS      ID         REF   ALT    QUAL  FILTER   INFO                             
#      20     14370    rs6054257  G     A      29    PASS    NS=3;DP=14;AF=0.5;DB;H2



# Here we explain how we are going to fill the columns of the  VCF file.

### Mandatory columns ###
# CHROM taken from df_stage1['CHROM']
# POS taken from df_stage1['POS']
# ID taken df_stage1['ID']
# REF taken from df_stage1['REF']
# ALT taken from df_stage1['ALT']
# QUAL taken from df_stage1['QUAL']
# FILTER taken from df_stage1['FILTER']
# INFO filled with (‘.’)


In [349]:
# Destination of the VCF file and the fixed lines written at its top.

# Name of the VCF file, without extension.
name_of_file2 = "The_VCF_file"

# Directory in which the VCF file is created.
save_path = '/Users/manolodominguez/Desktop/git-repos/STP_mini_projects/Igor-Manuel/Outputs'

# Full path of the VCF: <save_path>/<name_of_file2>.vcf
completeName2 = os.path.join(save_path, "{}.vcf".format(name_of_file2))


# Meta-information lines: file format and today's date written as YYYYMMDD.
file_format = '##fileformat=VCFv4.3\n'
today = date.today()
d1 = "{:%Y%m%d}".format(today)
file_Date = '##fileDate={}\n'.format(d1)

# Header line: the eight column names separated by tabs.
header_line = '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']) + '\n'

In [350]:
# Create the VCF file (mode 'w' replaces an existing one) and write the two
# meta-information lines followed by the header line.
with open(completeName2,'w') as out:
    for line in (file_format, file_Date, header_line):
        out.write(line)

In [351]:
# Table to be added to the VCF file: df_stage1 plus an INFO column filled with '.'.
# assign() returns a new DataFrame, so df_stage1 itself is left untouched
# (with `df_stage2 = df_stage1` both names would point to the same table and
# the INFO column would be added to df_stage1 as well).
df_stage2 = df_stage1.assign(INFO='.')

# VCF columns first, followed by the two helper columns of the earlier stages.
df_stage2 = df_stage2[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO' ,'Gene_API', 'Location']]
df_stage2

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,Gene_API,Location
0,3,101232831,NM_016247.3:c.3183C>G,C,G,34/64,PASS,.,IMPG2,3:101232831
1,17,43067616,NM_007294.3:c.5066T>C,T,C,49/131,PASS,.,BRCA1,17:43067616
2,15,48600225,NM_000138.4:c.356G>A,G,A,46/99,PASS,.,FBN1,15:48600225
3,14,23426833,NM_000257.3:c.1988G>A,G,A,8/102,NO PASS,.,MYH7,14:23426833
4,9,133256042,rs56116432,C,T,1/200,NO PASS,.,ABO,9:133256042
5,17,43071077,rs1799966,T,A,14/85,PASS,.,BRCA1,17:43071077
6,7,117559502,ENST00000003084:c.1431_1433delTTC,TTC,-,99/100,PASS,.,CFTR,7:117559502
7,7,117559502,ENST00000003084:c.1431_1433delTTC,TTC,-,99/100,PASS,.,CFTR,7:117559502
8,6,31271394,NM_002117.5:c.344-46_344-45insT,-,T,34/64,PASS,.,HLA-C,6:31271394
9,6,31271394,rs9281300,-,A,34/64,PASS,.,HLA-C,6:31271394


In [352]:
# Append the data lines to the VCF file, below the header already written.
# Only the eight columns named in the header line are written: df_stage2 also
# carries 'Gene_API' and 'Location', and writing them would give every data
# line two fields more than the header declares.
# NOTE(review): values such as QUAL '34/64', FILTER 'NO PASS' and the '-' alleles
# are written as they are; they probably do not satisfy the VCFv4.3 field rules - TODO confirm.
vcf_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
df_stage2[vcf_columns].to_csv(completeName2,
          header=False, index=False, sep='\t', mode='a')