## Annotate MT variants using the MSeqDR tool

Reference and curl API instructions: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5992054/   
Database: https://mseqdr.org/portal.php?dbsource=genomic&name=m.16127A%3EG&x=0&y=0

In [None]:
# --- Run parameters -------------------------------------------------------

# Directory that receives the annotation outputs of this notebook.
outdir = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/anno_variants"

# The input vcf path
vcf_f = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/anno_variants/variants.vcf"

# Reference FASTA; only read when isRawVCF is True.
mt_fasta = "/data/Mito_Trace/data/external/GRCh38_MT_blacklist/chrM.fasta"  # "example_data/hg19_chr22.fa"

# True: vcf_f is reformatted first; False: vcf_f is used as-is.
isRawVCF = False

In [None]:
from Bio import SeqIO
import pandas as pd
from os.path import join
import numpy as np

In [None]:
anno_json_f = join(outdir, "vars_anno.json")
varType_out_f = join(outdir, "varsType.tsv")
anno_out_f = join(outdir, "anno_variants.tsv")

In [None]:
## Create proper formatted vcf file

if isRawVCF:
    print("Processing VCF")
    # Keep the sequence of the last FASTA record available in the session.
    for record in SeqIO.parse(mt_fasta, "fasta"):
        mt_seq = record.seq

    # `raw_vcf` was never assigned before being used here; the configured
    # input path is taken as the raw file.
    # NOTE(review): confirm vcf_f is the intended raw input.
    raw_vcf = vcf_f
    vcf_path = raw_vcf.replace(".vcf", ".fmt.vcf")

    vcf = pd.read_csv(raw_vcf, sep='\t')
    # Fill the fixed VCF columns with the "." placeholder.
    vcf["QUAL"] = "."
    vcf["FILTER"] = "."
    vcf["INFO"] = "."
    vcf["ID"] = "."
    # Keep only the last character of every REF entry.
    vcf["REF"] = vcf["REF"].apply(lambda x: x[-1])
    vcf = vcf[["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]]

    header = "##fileformat=VCFv4.0"
    header = header + "\n" + f"##reference=file:/{mt_fasta}"
    # Write the two header lines and the table through ONE handle opened in
    # "w" mode. Previously the header went to an append-mode handle while
    # to_csv reopened the path, which left the header after the table (and
    # without a trailing newline) once the handle was flushed.
    with open(vcf_path, 'w', newline='') as file:
        file.write(header + "\n")
        vcf.to_csv(file, sep='\t', index=False)

else:
    vcf_path = vcf_f

## Classify variants as transition or transversion

In [None]:
import src.utils.variant_utils as vu

In [None]:
# Load the tab-separated variant table; the first two lines of the file are
# skipped, so the third line supplies the column names.
variants = pd.read_csv(
    vcf_path,
    sep="\t",
    skiprows=2,
    index_col=None,
)
variants

#variants["REF"] = [x[:-1] for x in variants["REF"]]

def type_of_variants(variants):
    """Label every variant row with its substitution class and change string.

    Two columns are added to ``variants`` in place, and the same frame is
    returned:

    * ``"variant type"``: ``"Undefined"`` when REF or ALT is ``"N"``,
      ``"Transition"`` when the REF/ALT pair is A/G or T/C (either
      direction), ``"Transversion"`` for everything else.
    * ``"variant change"``: REF and ALT joined as ``"REF>ALT"``.
    """
    transition_pairs = ({"A", "G"}, {"T", "C"})

    def classify(row):
        alleles = {row["REF"], row["ALT"]}
        if "N" in alleles:
            return "Undefined"
        return "Transition" if alleles in transition_pairs else "Transversion"

    variants["variant type"] = variants.apply(classify, axis=1)
    variants["variant change"] = variants["REF"] + ">" + variants["ALT"]
    return variants

In [None]:
# Classify every variant, key the table by its "ID" column, then save it as
# a tab-separated file.
typed_variants = type_of_variants(variants=variants)
variants = typed_variants.set_index("ID")
variants.to_csv(varType_out_f, sep="\t")
#vu.type_of_variants(variants=variants, to_preproc=False)

## Run annotation

In [None]:
import shlex

# Assemble the curl call that POSTs the VCF and stores the response in
# anno_json_f. Every interpolated piece is shell-quoted, so a path with
# spaces or shell metacharacters (and the "?" in the URL) reaches curl as a
# single literal argument.
api_url = "https://mseqdr.org/mtannotapi.php?format=vcf"
cmd = " ".join([
    "curl -s -X POST",
    shlex.quote(api_url),
    "--data-binary",
    shlex.quote("@" + vcf_path),  # "@" makes curl read the body from the file
    "-o",
    shlex.quote(anno_json_f),
])
print(cmd)

In [None]:
# Run the command held in `cmd` through the system shell (IPython `!` escape).
!{cmd}

# Check results

In [None]:
import json

# Parse the annotation response stored at anno_json_f. The context manager
# closes the file handle as soon as parsing is done.
with open(anno_json_f, encoding="utf-8") as anno_handle:
    var_ann = json.load(anno_handle)
var_ann

In [None]:
# Show the top-level keys of the parsed annotation response.
var_ann.keys()

In [None]:
# Build the population table once (indexed by "Input"), turn "-" entries
# into NaN and drop the columns left without any value. The shape is
# printed before and after the clean-up.
population = pd.DataFrame(var_ann["population"]).set_index("Input")
print(population.shape)
population[population == "-"] = np.nan
population = population.dropna(how="all", axis=1)
print(population.shape)


In [None]:
def _annotation_table(annotations, section):
    """Return one section of the annotation response as a cleaned table.

    The section is turned into a DataFrame indexed by its "Input" column,
    "-" entries are replaced by NaN and columns left without any value are
    dropped. The shape is printed before and after the clean-up.
    """
    table = pd.DataFrame(annotations[section]).set_index("Input")
    print(table.shape)
    table = table.mask(table == "-").dropna(how="all", axis=1)
    print(table.shape)
    return table


# Same clean-up for each of the four sections used below.
population = _annotation_table(var_ann, "population")
dbnsfp = _annotation_table(var_ann, "dbnsfp")
#dbnsfp["Ensembl_transcriptid"].unique()
mseqdr = _annotation_table(var_ann, "mseqdr")
general = _annotation_table(var_ann, "general")

In [None]:
# Put the four annotation tables side by side, aligned on their shared
# index; verify_integrity asks pandas to raise if the combined column
# labels contain duplicates.
var_anno_df = pd.concat(
    (general, population, dbnsfp, mseqdr),
    axis="columns",
    verify_integrity=True,
)
var_anno_df

In [None]:
# Show the rows whose "Pos" value occurs more than once, ordered by position.
pos_counts = var_anno_df.groupby("Pos").size()
pos = pos_counts > 1
shared_pos = pos.index[pos]
var_anno_df[var_anno_df["Pos"].isin(shared_pos)].sort_values("Pos")

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Columns holding exactly one distinct non-missing value are collected in
# drop_cols; every other column gets a count plot in its own figure.
drop_cols = []
for col in var_anno_df.columns:
    n_distinct = var_anno_df[col].nunique()  # NaN is not counted
    if n_distinct == 1:
        drop_cols.append(col)
        continue
    f = plt.figure()
    sns.countplot(data=var_anno_df, x=col)
    plt.xticks(rotation=90)

In [None]:
# Remove the columns collected in drop_cols, then write the variant table
# and its annotations side by side as a tab-separated file.
var_anno_df = var_anno_df.drop(columns=drop_cols)
# The annotation index is overwritten positionally with the variants index.
# NOTE(review): this assumes both tables list the same variants in the same
# order - confirm against the API response.
var_anno_df.index = variants.index
annotated = pd.concat([variants, var_anno_df], axis=1)
annotated.to_csv(anno_out_f, sep="\t", index=True)
#var_anno_df.to_csv(anno_out_f, sep="\t", index=True)