In [1]:
import pandas as pd
import gzip

In [15]:
def taxID_to_SciName(Path="Data/sci_names.dmp.gz"):
    """Build a lookup dict from a gzipped, tab-separated text file.

    Each line is split on tab characters; the field at index 0 becomes the key
    and the field at index 3 becomes the value (presumably taxID and scientific
    name, going by the function name -- confirm against the file layout).
    When a key occurs on several lines, the last line wins.

    Parameters
    ----------
    Path : str
        Location of the gzip-compressed file, opened in text mode.

    Returns
    -------
    dict
        Mapping of field 0 -> field 3, both kept as raw strings (no stripping).

    Raises
    ------
    IndexError
        If any line has fewer than four tab-separated fields.
    """
    with gzip.open(Path, "rt") as handle:
        rows = (raw_line.split("\t") for raw_line in handle)
        return {fields[0]: fields[3] for fields in rows}

In [10]:
def _read_lineage_table(Path, strip_version):
    """Parse one gzipped TSV lineage table into (lineages, ranks) dicts keyed by accession."""
    Lineages = {}
    Ranks = {}
    with gzip.open(Path, "rt") as file:
        next(file, None)  # the first line is a header row
        for Raw in file:
            # Drop the line terminator first so it cannot end up glued to the
            # last comma-separated item of the final column.
            Raw = Raw.rstrip("\n")
            if not Raw:
                continue  # tolerate blank lines (e.g. at end of file)
            Line = Raw.split("\t")
            # With strip_version, keep only the part of the accession before the first "."
            Accession = Line[0].split(".")[0] if strip_version else Line[0]
            Lineages[Accession] = Line[2].split(",")
            Ranks[Accession] = Line[4].split(",")
    return Lineages, Ranks


def Accessions_to_lineages(Path="Data/GenBank_Bacterial_taxID_lineage.tsv.gz",
                           Path2="Data/IBM_assemblies_taxonomy_metadata.tsv.gz",
                           return_ranks=False
                          ):
    """Map assembly accessions to their taxID lineage (and, optionally, rank lineage).

    Both inputs are gzipped, tab-separated tables with one header line. For every
    data row, column 0 is the accession, column 2 a comma-separated taxID lineage
    and column 4 a comma-separated rank lineage of the same accession.

    Parameters
    ----------
    Path : str
        GenBank table. Accessions are truncated at the first "." before use.
    Path2 : str
        IBM table. Accessions are used verbatim. Rows from this table override
        GenBank rows that share the same accession key.
    return_ranks : bool, default False
        When True, also return the accession -> rank-lineage mapping, which is
        otherwise computed and thrown away.

    Returns
    -------
    dict or (dict, dict)
        ``Acc_to_Lineage`` alone by default; ``(Acc_to_Lineage, Acc_to_Rank)``
        when ``return_ranks`` is True. Values are lists of strings.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than five tab-separated columns.

    Side effects
    ------------
    Prints the number of accessions in the combined lineage mapping.
    """
    GB_Lineage, GB_Rank = _read_lineage_table(Path, strip_version=True)
    IBM_Lineage, IBM_Rank = _read_lineage_table(Path2, strip_version=False)

    # IBM entries are merged second so they take precedence on key collisions.
    Acc_to_Lineage = {**GB_Lineage, **IBM_Lineage}
    Acc_to_Rank = {**GB_Rank, **IBM_Rank}

    print(len(Acc_to_Lineage))
    if return_ranks:
        return Acc_to_Lineage, Acc_to_Rank
    return Acc_to_Lineage

In [16]:
# The lookup dict is stored under the same name as the loader function, so after
# the first run the name no longer refers to something callable. Guard the call
# so that re-running this cell is a no-op instead of raising
# "TypeError: 'dict' object is not callable".
if callable(taxID_to_SciName):
    taxID_to_SciName = taxID_to_SciName()

In [12]:
# Load the accession -> taxID-lineage mapping from the default GenBank and IBM tables.
# NOTE(review): only the lineage dict comes back from this call, yet a later cell
# reads a global `Acc_to_Rank` that no visible cell assigns -- confirm where it is
# supposed to be created.
Acc_to_Lineage = Accessions_to_lineages()

425473


In [18]:
# Read the gzipped table and discard its "#query" column in a single expression,
# so `df` is only ever bound to the final frame.
df = (
    pd.read_table("Data/mash_dist.k21.s1000.txt.gz")
    .drop(columns="#query")
)

In [21]:
def Get_accessions(Headers):
    """Reduce each header string to a bare accession identifier.

    For every header:
      1. keep only the text after the last "/";
      2. if that text starts with "GCA" or "GCF", cut it at the first ".";
      3. if it starts with "GCF", replace every "GCF" with "GCA";
      4. cut the result at the first "-".

    Parameters
    ----------
    Headers : iterable of str

    Returns
    -------
    list of str
        One identifier per header, in input order.
    """
    def _normalise(header):
        name = header.rsplit("/", 1)[-1]
        if name.startswith(("GCA", "GCF")):
            name = name.split(".")[0]
            if name.startswith("GCF"):
                name = name.replace("GCF", "GCA")
        # split() returns the whole string when no "-" is present
        return name.split("-")[0]

    return [_normalise(header) for header in Headers]
    
# One normalised accession per column label of df, in column order.
Accessions = Get_accessions(list(df))

In [22]:
# For every distinct accession, look up its taxID lineage and rank lineage.
# Accessions missing from either mapping get the placeholder string "NA" for both.
# NOTE(review): `Acc_to_Rank` is not assigned by any visible cell. Catching only
# KeyError means an undefined mapping now raises NameError instead of silently
# turning every entry into "NA".
Lineages_dict = {}
Rank_dict = {}
for Entry in set(Accessions):
    try:
        Lineage, Ranks = Acc_to_Lineage[Entry], Acc_to_Rank[Entry]
    except KeyError:
        Lineage, Ranks = "NA", "NA"
    Lineages_dict[Entry] = Lineage
    Rank_dict[Entry] = Ranks


In [118]:
# Resolve every accession in Sub_accessions to the scientific name of the taxID
# found at the "species" position of its rank lineage.
# NOTE(review): Sub_accessions is not assigned in any visible cell -- confirm its origin.
Sub_species = []
i = 1  # running counter so each unresolved accession gets its own "NA<i>" label
for Acc in Sub_accessions:
    try:
        Lineage = Lineages_dict[Acc]
        Ranks_Lineage = Rank_dict[Acc]
        Index = Ranks_Lineage.index("species")
        Sub_species.append(taxID_to_SciName[Lineage[Index]])
    except (KeyError, ValueError, IndexError):
        # KeyError:   accession or taxID missing from a lookup dict
        # ValueError: no "species" rank (also raised for the "NA" placeholder string)
        # IndexError: rank and taxID lineages differ in length
        # Anything else (NameError, TypeError, ...) indicates a broken notebook
        # state and is deliberately allowed to propagate.
        Sub_species.append("NA{}".format(i))
        i += 1
print(len(Sub_species))

1477


In [119]:
# Resolve every accession in Sub_accessions to the scientific name of the taxID
# found at the "genus" position of its rank lineage.
# NOTE(review): Sub_accessions is not assigned in any visible cell -- confirm its origin.
Sub_genus = []
i = 1  # running counter so each unresolved accession gets its own "NA<i>" label
for Acc in Sub_accessions:
    try:
        Lineage = Lineages_dict[Acc]
        Ranks_Lineage = Rank_dict[Acc]
        Index = Ranks_Lineage.index("genus")
        Sub_genus.append(taxID_to_SciName[Lineage[Index]])
    except (KeyError, ValueError, IndexError):
        # KeyError:   accession or taxID missing from a lookup dict
        # ValueError: no "genus" rank (also raised for the "NA" placeholder string)
        # IndexError: rank and taxID lineages differ in length
        # Anything else (NameError, TypeError, ...) indicates a broken notebook
        # state and is deliberately allowed to propagate.
        Sub_genus.append("NA{}".format(i))
        i += 1
print(len(Sub_genus))

1477


In [120]:
# Take the same integer positions on both the row and the column axis of df.
# NOTE(review): Sub_positions is not assigned in any visible cell -- confirm where
# it is defined and that it lines up with Sub_accessions.
Submatrix = df.iloc[Sub_positions, Sub_positions]
# Submatrix.head()

In [121]:
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import pylab
import matplotlib.colors as colors
from matplotlib import cm
import numpy as np
from adjustText import adjust_text
from sklearn import manifold 

In [122]:
# Two-component MDS; Submatrix is handed over as an already-computed dissimilarity
# matrix rather than as feature vectors. random_state is pinned (presumably so the
# layout is repeatable between runs -- confirm).
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=2)
# The fitted object's `embedding_` attribute is read by the plotting cells below.
results = mds.fit(Submatrix)

In [126]:
### Create plot
# One row per accession: the two embedding coordinates plus the accession label.
coords = results.embedding_
Plot = pd.DataFrame({"x": coords[:, 0], "y": coords[:, 1], "label": Sub_accessions})

# For each taxonomic level, attach the name column followed by a column holding
# how many rows share that name.
for Name_col, Count_col, Names in (
    ("Species", "Species count", Sub_species),
    ("Genus", "Genus count", Sub_genus),
):
    Plot[Name_col] = Names
    Plot[Count_col] = Plot.groupby(Name_col)[Name_col].transform('count')

Plot.head()

Unnamed: 0,x,y,label,Species,Species count,Genus,Genus count
0,-0.023312,0.095658,GCA_001953695,Salmonella enterica,418,Salmonella,422
1,0.152763,-0.001861,GCA_002192315,Klebsiella pneumoniae,229,Klebsiella,284
2,-0.043283,-0.088255,GCA_002945135,Escherichia coli,578,Escherichia,587
3,-0.027072,0.092449,GCA_000493295,Salmonella enterica,418,Salmonella,422
4,-0.039734,-0.104414,GCA_000743255,Escherichia coli,578,Escherichia,587


In [2]:
### Plot parameters
fig_size = (12, 9)
plt.rcParams["figure.figsize"] = fig_size
SPECIES_COUNT_CUTOFF = 6  # a species is drawn only if it has MORE rows than this

### Plot
# Drop sparsely represented species, then regroup what is left for plotting.
Filtered_Plot = Plot.groupby('Species').filter(
    lambda x: x["Species count"].count() > SPECIES_COUNT_CUTOFF
)
groups = Filtered_Plot.groupby('Species')

fig, ax = plt.subplots()
ax.margins(0.2)  # Optional, adds 20% padding to the autoscaled limits

# Named `palette` rather than `colors` so the imported `matplotlib.colors`
# module alias is not overwritten by an array.
palette = cm.tab10(np.linspace(0, 1, len(groups)))
for (species, members), color in zip(groups, palette):
    ax.plot(members.x,
            members.y,
            marker='o',
            linestyle='',
            ms=10,
            label=species,
            color=color,
            markeredgecolor='k',
            # larger groups are drawn slightly more transparent: n ** -0.1
            alpha=1.0 / len(members) ** 0.1
           )

lgd = ax.legend()
# ax.legend_.remove()
# Axes exposes set_xscale/set_yscale; there is no Axes.xscale/Axes.yscale method.
ax.set_xscale('symlog')
ax.set_yscale('symlog')
ax.set_xlabel("MDS dimension 1")
ax.set_ylabel("MDS dimension 2")
plt.show()

NameError: name 'plt' is not defined