In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

In [None]:
# Working directory for the whole notebook: the input tables and the
# ./figures_TP output folder are addressed relative to it.
# The location is machine-specific, so it can be overridden through the
# MINING_PROJECT_DIR environment variable; the original path is the default.
PROJECT_DIR = os.environ.get('MINING_PROJECT_DIR',
                             '/mnt/DATA/School/2022-2023/Thesis/Scripting/Mining')
os.chdir(PROJECT_DIR)

### Taxonomy

In [None]:
# Read the per-species overview sheet. Only five columns are loaded and only the
# first 77 rows are kept; empty cells are labelled "Unclassified".
# NOTE(review): presumably rows beyond 77 are not species records - confirm against the sheet.
file = pd.ExcelFile("Overview.xlsx")
species_sheet = pd.read_excel(file, sheet_name="Counts per species", usecols=[0, 1, 2, 3, 20])
taxonomy = species_sheet.iloc[:77].fillna("Unclassified")

# Keep only the rows with a positive "Total TP counts" value.
taxonomy = taxonomy.loc[taxonomy["Total TP counts"] > 0]

# Number of rows per taxonomic group, one series per ring of the nested pie chart.
# groupby sorts its keys, so the three series follow the same family order.
outer = taxonomy.groupby(['Family', 'Subfamily', 'Genus'])['Name'].count()
middle = taxonomy.groupby(['Family', 'Subfamily'])['Name'].count()
inner = taxonomy.groupby(['Family'])['Name'].count()

# Each ring is labelled with the deepest level of its own grouping.
outer_labels = outer.index.get_level_values('Genus')
middle_labels = middle.index.get_level_values('Subfamily')
inner_labels = inner.index.get_level_values('Family')

In [None]:
# Display the filtered taxonomy table (rows with "Total TP counts" > 0).
taxonomy

In [None]:
# Nested pie chart with three rings on one axes: genus counts (outer),
# subfamily counts (middle) and family counts (inner).
fig, ax = plt.subplots(figsize=(12,12))

# One colour map per inner-ring wedge.
cmap1 = plt.cm.Reds
cmap2 = plt.cm.Blues
cmap3 = plt.cm.Greens
family_cmaps = (cmap1, cmap2, cmap3)


def ring_shades(start, stop, counts):
    """Concatenate counts[k] evenly spaced shades taken from the k-th colour map."""
    return [rgba
            for colormap, n_shades in zip(family_cmaps, counts)
            for rgba in colormap(np.linspace(start, stop, n_shades))]


def wedge_style(width):
    """Build a fresh wedgeprops dict for a ring of the given radial width."""
    return dict(width=width, edgecolor='k', linewidth=2)


def white_text():
    """Build a fresh textprops dict for labels drawn inside the wedges."""
    return {'size': 'large', 'fontweight': 'bold', 'color': 'white'}


def hatch_wedges(wedges, positions):
    """Hatch and thicken the wedges at the given positions, skipping positions that do not exist."""
    for pos in positions:
        if pos < len(wedges):
            wedges[pos].set_hatch("///")
            wedges[pos].set_linewidth(4)


def absolute_count(pct):
    """Turn the percentage handed over by ax.pie back into a count of the outer ring."""
    return '{:.0f}'.format(pct*outer_total/100)


inner_colors = [colormap(0.99) for colormap in family_cmaps]
# NOTE(review): the shade counts are hard-coded; presumably they are the number of
# subfamilies (5, 1, 1) and genera (14, 1, 5) per family - confirm against the data.
middle_colors = ring_shades(0.95, 0.7, (5, 1, 1))
outer_colors = ring_shades(0.75, 0.25, (14, 1, 5))

outer_data = outer.values.flatten()
middle_data = middle.values.flatten()
inner_data = inner.values.flatten()
outer_total = outer_data.sum()
middle_total = middle_data.sum()
inner_total = inner_data.sum()

# Outer ring: labels outside the circle, absolute counts printed inside the wedges.
outer_wedges, _, autotexts = ax.pie(
    outer_data, radius=1, colors=outer_colors, startangle=100,
    autopct=absolute_count, pctdistance=0.95,
    labels=outer_labels, labeldistance=1.05, rotatelabels=True,
    wedgeprops=wedge_style(0.1),
    textprops={'size': 'x-large', 'fontweight': 'bold'})
for count_text in autotexts:
    count_text.set_color('white')

# Middle ring, drawn directly inside the outer one.
middle_wedges = ax.pie(
    middle_data, radius=1-0.1, colors=middle_colors, startangle=100,
    labels=middle_labels, labeldistance=0.6, rotatelabels=True,
    wedgeprops=wedge_style(0.36),
    textprops=white_text())[0]
# NOTE(review): positions 4 and 6 are hard-coded; presumably the "Unclassified" subfamilies - confirm.
hatch_wedges(middle_wedges, (4, 6))

# Inner ring; its width equals its radius, so it fills the centre.
inner_wedges = ax.pie(
    inner_data, radius=1-0.46, colors=inner_colors, startangle=100,
    labels=inner_labels, labeldistance=0.05, rotatelabels=True,
    wedgeprops=wedge_style(0.54),
    textprops=white_text())[0]
# NOTE(review): position 2 is hard-coded; presumably the "Unclassified" family - confirm.
hatch_wedges(inner_wedges, (2,))

ax.set(aspect="equal")
fig.tight_layout()
fig.savefig("./figures_TP/Taxonomy_TP.svg")

### Characteristics of found AceT

In [None]:
# Tab-separated protein table; the cells below read its "Length", "Host",
# "Annotation" and "# Species" columns.
protein_table_path = "protein_list_all_TP"
mining_prot = pd.read_csv(protein_table_path, sep="\t")

**Protein length**

In [None]:
# Histogram of the protein lengths.
length = mining_prot["Length"]

# `bins` is the number of evenly spaced bin EDGES between the shortest and the
# longest protein, so the histogram is drawn with bins - 1 bars.
bins = 100
bin_edges = np.linspace(min(length), max(length), bins)

fig, ax = plt.subplots()
ax.hist(length, bins=bin_edges)
ax.set_xlabel('Length [AA]', fontsize=18)
ax.set_ylabel('Count [-]', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
# Fixed major ticks at 0, 4, 8 and 12; minor ticks fill the gaps.
ax.set_yticks(range(0, 16, 4))
ax.minorticks_on()
fig.tight_layout()
fig.savefig("./figures_TP/Protein_length_TP.svg")

**Host distribution**

In [None]:
# Number of proteins per distinct host name (non-null "# Species" entries per host).
host_counts_raw = mining_prot.groupby("Host").count()['# Species']
names = list(host_counts_raw.index)
species=['Pseudomonas aeruginosa','Pseudomonas agarici','Pseudomonas coronafaciens','Pseudomonas fluorescens',\
         'Pseudomonas plecoglossicida', 'Pseudomonas putida', 'Pseudomonas syringae','Pseudomonas tolaasii']

# Pool the host names per species: a host belongs to a species when the species
# name occurs anywhere in the host name (so strain suffixes are ignored).
# Rows are selected by position with .iloc: plain [] with integer keys on a
# string-indexed Series is deprecated, and [] with a set raises on pandas >= 2.0.
species_totals = {}
rem_indices = set(range(len(names)))  # positions not yet claimed by any listed species
for s in species:
    indices = [pos for pos, host_name in enumerate(names) if s in host_name]
    species_totals[s] = host_counts_raw.iloc[indices].sum()
    rem_indices = rem_indices.difference(indices)
# Everything that matched none of the listed species is pooled into one class.
species_totals['Pseudomonas unspecified'] = host_counts_raw.iloc[sorted(rem_indices)].sum()

# One row per species (in the order listed above), single column 'Counts';
# species without any hit are dropped.
host_counts = pd.DataFrame({'Counts': pd.Series(species_totals)})
host_counts = host_counts[host_counts['Counts'] > 0]
# Keep only the second word of each row label (the species epithet) for the plot.
host_counts.index = [label.split(" ")[1] for label in host_counts.index]

In [None]:
# Pie chart of the number of proteins per host species.
fig, ax = plt.subplots(tight_layout=True, figsize=(8,8))

# Seven blue shades followed by one red shade (eight colours in total).
# NOTE(review): presumably the red wedge is meant for the last row of host_counts - confirm.
cmap1 = plt.cm.Reds
cmap2 = plt.cm.Blues
colors = [*cmap2(np.linspace(0.8,0.1,7)),
          *cmap1([0.8])]

labels = host_counts.index
# Select the column by label: `host_counts.sum()[0]` relied on the deprecated
# positional fallback for integer keys on a label-indexed Series.
total = host_counts['Counts'].sum()
ax.pie(host_counts.values.flatten(), labels = labels, rotatelabels = False, colors = colors,
       pctdistance = 0.85, labeldistance = 1.2,
       # ax.pie hands over percentages; convert them back to absolute counts.
       autopct = lambda l: '{:.0f}'.format(l*total/100), startangle = -30,
       textprops={'size': 'xx-large', 'fontweight': 'bold'})
plt.tight_layout()
plt.savefig("./figures_TP/Host_TP.svg")

**Annotation**

In [None]:
# Number of proteins per raw annotation string.
annotation_raw = mining_prot.groupby("Annotation").count()["# Species"]

# Collapse spelling variants and placeholder names of the raw annotations into
# one label per class. The keys must match the input data literally (including
# its typos); annotations that are not listed keep their original text.
mapping = {
    'GNAT family N-acetyltransferase': 'GNAT family N-acetyltransferase',
    'GNAT family acetyltransferase protein': 'GNAT family N-acetyltransferase',
    'acyl-CoA N acetyltransferase': 'acyl-CoA N-acetyltransferase',
    'hypothetical protein': 'hypothetical protein',
    'ORF.29': 'hypothetical protein',
    'PHIKZ041.3': 'hypothetical protein',
    'conserved hypothetical phage protein': 'hypothetical protein',
    'hypothetical phage protein': 'hypothetical protein',
    'unknown': 'hypothetical protein',
    'internal virion protein A': '(putative) internal virion protein A',
    'putative internal virion protein A': '(putative) internal virion protein A',
    'interval virion protein A': '(putative) internal virion protein A',
    'protein inside capsid A': '(putative) internal virion protein A'
}

# Vectorised lookup with fall-back to the original value. This replaces the
# row-by-row chained assignment (mining_prot['Mapping'][i] = ...), which raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write, and the
# bare `except:` that hid every error. Missing annotations stay missing.
mining_prot["Mapping"] = mining_prot["Annotation"].map(lambda label: mapping.get(label, label))

# Positions, within the alphabetically sorted groupby result, in the order in
# which the classes are drawn. Exactly eight classes are expected; .iloc makes
# the positional selection explicit and raises IndexError if there are fewer.
order = [1,2,6,3,4,0,5,7]
annotation = mining_prot.groupby("Mapping").count()["# Species"].iloc[order]
annotation

In [None]:
# Pie chart of the number of proteins per annotation class.
fig, ax = plt.subplots(tight_layout=True, figsize=(14,6))

# Colours follow the row order of `annotation`: four greens, one gray, three reds.
cmap1 = plt.cm.Reds
cmap2 = plt.cm.Greens
green_shades = cmap2(np.linspace(0.8,0.3,4))
red_shades = cmap1(np.linspace(0.8,0.3,3))
colors = [*green_shades, 'tab:gray', *red_shades]

labels = annotation.index
total = annotation.sum()

# The first four wedges are pulled out of the pie, the remaining four are not.
explode = (0.1,) * 4 + (0,) * 4


def absolute_count(pct):
    """Turn the percentage handed over by ax.pie back into an absolute count."""
    return '{:.0f}'.format(pct*total/100)


ax.pie(
    annotation.values.flatten(),
    labels=labels,
    explode=explode,
    rotatelabels=False,
    pctdistance=0.85,
    labeldistance=1.1,
    startangle=-20,
    autopct=absolute_count,
    colors=colors,
    textprops={'size': 'x-large', 'fontweight': 'bold'},
)
fig.tight_layout()
fig.savefig("./figures_TP/Annotation_TP.svg")