# Group genomes by clade

The genome-wise classification results are summarized by clade, applying the following criteria:

1) Columns "Accession ID" and "Isolate ID" are dropped, and "Clade" is used as the grouping key.

2) Average in each clade is computed for columns "Contamination" and "Completeness".

3) The most frequent value (mode) within each clade is reported for columns "GTDB genus", "GTDB species", "Proposed genus" and "Proposed species".

4) All distinct values within each clade are gathered for columns "ANI species", "ConSpeciFix species" and "PopCOGenT species".

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the genome-wise classification summary (tab-separated, Latin-1 encoded)
table_path = 'Genomic_classification_summary.tsv'
df = pd.read_csv(table_path, sep='\t', encoding='latin1')
df.head()

Unnamed: 0,Accession ID,Isolate ID,Clade,Completeness,Contamination,ANI specie,ConSpeciFix specie,PopCOGenT specie,GTDB genus,GTDB specie,Proposed genus,Proposed specie
0,GCA_902555435.1,AG-390-C17,Ia.3.VI,932,0,Unk,"[1, 2, 3]",117,Pelagibacter,Pelagibacter sp902555435,Ampluspelagibacter,kiloensis
1,GCA_XXX,HIMB1430,Ia.3.VI,976,5,1,1,13,Pelagibacter,Pelagibacter sp902520475,Ampluspelagibacter,kiloensis
2,GCA_XXX,HIMB1485,Ia.3.VI,100,14,1,1,36,Pelagibacter,Pelagibacter sp902520475,Ampluspelagibacter,kiloensis
3,GCA_XXX,HIMB1488,Ia.3.VI,100,19,1,1,67,Pelagibacter,Pelagibacter sp902520475,Ampluspelagibacter,kiloensis
4,GCA_XXX,HIMB1490,Ia.3.VI,962,2,1,1,19,Pelagibacter,Pelagibacter sp902520475,Ampluspelagibacter,kiloensis


In [4]:
# Keep only the species epithet of the GTDB name ("Genus epithet" -> "epithet").
# Values without a second space-separated token are left unchanged, so
# re-running this cell does not turn the already-shortened names into NaN.
gtdb_names = df["GTDB specie"]
gtdb_epithet = gtdb_names.str.split(" ").str[1]
df["GTDB specie"] = gtdb_epithet.where(gtdb_epithet.notna(), gtdb_names)

In [6]:
# Count the number of genomes (rows) in each clade
genomes_per_clade = df.groupby("Clade").size()
genomes_per_clade

Clade
II          24
III          7
Ia.1.I       4
Ia.3.I       2
Ia.3.II      6
Ia.3.III     4
Ia.3.IV      2
Ia.3.V      22
Ia.3.VI     47
Ia.3.VII    20
Ia.4.II     20
Ia.4.N1      1
Ia.4.N2      3
Ia.4.N4      2
Ia.4.N5     13
Ia.4.N6      4
Ia.4.N8      6
Ib.1         5
Ib.1.III    20
Ib.2.I       5
Ib.2.N7      5
Ib.4.1       1
Ib.4.N8      5
Ib.4.N9      4
Ib.N10       3
Ib.N11       3
Ic           1
dtype: int64

In [6]:
# Aggregation helper: most frequent value of a column
def calc_mode(x):
    """Return the mode of ``x``, or NaN when no mode can be computed.

    ``x.mode()`` may return several values when there is a tie; the first
    one of that result is returned. If the result is empty, ``np.nan`` is
    returned instead.
    """
    modes = x.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]

In [9]:
# Split columns according to the aggregation applied to each of them.
# Column names follow the header of the loaded table, which uses the
# singular "specie" (e.g. "ANI specie", "GTDB specie").

# Columns meant to be summarised by their per-clade average
mean_cols = ["Completeness", "Contamination"]

# Columns summarised by the list of distinct values found in each clade
set_cols = ["ANI specie", "PopCOGenT specie", "ConSpeciFix specie"]

# Identifier / grouping columns that are never aggregated
id_cols = ["Accession ID", "Isolate ID", "Clade"]

# Fail early with a clear message instead of a KeyError inside groupby().agg()
missing_cols = [c for c in set_cols if c not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from the table: {missing_cols}")

# Every remaining column is summarised by its mode. Columns that pandas
# auto-named "Unnamed: N" (presumably a row index saved with the file) carry
# no classification data and are skipped.
excluded_cols = set(mean_cols + set_cols + id_cols)
categorical_cols = [
    c for c in df.columns
    if c not in excluded_cols and not str(c).startswith("Unnamed:")
]


In [10]:
def unique_list(x):
    """Return the distinct non-missing values of ``x`` as a sorted list."""
    distinct_values = set(x.dropna())
    ordered = list(distinct_values)
    ordered.sort()
    return ordered

In [11]:
# Map each summarised column to its aggregation function:
# distinct-value lists for the set columns, mode for the categorical ones.
agg_dict = {
    **{col: unique_list for col in set_cols},
    **{col: calc_mode for col in categorical_cols},
}

In [12]:
# Collapse the genome-level table into one row per clade.
# NOTE(review): agg_dict has no entry for the mean columns, so no per-clade
# averages are produced here - confirm this is intended.
genome_table = df.drop(columns=["Accession ID", "Isolate ID", "Contamination"])
clade_groups = genome_table.groupby("Clade", as_index=False)
summary2 = clade_groups.agg(agg_dict)

In [13]:
# Preview the first rows of the per-clade summary
summary2.head(n=5)

# Save the per-clade summary as a tab-separated file, without the row index
summary2.to_csv('Classification_byclade_uniques.tsv', sep='\t', index=False)