In [2]:
import polars as pl

In [3]:
def get_similar_plasmids(dataframe, min_af=0.7):
    """Return the distinct query IDs having at least one hit above an AF cutoff.

    The header-less, tab-separated table is read with polars' automatic
    ``column_N`` names. For every row, ``AF`` is ``column_4`` divided by the
    smaller of ``column_10`` and ``column_11``.
    NOTE(review): the columns look like BLAST tabular output (alignment
    length, query length, subject length) -- confirm against the command
    that produced the file.

    Parameters
    ----------
    dataframe : str or path-like
        Path of the TSV file to load (despite the name, this is not a
        DataFrame object).
    min_af : float, default 0.7
        Exclusive lower bound on ``AF``; a row is kept when ``AF > min_af``.

    Returns
    -------
    polars.DataFrame
        A single ``column_1`` column listing every qualifying ID once.
    """
    # Pick the shorter of the two lengths once, then divide by it; this is
    # equivalent to dividing inside each when/otherwise branch.
    shorter_length = (
        pl.when(pl.col("column_10") < pl.col("column_11"))
        .then(pl.col("column_10"))
        .otherwise(pl.col("column_11"))
    )
    hits = pl.read_csv(dataframe, separator="\t", has_header=False).with_columns(
        (pl.col("column_4") / shorter_length).alias("AF")
    )

    return hits.filter(pl.col("AF") > min_af).select("column_1").unique()

In [4]:
# Denominator used for both novelty fractions printed below.
soil_plasmids_rescued = 129127

# Number of distinct query IDs returned for each reference database.
similar_refsoil = get_similar_plasmids("novelty_results_blast_refsoil.tsv").height
similar_plsdb = get_similar_plasmids("novelty_results_blast_plsdb.tsv").height

print(
    f"There are {similar_refsoil} plasmids similar to the refsoil dataset with an AF > 0.7. "
    f"This means that {1 - (similar_refsoil/soil_plasmids_rescued):.2f} of our plasmids are novel."
)

print(
    f"There are {similar_plsdb} plasmids similar to the plsdb dataset with an AF > 0.7. "
    f"This means that {1 - (similar_plsdb/soil_plasmids_rescued):.2f} of our plasmids are novel."
)

There are 20246 plasmids similar to the refsoil dataset with an AF > 0.7. This means that 0.84 of our plasmids are novel.
There are 3235 plasmids similar to the plsdb dataset with an AF > 0.7. This means that 0.97 of our plasmids are novel.


In [2]:
# Load the PLSDB hit table and add AF = column_4 divided by the smaller of
# column_10 and column_11.
df_plsdb = pl.read_csv(
    "novelty_results_blast_plsdb.tsv", separator="\t", has_header=False
).with_columns(
    (
        pl.col("column_4")
        / pl.when(pl.col("column_10") < pl.col("column_11"))
        .then(pl.col("column_10"))
        .otherwise(pl.col("column_11"))
    ).alias("AF")
)

df_plsdb.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,AF
str,str,f64,i64,i64,i64,i64,i64,f64,i64,i64,f64
"""2088090014|GPI…","""NZ_LR134451.1""",100.0,30,4187,4216,216580,216609,0.000321,5584,415783,0.005372
"""2088090015|GPI…","""NZ_CP086107.1""",100.0,29,9810,9838,444246,444274,0.002,9983,668738,0.002905
"""2088090015|GPI…","""NZ_CP086107.1""",100.0,29,9810,9838,598734,598762,0.002,9983,668738,0.002905
"""2088090015|GPI…","""CP040041.1""",100.0,29,24910,24938,348230,348258,0.005,25248,561419,0.001149
"""2088090015|GPI…","""CP040041.1""",100.0,29,24910,24938,350378,350406,0.005,25248,561419,0.001149


In [3]:
# Distinct query IDs with at least one PLSDB row whose AF is <= 0.7; an ID
# listed here may still have other rows above the cutoff.
df_plsdb.filter(pl.col("AF").le(0.7)).select("column_1").unique()

column_1
str
"""3300005289|Ga0…"
"""MGYA00645605|E…"
"""3300046491|Ga0…"
"""3300049581|Ga0…"
"""3300046523|Ga0…"
"""3300046520|Ga0…"
"""3300050031|Ga0…"
"""3300049003|Ga0…"
"""3300044739|Ga0…"
"""3300044814|Ga0…"


In [4]:
# Load the combined hit table and add AF = column_4 divided by the smaller
# of column_10 and column_11.
df = pl.read_csv(
    "novelty_results_blast.tsv", separator="\t", has_header=False
).with_columns(
    (
        pl.col("column_4")
        / pl.when(pl.col("column_10") < pl.col("column_11"))
        .then(pl.col("column_10"))
        .otherwise(pl.col("column_11"))
    ).alias("AF")
)
df.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,AF
str,str,f64,i64,i64,i64,i64,i64,f64,i64,i64,f64
"""2088090014|GPI…","""NZ_LR134451.1""",100.0,30,4187,4216,216580,216609,0.001,5584,415783,0.005372
"""2088090014|GPI…","""IMGPR_plasmid_…",84.821,112,6238,6346,127,18,4.93e-19,8233,99508,0.013604
"""2088090014|GPI…","""IMGPR_plasmid_…",93.056,72,6234,6305,31354,31425,1.77e-18,8233,54171,0.008745
"""2088090014|GPI…","""IMGPR_plasmid_…",93.056,72,6234,6305,20398,20327,1.77e-18,8233,51491,0.008745
"""2088090014|GPI…","""IMGPR_plasmid_…",100.0,56,6230,6285,12140,12195,6.37e-18,8233,57820,0.006802


In [6]:
# Keep rows with column_3 >= 70 and AF >= 0.7, then tag each row with the
# database its column_2 ID belongs to: "IMG" when the ID contains "IMGPR",
# "PLSDB" otherwise.
# NOTE(review): column_3 looks like percent identity and column_2 like the
# subject ID -- confirm against the BLAST output format used.
#
# The tag is built with a native polars expression instead of a per-row
# Python lambda passed to `map_elements`, which runs in the interpreter for
# every row and relies on return-dtype inference.
# NOTE(review): a null column_2 would now be tagged "PLSDB" rather than left
# null -- assumed not to occur in this table; confirm.
df_filtered_70 = df.filter(
    (pl.col("column_3") >= 70) & (pl.col("AF") >= 0.7)
).with_columns(
    pl.when(pl.col("column_2").str.contains("IMGPR", literal=True))
    .then(pl.lit("IMG"))
    .otherwise(pl.lit("PLSDB"))
    .alias("Origin")
)
df_filtered_70.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,AF,Origin
str,str,f64,i64,i64,i64,i64,i64,f64,i64,i64,f64,str
"""2088090014|GPI…","""IMGPR_plasmid_…",77.75,4346,725,5016,24950,20640,0.0,5097,33391,0.852658,"""IMG"""
"""2088090015|GPI…","""IMGPR_plasmid_…",100.0,10776,68,10843,10776,1,0.0,10843,25248,0.993821,"""IMG"""
"""2088090015|GPI…","""IMGPR_plasmid_…",100.0,9667,1,9667,9667,1,0.0,9983,9738,0.992709,"""IMG"""
"""2088090015|GPI…","""IMGPR_plasmid_…",100.0,9667,1,9667,97,9763,0.0,9983,9763,0.990167,"""IMG"""
"""2088090015|GPI…","""IMGPR_plasmid_…",100.0,9763,1,9763,1,9763,0.0,9763,9763,1.0,"""IMG"""


In [7]:
# Distinct query IDs whose retained rows include at least one "PLSDB" origin.
df_filtered_70.filter(pl.col("Origin").eq("PLSDB")).select("column_1").unique()

column_1
str
"""3300041407|Ga0…"
"""3300047472|Ga0…"
"""3300025728|Ga0…"
"""3300009011|Ga0…"
"""3300041411|Ga0…"
"""3300049823|Ga0…"
"""3300049571|Ga0…"
"""3300011119|Ga0…"
"""3300050221|Ga0…"
"""3300025728|Ga0…"


In [8]:
# Total plasmid count used as the denominator when the cells below report how
# many plasmids are "novel" (same value as `soil_plasmids_rescued` above).
# NOTE(review): presumably the number of geNomad-predicted plasmids -- confirm.
genomad_plasmids = 129_127

In [9]:
# Each count is the total minus the number of distinct query IDs that still
# have a row in df_filtered_70 (optionally restricted to one origin).
print(
    f'From the {genomad_plasmids} plasmids, there are '
    f'{genomad_plasmids - df_filtered_70.select("column_1").unique().height} '
    f'"novel" plasmids when considering IMGPR + PLSDB'
)

print(
    f'From the {genomad_plasmids} plasmids, there are '
    f'{genomad_plasmids - df_filtered_70.filter(pl.col("Origin") == "PLSDB").select("column_1").unique().height} '
    f'"novel" plasmids when considering PLSDB'
)

print(
    f'From the {genomad_plasmids} plasmids, there are '
    f'{genomad_plasmids - df_filtered_70.filter(pl.col("Origin") == "IMG").select("column_1").unique().height} '
    f'"novel" plasmids when considering IMGPR'
)

From the 129127 plasmids, there are 59609 "novel" plasmids when considering IMGPR + PLSDB
From the 129127 plasmids, there are 123626 "novel" plasmids when considering PLSDB
From the 129127 plasmids, there are 59821 "novel" plasmids when considering IMGPR


In [17]:
def complete_unique(filter, which_data):
    """Print how many IDs listed in ``complete.txt`` have no matching hit.

    Reads ``complete.txt`` (one ID per line, surrounding whitespace
    stripped), applies ``filter`` to the notebook-level ``df_filtered_70``
    frame, and counts the distinct ``column_1`` values of the remaining rows
    that appear in the file. The printed "novel" figure is the number of
    lines in the file minus that count.

    Parameters
    ----------
    filter : polars.Expr
        Predicate passed to ``df_filtered_70.filter``. (The name shadows the
        built-in ``filter``; it is kept so existing calls keep working.)
    which_data : str
        Label inserted at the end of the printed sentence.

    Returns
    -------
    None
        The result is only printed.
    """
    with open("complete.txt") as f:
        complete = [line.strip() for line in f]

    # Set lookup is O(1) per contig; the list is kept only for its length so
    # the printed totals stay the same even if the file repeats an ID.
    complete_ids = set(complete)

    matched_contigs = (
        df_filtered_70.filter(filter).get_column("column_1").unique().to_list()
    )
    num = sum(1 for contig in matched_contigs if contig in complete_ids)

    print(
        f"From the {len(complete)} DTR plasmids, {len(complete) - num} are 'novel' considering {which_data}"
    )


# Report novelty among the IDs in complete.txt for both databases together
# and for each one separately; all three share the same AF predicate.
af_at_least_70 = pl.col("AF") >= 0.7
complete_unique(af_at_least_70, "IMGPR + PLSDB")
complete_unique(af_at_least_70 & (pl.col("Origin") == "PLSDB"), "PLSDB")
complete_unique(af_at_least_70 & (pl.col("Origin") == "IMG"), "IMGPR")

From the 2614 DTR plasmids, 1592 are 'novel' considering IMGPR + PLSDB
From the 2614 DTR plasmids, 2566 are 'novel' considering PLSDB
From the 2614 DTR plasmids, 1594 are 'novel' considering IMGPR


In [11]:
# Show the combined hit table `df` (including its derived AF column) as the
# cell output.
df

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,AF
str,str,f64,i64,i64,i64,i64,i64,f64,i64,i64,f64
"""2088090014|GPI…","""NZ_LR134451.1""",100.0,30,4187,4216,216580,216609,0.001,5584,415783,0.005372
"""2088090014|GPI…","""IMGPR_plasmid_…",84.821,112,6238,6346,127,18,4.9300e-19,8233,99508,0.013604
"""2088090014|GPI…","""IMGPR_plasmid_…",93.056,72,6234,6305,31354,31425,1.7700e-18,8233,54171,0.008745
"""2088090014|GPI…","""IMGPR_plasmid_…",93.056,72,6234,6305,20398,20327,1.7700e-18,8233,51491,0.008745
"""2088090014|GPI…","""IMGPR_plasmid_…",100.0,56,6230,6285,12140,12195,6.3700e-18,8233,57820,0.006802
"""2088090014|GPI…","""IMGPR_plasmid_…",89.024,82,6230,6310,20048,19968,8.2400e-17,8233,20080,0.00996
"""2088090014|GPI…","""IMGPR_plasmid_…",91.667,72,6234,6305,19004,18933,8.2400e-17,8233,54390,0.008745
"""2088090014|GPI…","""IMGPR_plasmid_…",98.214,56,6230,6285,10553,10498,2.9700e-16,8233,17933,0.006802
"""2088090014|GPI…","""IMGPR_plasmid_…",98.214,56,6230,6285,1222,1167,2.9700e-16,8233,28560,0.006802
"""2088090014|GPI…","""IMGPR_plasmid_…",100.0,48,6238,6285,8840,8887,1.7800e-13,8233,9359,0.00583


In [18]:
# Write every distinct column_1 ID that has a row in df_filtered_70, one per
# line, to non_novel_plasmids.txt.
non_novel_ids = df_filtered_70["column_1"].unique()
with open("non_novel_plasmids.txt", "w") as handle:
    handle.writelines(contig + "\n" for contig in non_novel_ids)