In [32]:
import ast

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import geopandas as gpd
from upsetplot import UpSet
from upsetplot import from_indicators, from_memberships
from matplotlib_venn import venn2

sns.set_style("whitegrid", {"grid.linestyle": "dotted"})
from collections import defaultdict

In [33]:
# with open("../ptu/all_rescued_ids.txt") as f:
#     rescued_plasmids = [i.strip() for i in f]

# with open("complete_plasmids.txt") as f:
#     complete_meta = [i.strip() for i in f]

# with open("../ptu/soil_isolates_complete.txt") as f:
#     complete_isolates = [i.strip().split("\t")[0] for i in f]

In [34]:
def _parse_plasmid_id(plasmid_id):
    """Map one plasmid identifier to the oid it is counted under.

    Args:
        plasmid_id: Identifier with surrounding whitespace already stripped.

    Returns:
        Tuple ``(oid, is_isolate)``. ``is_isolate`` is True for identifiers
        starting with "IMGPR", "PLSDB" or "Refsoil"; those are the ones that
        are also recorded in ``isolate_oids``.
    """
    if plasmid_id.startswith("IMGPR"):
        # The oid is the third underscore-separated field.
        return plasmid_id.split("_")[2], True
    if plasmid_id.startswith(("PLSDB", "Refsoil")):
        # These identifiers are used whole as the oid.
        return plasmid_id, True
    # Everything else: the oid is the text before the first "|".
    return plasmid_id.split("|")[0], False


# plasmid_count: number of plasmids per oid.
# plasmid_oids:  oid of every plasmid (one entry per plasmid, so with repeats).
# isolate_oids:  oid of every plasmid flagged as isolate by _parse_plasmid_id.
plasmid_count = defaultdict(int)
plasmid_oids = []
isolate_oids = []

with open("../novelty/all_soil_plasmids_final_ids.txt") as f:
    for line in f:
        plasmid_id = line.strip()
        if not plasmid_id:
            # Blank lines are skipped so they are not counted as a plasmid
            # with an empty oid.
            continue
        oid, is_isolate = _parse_plasmid_id(plasmid_id)
        plasmid_oids.append(oid)
        if is_isolate:
            isolate_oids.append(oid)
        # Count in the same pass: the oid is already normalised here, so no
        # second prefix-parsing loop over plasmid_oids is needed.
        plasmid_count[oid] += 1

In [35]:
# Summary figures: total plasmids, distinct oids, and how the distinct oids
# split between isolates and everything else.
n_plasmids = sum(plasmid_count.values())
n_oids = len(set(plasmid_oids))
n_isolate_oids = len(set(isolate_oids))
print(
    f" There are {n_plasmids} total plasmids, from {n_oids} oids, where {n_isolate_oids} are isolates and {n_oids - n_isolate_oids} are from meta studies"
)

 There are 107538 total plasmids, from 7185 oids, where 3875 are isolates and 3310 are from meta studies


In [36]:
df_img_final = pd.read_csv("../exportdata.tsv", sep="\t", index_col=0)
df_img_final.index = df_img_final.index.astype("str")
# Label every IMG record in one vectorised pass: oids present in isolate_oids
# are "Isolate", everything else is "Meta". This replaces a row-by-row .loc
# write that scanned the isolate_oids list once per row.
df_img_final["Origin"] = np.where(
    df_img_final.index.isin(set(isolate_oids)), "Isolate", "Meta"
)

df_mgnify = pd.read_csv("../env_corr/MGnify_soil_final_metadata.csv")
# Keep only the analyses whose id is a key of plasmid_count.
df_mgnify = df_mgnify[df_mgnify["id_analyses"].isin(list(plasmid_count))]
# Country = text before the first ";" of the geo-loc-name attribute;
# non-string values (e.g. NaN) pass through unchanged.
df_mgnify["Isolation Country"] = df_mgnify["attributes.geo-loc-name"].apply(
    lambda x: x.split(";")[0] if isinstance(x, str) else x
)
df_mgnify["Origin"] = "Meta"

In [37]:
# MGnify columns needed for the map, renamed by name (not by position) to the
# column names used for the IMG table. rename() returns a new frame, so the
# assignment below no longer writes into a slice of df_mgnify, which is what
# triggered the SettingWithCopyWarning.
df_mgnify_for_map = df_mgnify[
    [
        "attributes.accession_analyses",
        "relationships.biome.data.id",
        "attributes.longitude",
        "attributes.latitude",
        "Origin",
        "Isolation Country",
    ]
].rename(
    columns={
        "attributes.accession_analyses": "taxon_oid",
        "relationships.biome.data.id": "Ecosystem Subtype",
        "attributes.longitude": "Longitude",
        "attributes.latitude": "Latitude",
    }
)
# Keep only the last ":"-separated element of the biome id. The .str accessor
# leaves missing values as NaN instead of raising on them.
df_mgnify_for_map["Ecosystem Subtype"] = (
    df_mgnify_for_map["Ecosystem Subtype"].str.split(":").str[-1]
)

df_img_for_map = df_img_final.reset_index()[
    [
        "taxon_oid",
        "Ecosystem Subtype",
        "Specific Ecosystem",
        "Latitude",
        "Longitude",
        "Origin",
        "Isolation Country",
    ]
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mgnify_for_map["Ecosystem Subtype"] = df_mgnify_for_map["Ecosystem Subtype"].apply(


In [38]:
def fix_ecosystem(row):
    """Pick the ecosystem label to keep for one record.

    Returns the "Specific Ecosystem" value when "Ecosystem Subtype" is
    "Unclassified" and "Specific Ecosystem" is anything else; in every other
    case the "Ecosystem Subtype" value is returned unchanged.
    """
    subtype = row["Ecosystem Subtype"]
    if subtype != "Unclassified":
        return subtype
    specific = row["Specific Ecosystem"]
    if specific == "Unclassified":
        return subtype
    return specific


def _merge_rainforest_label(value):
    """Rewrite "Tropical rainforest" as "Tropical forest"; nulls pass through."""
    if pd.isnull(value):
        return value
    return value.replace("Tropical rainforest", "Tropical forest")


# Harmonise the rainforest wording first, so that the fallback applied next
# copies the already-normalised label.
df_img_for_map["Specific Ecosystem"] = df_img_for_map["Specific Ecosystem"].map(
    _merge_rainforest_label
)

df_img_for_map["Ecosystem Subtype"] = df_img_for_map.apply(fix_ecosystem, axis=1)

# "Specific Ecosystem" has been folded into "Ecosystem Subtype"; drop it.
df_img_for_map = df_img_for_map.drop(columns="Specific Ecosystem")

In [39]:
df_plsdb = pd.read_csv("../novelty/plsdb_all_soil_plasmids.tsv", sep="\t")
# Country = text before the first ":" of BIOSAMPLE_Location; non-string values
# pass through unchanged.
df_plsdb["Isolation Country"] = df_plsdb["BIOSAMPLE_Location"].map(
    lambda location: location.split(":")[0] if isinstance(location, str) else location
)
# Keep the map columns and give them the same names as the other sources.
df_plsdb = df_plsdb[
    ["NUCCORE_ACC", "loc_lat", "loc_lng", "Specific Ecosystem", "Isolation Country"]
].rename(
    columns={
        "NUCCORE_ACC": "taxon_oid",
        "loc_lat": "Latitude",
        "loc_lng": "Longitude",
        "Specific Ecosystem": "Ecosystem Subtype",
    }
)
df_plsdb["Origin"] = "Isolate"
# Prefix the accession so it matches the "PLSDB_..." ids used as plasmid_count keys.
df_plsdb["taxon_oid"] = ["PLSDB_" + accession for accession in df_plsdb["taxon_oid"]]
df_plsdb.head()

Unnamed: 0,taxon_oid,Latitude,Longitude,Ecosystem Subtype,Isolation Country,Origin
0,PLSDB_NZ_CP098484.1,-8.361371,-40.13418,Unclassified,Brazil,Isolate
1,PLSDB_NZ_CP026110.1,50.9943,3.2667,Unclassified,Belgium,Isolate
2,PLSDB_NZ_CP030938.1,29.645398,91.03431,Unclassified,China,Isolate
3,PLSDB_CP099978.1,30.202497,115.033414,Unclassified,China,Isolate
4,PLSDB_CP101282.1,30.392538,114.889899,Unclassified,China,Isolate


In [40]:
def _parse_lat_lon(value):
    """Split a "Latitude/Longitude" entry into signed decimal degrees.

    The value is split on single spaces; token 0 is the latitude and token 2
    the longitude. Tokens 1 and 3 are treated as hemisphere letters: "S" makes
    the latitude negative and "W" makes the longitude negative. Any other
    token leaves the number as written.
    NOTE(review): this assumes the "<lat> <N|S> <lon> <E|W>" layout suggested
    by reading tokens 0 and 2 — confirm against the metadata file.

    Args:
        value: Raw cell value; "Unknown" and non-string values (e.g. NaN)
            are treated as missing.

    Returns:
        Tuple ``(latitude, longitude)`` of floats, ``(nan, nan)`` when missing.
    """
    if not isinstance(value, str) or value == "Unknown":
        return np.nan, np.nan
    tokens = value.split(" ")
    latitude = float(tokens[0])
    longitude = float(tokens[2])
    if tokens[1].strip(",").upper() == "S":
        latitude = -abs(latitude)
    if len(tokens) > 3 and tokens[3].strip(",").upper() == "W":
        longitude = -abs(longitude)
    return latitude, longitude


df_refsoil = pd.read_csv("../novelty/refsoil_efetch_metadata.tsv", sep="\t")
df_refsoil["taxon_oid"] = df_refsoil["taxon_oid"].apply(lambda x: "Refsoil_" + x)

# Parse each coordinate string once and honour the hemisphere letters, which
# were previously ignored (southern / western sites kept a positive sign).
# NOTE(review): tables derived from the old unsigned coordinates may need to
# be regenerated if any RefSoil entry is affected.
refsoil_coordinates = [
    _parse_lat_lon(value) for value in df_refsoil["Latitude/Longitude"]
]
df_refsoil["Latitude"] = [latitude for latitude, _ in refsoil_coordinates]
df_refsoil["Longitude"] = [longitude for _, longitude in refsoil_coordinates]

# Country = text before the first ":"; "Unknown" and non-string values become NaN.
df_refsoil["Isolation Country"] = df_refsoil["Isolation Country"].apply(
    lambda x: x.split(":")[0] if isinstance(x, str) and x != "Unknown" else np.nan
)
df_refsoil = df_refsoil.drop(columns="Latitude/Longitude")
df_refsoil["Origin"] = "Isolate"
df_refsoil["Ecosystem Subtype Custom"] = np.nan
df_refsoil.head()

Unnamed: 0,taxon_oid,Isolation Country,Latitude,Longitude,Origin,Ecosystem Subtype Custom
0,Refsoil_NC_000959.1,,,,Isolate,
1,Refsoil_NC_000958.1,,,,Isolate,
2,Refsoil_NC_002489.3,,,,Isolate,
3,Refsoil_NC_002490.1,,,,Isolate,
4,Refsoil_NC_002679.1,,,,Isolate,


In [41]:
def _strip_source_prefix(oid):
    """Return ``oid`` without a leading "Refsoil_" or "PLSDB_" prefix."""
    for prefix in ("Refsoil_", "PLSDB_"):
        if oid.startswith(prefix):
            return oid[len(prefix):]
    return oid


# Stack the four sources; ignore_index gives the combined frame unique row
# labels instead of four overlapping 0..n ranges.
df_for_map = pd.concat(
    [df_img_for_map, df_mgnify_for_map, df_plsdb, df_refsoil], ignore_index=True
)

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to modify df_for_map itself.
df_for_map["Ecosystem Subtype"] = df_for_map["Ecosystem Subtype"].fillna(
    "Unclassified"
)

# remove Rhizoplane which I don't know why are appearing again
# df_for_map = df_for_map[df_for_map["Ecosystem Subtype"] != "Rhizoplane"]

# Remove ids which are the same for refsoil and plsdb: compare the ids with
# their source prefix removed and keep the first occurrence of each.
original_oid = df_for_map["taxon_oid"].map(_strip_source_prefix)
df_for_map = df_for_map[~original_oid.duplicated()].copy()
df_for_map.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom
0,3300049023,Grasslands,38.5326,-121.783,Meta,USA,
1,3300012840,Grasslands,43.073,-89.4011,Meta,USA,
2,3300039503,Unclassified,63.88306,-149.22556,Meta,USA,
3,3300042005,Rhizosphere,41.201,-97.9447,Meta,USA,
4,3300049265,Agricultural land,38.546389,-121.874444,Meta,USA,


In [42]:
def add_plasmid_count(row):
    """Look up how many plasmids were recorded for this row's ``taxon_oid``.

    Returns the matching entry of the notebook-level ``plasmid_count``
    mapping, or 0 when the oid has no entry.
    """
    return plasmid_count.get(row["taxon_oid"], 0)


df_for_map["Plasmid Count"] = df_for_map.apply(add_plasmid_count, axis=1)

# Ecosystem subtypes with MIN_SAMPLES_PER_SUBTYPE rows or fewer are pooled
# into "Other".
# NOTE(review): the previous comment here read "Cut from Desert (66 samples)
# down as others", which does not obviously match a threshold of 170 — confirm
# which one is intended.
MIN_SAMPLES_PER_SUBTYPE = 170
# Count each subtype once up front instead of recomputing value_counts() for
# every row.
subtype_counts = df_for_map["Ecosystem Subtype"].value_counts()
is_frequent_subtype = (
    df_for_map["Ecosystem Subtype"].map(subtype_counts) > MIN_SAMPLES_PER_SUBTYPE
)
df_for_map["Ecosystem Subtype Custom"] = df_for_map["Ecosystem Subtype"].where(
    is_frequent_subtype, "Other"
)

df_for_map.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom,Plasmid Count
0,3300049023,Grasslands,38.5326,-121.783,Meta,USA,Grasslands,36
1,3300012840,Grasslands,43.073,-89.4011,Meta,USA,Grasslands,21
2,3300039503,Unclassified,63.88306,-149.22556,Meta,USA,Unclassified,3
3,3300042005,Rhizosphere,41.201,-97.9447,Meta,USA,Rhizosphere,6
4,3300049265,Agricultural land,38.546389,-121.874444,Meta,USA,Agricultural land,1


In [43]:
# Peek at the distinct latitude values (not yet rounded at this point).
pd.unique(df_for_map["Latitude"])

array([38.5326   , 43.073    , 63.88306  , ..., 37.128333 , 47.682726 ,
       32.7265177])

In [44]:
# Round both coordinate columns to two decimal places.

for coordinate_column in ("Latitude", "Longitude"):
    df_for_map[coordinate_column] = df_for_map[coordinate_column].map(
        lambda value: round(value, 2)
    )

In [45]:
# This table was exported to be used as input for fetching soil classes from soilgrids. Also, while running the soilgrids script, some latitudes and longitudes were fixed manually (they were inverted when the researcher added them)

# df_for_map.dropna(subset=["Latitude", "Longitude"]).to_csv(
#     "temp_df_for_map.csv", index=None
# )

df_soil_classes = pd.read_csv("../novelty/soil_grid_res.tsv", sep="\t", index_col=0)
# Restrict the lookup table to the oids that are present in df_for_map.
known_oids = df_for_map["taxon_oid"].values
df_soil_classes = df_soil_classes.loc[df_soil_classes.index.isin(known_oids)]

df_soil_classes.head()

Unnamed: 0_level_0,soil_class
taxon_oid,Unnamed: 1_level_1
3300044459,Luvisols
3300046473,Cambisols
3300027907,Luvisols
3300052899,Cambisols
3300046483,Cambisols


In [46]:
# Left-join the soil class table onto df_for_map, matching the "taxon_oid"
# column against the index of df_soil_classes. Rows without a match get NaN.
# Re-running this cell on an already-joined frame raises, because the joined
# column name would then overlap with an existing one.
df_for_map = df_for_map.join(df_soil_classes, on="taxon_oid")

In [47]:
# add_plasmid_count is defined once, in an earlier cell; the byte-identical
# second definition that used to be repeated here has been dropped.
# NOTE(review): this cell recomputes the same two columns as that earlier
# cell. Nothing visible in between changes "taxon_oid" or "Ecosystem Subtype",
# so the whole cell may be redundant — confirm before removing it.
df_for_map["Plasmid Count"] = df_for_map.apply(add_plasmid_count, axis=1)

# Ecosystem subtypes with MIN_SAMPLES_PER_SUBTYPE rows or fewer are pooled
# into "Other". The counts are computed once rather than once per row.
MIN_SAMPLES_PER_SUBTYPE = 170
subtype_counts = df_for_map["Ecosystem Subtype"].value_counts()
is_frequent_subtype = (
    df_for_map["Ecosystem Subtype"].map(subtype_counts) > MIN_SAMPLES_PER_SUBTYPE
)
df_for_map["Ecosystem Subtype Custom"] = df_for_map["Ecosystem Subtype"].where(
    is_frequent_subtype, "Other"
)

df_for_map.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom,Plasmid Count,soil_class
0,3300049023,Grasslands,38.53,-121.78,Meta,USA,Grasslands,36,Luvisols
1,3300012840,Grasslands,43.07,-89.4,Meta,USA,Grasslands,21,Luvisols
2,3300039503,Unclassified,63.88,-149.23,Meta,USA,Unclassified,3,Cambisols
3,3300042005,Rhizosphere,41.2,-97.94,Meta,USA,Rhizosphere,6,Kastanozems
4,3300049265,Agricultural land,38.55,-121.87,Meta,USA,Agricultural land,1,Vertisols


In [48]:
# Each line is "<oid>\t<Python literal>"; the literal is parsed with
# ast.literal_eval (ast is imported with the other imports at the top).
soil_atts_dict = {}
with open("../novelty/soil_grid_results.tsv") as f:
    for line in f:
        record = line.strip()
        if not record:
            # Skip blank lines so they do not become a row with an empty oid.
            continue
        fields = record.split("\t")
        oid = fields[0]
        try:
            soil_atts_dict[oid] = ast.literal_eval(fields[1])
        except (IndexError, ValueError, TypeError, SyntaxError):
            # No second column, or a payload that is not a valid Python
            # literal: keep the oid, with missing attributes.
            soil_atts_dict[oid] = np.nan

# One row per oid, one column per attribute.
df_soil_atts = pd.DataFrame(soil_atts_dict).T

df_soil_atts.head()

Unnamed: 0,bdod (cg/cm³),cec (mmol(c)/kg),cfvo (cm³/dm³),clay (g/kg),nitrogen (cg/kg),ocd (dg/dm³),phh2o (pH*10),sand (g/kg),silt (g/kg),soc (dg/kg)
3300049023,152.0,252.0,4.0,267.0,395.0,332.0,69.0,171.0,562.0,335.0
3300012840,,,,,,,,,,
3300039503,60.0,541.0,112.0,137.0,1084.0,670.0,49.0,367.0,496.0,2409.0
3300042005,135.0,182.0,3.0,163.0,299.0,377.0,63.0,594.0,242.0,301.0
3300049265,158.0,235.0,8.0,297.0,253.0,330.0,69.0,195.0,508.0,274.0


In [49]:
# Left-join the soil attribute table onto df_for_map, matching the "taxon_oid"
# column against the index of df_soil_atts. Rows without a match get NaN in
# the attribute columns. Re-running this cell on an already-joined frame
# raises, because the joined column names would then overlap.
df_for_map = df_for_map.join(df_soil_atts, on="taxon_oid")
df_for_map.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom,Plasmid Count,soil_class,bdod (cg/cm³),cec (mmol(c)/kg),cfvo (cm³/dm³),clay (g/kg),nitrogen (cg/kg),ocd (dg/dm³),phh2o (pH*10),sand (g/kg),silt (g/kg),soc (dg/kg)
0,3300049023,Grasslands,38.53,-121.78,Meta,USA,Grasslands,36,Luvisols,152.0,252.0,4.0,267.0,395.0,332.0,69.0,171.0,562.0,335.0
1,3300012840,Grasslands,43.07,-89.4,Meta,USA,Grasslands,21,Luvisols,,,,,,,,,,
2,3300039503,Unclassified,63.88,-149.23,Meta,USA,Unclassified,3,Cambisols,60.0,541.0,112.0,137.0,1084.0,670.0,49.0,367.0,496.0,2409.0
3,3300042005,Rhizosphere,41.2,-97.94,Meta,USA,Rhizosphere,6,Kastanozems,135.0,182.0,3.0,163.0,299.0,377.0,63.0,594.0,242.0,301.0
4,3300049265,Agricultural land,38.55,-121.87,Meta,USA,Agricultural land,1,Vertisols,158.0,235.0,8.0,297.0,253.0,330.0,69.0,195.0,508.0,274.0


In [50]:
# ptus = pd.read_csv(
#     "../ptu/derep/derep_plasmids_clusters_with_ptu_ids.tsv",
#     sep="\t",
#     header=None,
# )
# ptus["Length"] = ptus[2].apply(lambda x: len(x.split(",")))
# ptus.head()

In [51]:
# oid_to_environment = pd.Series(
#     df_for_map["Ecosystem Subtype Custom"].values, index=df_for_map["taxon_oid"]
# ).to_dict()


# def get_oids(row):
#     splitted = row[2].split(",")
#     return ",".join(
#         [i.split("_")[2] if "IMGPR" in i else i.split("|")[0] for i in splitted]
#     )


# def get_environment(row):
#     oids = row["oids"].split(",")
#     environments = [oid_to_environment.get(oid, "") for oid in oids]
#     # Filter out empty strings if an oid was not found in the dictionary
#     environments = [env for env in environments if env]
#     return ",".join(environments)


# ptus["oids"] = ptus.apply(get_oids, axis=1)
# ptus["Ecosystem Subtype Custom"] = ptus.apply(get_environment, axis=1)
# ptus = ptus.set_index(0)
# ptus.head()

In [52]:
# df_for_map = df_for_map.set_index("taxon_oid")

# oid_dict = defaultdict(set)

# for ptu in ptus.index:
#     for oid in ptus.loc[ptu, "oids"].split(","):
#         oid_dict[oid].add(ptu)

In [53]:
# for oid in oid_dict:
#     if oid in df_for_map.index:
#         df_for_map.loc[oid, "PTU Count"] = len(oid_dict[oid])

# df_for_map["PTU Count"].fillna(0, inplace=True)


# Missing soil classes, and values containing the text "error", are both
# reported as "Unknown". The result is assigned back instead of calling
# fillna(inplace=True) on a column selection, which is not guaranteed to
# modify df_for_map itself.
df_for_map["soil_class"] = (
    df_for_map["soil_class"]
    .fillna("Unknown")
    .map(lambda soil_class: "Unknown" if "error" in soil_class else soil_class)
)
df_for_map.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom,Plasmid Count,soil_class,bdod (cg/cm³),cec (mmol(c)/kg),cfvo (cm³/dm³),clay (g/kg),nitrogen (cg/kg),ocd (dg/dm³),phh2o (pH*10),sand (g/kg),silt (g/kg),soc (dg/kg)
0,3300049023,Grasslands,38.53,-121.78,Meta,USA,Grasslands,36,Luvisols,152.0,252.0,4.0,267.0,395.0,332.0,69.0,171.0,562.0,335.0
1,3300012840,Grasslands,43.07,-89.4,Meta,USA,Grasslands,21,Luvisols,,,,,,,,,,
2,3300039503,Unclassified,63.88,-149.23,Meta,USA,Unclassified,3,Cambisols,60.0,541.0,112.0,137.0,1084.0,670.0,49.0,367.0,496.0,2409.0
3,3300042005,Rhizosphere,41.2,-97.94,Meta,USA,Rhizosphere,6,Kastanozems,135.0,182.0,3.0,163.0,299.0,377.0,63.0,594.0,242.0,301.0
4,3300049265,Agricultural land,38.55,-121.87,Meta,USA,Agricultural land,1,Vertisols,158.0,235.0,8.0,297.0,253.0,330.0,69.0,195.0,508.0,274.0


In [54]:
# df_for_map.to_csv("../hmmsearch_outputs_5kb/soil_plasmids_map_before_derep.tsv", sep="\t")

In [55]:
#df_for_map = df_for_map[df_for_map["PTU Count"] > 0]

In [56]:
# Load the ecoregion polygons used for the spatial join further down.
ecoregions_path = "../terr-ecoregions-TNC/tnc_terr_ecoregions.shp"
shapefile = gpd.read_file(ecoregions_path)
shapefile.head()

Unnamed: 0,ECO_ID_U,ECO_CODE,ECO_NAME,ECO_NUM,ECODE_NAME,CLS_CODE,ECO_NOTES,WWF_REALM,WWF_REALM2,WWF_MHTNUM,WWF_MHTNAM,RealmMHT,ER_UPDATE,ER_DATE_U,ER_RATION,SOURCEDATA,geometry
0,10000,AA0101,Admiralty Islands Lowland Rain Forests,1,AA0101. Admiralty Islands lowland rain forests,0,,AA,Australasia,1,Tropical and Subtropical Moist Broadleaf Forests,AA1,,,,"Olson, 2001","MULTIPOLYGON (((147.28819 -2.57608, 147.27150 ..."
1,10001,AA0102,Banda Sea Islands Moist Deciduous Forests,2,AA0102. Banda Sea Islands moist deciduous forests,0,,AA,Australasia,1,Tropical and Subtropical Moist Broadleaf Forests,AA1,,,,"Olson, 2001","MULTIPOLYGON (((128.22510 -8.21748, 128.22380 ..."
2,10002,AA0103,Biak-Numfoor Rain Forests,3,AA0103. Biak-Numfoor rain forests,0,,AA,Australasia,1,Tropical and Subtropical Moist Broadleaf Forests,AA1,,,,"Olson, 2001","MULTIPOLYGON (((136.21548 -1.27113, 136.20557 ..."
3,10003,AA0104,Buru Rain Forests,4,AA0104. Buru rain forests,0,,AA,Australasia,1,Tropical and Subtropical Moist Broadleaf Forests,AA1,,,,"Olson, 2001","MULTIPOLYGON (((127.21510 -3.82641, 127.22780 ..."
4,10004,AA0105,Central Range Montane Rain Forests,5,AA0105. Central Range montane rain forests,0,,AA,Australasia,1,Tropical and Subtropical Moist Broadleaf Forests,AA1,,,,"Olson, 2001","POLYGON ((137.02867 -2.99278, 137.05413 -3.077..."


In [57]:
# Group the shapefile rows by their WWF_MHTNAM value and collect each group's
# geometries into a list.
biomes = shapefile.groupby("WWF_MHTNAM").agg({"geometry": list})
biomes.head()

Unnamed: 0_level_0,geometry
WWF_MHTNAM,Unnamed: 1_level_1
Boreal Forests/Taiga,[MULTIPOLYGON (((107.27309998600003 53.1108099...
Deserts and Xeric Shrublands,[MULTIPOLYGON (((40.387899943000036 -22.352270...
Flooded Grasslands and Savannas,[MULTIPOLYGON (((35.26729991900004 -6.23140008...
Inland Water,[MULTIPOLYGON (((-69.69894002799992 -15.240599...
Mangroves,[MULTIPOLYGON (((149.8652000190001 -10.4888499...


In [58]:
# Turn every record into a point (x = Longitude, y = Latitude) in EPSG:4326.
sample_points = gpd.points_from_xy(df_for_map["Longitude"], df_for_map["Latitude"])
gdf = gpd.GeoDataFrame(df_for_map, geometry=sample_points, crs="EPSG:4326")

gdf.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom,Plasmid Count,soil_class,bdod (cg/cm³),cec (mmol(c)/kg),cfvo (cm³/dm³),clay (g/kg),nitrogen (cg/kg),ocd (dg/dm³),phh2o (pH*10),sand (g/kg),silt (g/kg),soc (dg/kg),geometry
0,3300049023,Grasslands,38.53,-121.78,Meta,USA,Grasslands,36,Luvisols,152.0,252.0,4.0,267.0,395.0,332.0,69.0,171.0,562.0,335.0,POINT (-121.78000 38.53000)
1,3300012840,Grasslands,43.07,-89.4,Meta,USA,Grasslands,21,Luvisols,,,,,,,,,,,POINT (-89.40000 43.07000)
2,3300039503,Unclassified,63.88,-149.23,Meta,USA,Unclassified,3,Cambisols,60.0,541.0,112.0,137.0,1084.0,670.0,49.0,367.0,496.0,2409.0,POINT (-149.23000 63.88000)
3,3300042005,Rhizosphere,41.2,-97.94,Meta,USA,Rhizosphere,6,Kastanozems,135.0,182.0,3.0,163.0,299.0,377.0,63.0,594.0,242.0,301.0,POINT (-97.94000 41.20000)
4,3300049265,Agricultural land,38.55,-121.87,Meta,USA,Agricultural land,1,Vertisols,158.0,235.0,8.0,297.0,253.0,330.0,69.0,195.0,508.0,274.0,POINT (-121.87000 38.55000)


In [59]:
# Attach the ecoregion attributes of the polygon each point falls within.
# how="left" keeps every point; points that match no polygon get NaN in the
# ecoregion columns.
ecoregion_columns = [
    "ECO_NAME",
    "WWF_REALM",
    "RealmMHT",
    "WWF_REALM2",
    "WWF_MHTNUM",
    "WWF_MHTNAM",
    "geometry",
]
gdf = gpd.sjoin(
    gdf,
    shapefile[ecoregion_columns],
    how="left",
    # `predicate` replaces the deprecated `op` keyword of sjoin.
    predicate="within",
)

  if await self.run_code(code, result, async_=asy):


In [60]:
# Label missing ecoregion values (e.g. points that matched no polygon in the
# left join) as "Unknown".
for ecoregion_column in ("ECO_NAME", "WWF_REALM2", "WWF_MHTNAM"):
    gdf[ecoregion_column] = gdf[ecoregion_column].fillna("Unknown")

In [61]:
# Index the table by taxon_oid and keep only the rows whose oid appears in
# plasmid_oids or isolate_oids.
gdf = gdf.set_index("taxon_oid")
wanted_oids = set(plasmid_oids) | set(isolate_oids)
gdf = gdf.loc[gdf.index.isin(wanted_oids)]

In [62]:
# Write the table as TSV, with taxon_oid restored as a regular column.
taxon_countries = gdf.reset_index()
taxon_countries.to_csv("../env_corr/taxon_countries.tsv", sep="\t", index=False)