In [2]:
import pandas as pd

In [3]:
# Load the edge list. The file is tab-separated and has no header row, so
# pandas labels the columns with the integers 0, 1 and 2.
df = pd.read_csv(
    'MOB_types_reps_diamond.tsv',
    sep='\t',
    header=None,
)

df.head()

Unnamed: 0,0,1,2
0,2088090015|GPICI_8721071_9,3300054113|Ga0495562_0000542_8,68.4
1,2088090015|GPICI_8721071_9,3300036799|Ga0376673_0005029_2,64.2
2,2088090015|GPICI_8721071_9,3300036818|Ga0376688_0025672_2,63.3
3,2088090015|GPICI_8721071_9,3300036835|Ga0376027_0041501_1,62.4
4,2088090015|GPICI_8721071_9,3300049571|Ga0501034_0007545_2,51.8


In [4]:
# Name the three positional columns. `rename` silently ignores labels that
# are not present, so this cell can be re-run after later cells have added
# columns to `df` (assigning a 3-item list to `df.columns` would then fail
# with a length-mismatch ValueError).
df = df.rename(columns={0: 'node1', 1: 'node2', 2: 'weight'})

# An edge (A, B) and its reverse (B, A) describe the same undirected edge.
# Put every pair into a canonical order (smaller ID in node1, larger ID in
# node2) so that both orientations look identical.
# NOTE: this cell only normalises the orientation; no rows are dropped here.
endpoints = df[['node1', 'node2']]
df_sorted = pd.DataFrame({
    'node1': endpoints.min(axis=1),
    'node2': endpoints.max(axis=1),
    'weight': df['weight'],
})

df_sorted.head()

Unnamed: 0,node1,node2,weight
0,2088090015|GPICI_8721071_9,3300054113|Ga0495562_0000542_8,68.4
1,2088090015|GPICI_8721071_9,3300036799|Ga0376673_0005029_2,64.2
2,2088090015|GPICI_8721071_9,3300036818|Ga0376688_0025672_2,63.3
3,2088090015|GPICI_8721071_9,3300036835|Ga0376027_0041501_1,62.4
4,2088090015|GPICI_8721071_9,3300049571|Ga0501034_0007545_2,51.8


In [5]:
# Canonical key for each undirected edge: the two endpoint IDs in sorted
# order, joined by a comma, so (A, B) and (B, A) yield the same string.
df["pairs"] = [
    ",".join(sorted((first_id, second_id)))
    for first_id, second_id in zip(df["node1"], df["node2"])
]

df.head()

Unnamed: 0,node1,node2,weight,pairs
0,2088090015|GPICI_8721071_9,3300054113|Ga0495562_0000542_8,68.4,"2088090015|GPICI_8721071_9,3300054113|Ga049556..."
1,2088090015|GPICI_8721071_9,3300036799|Ga0376673_0005029_2,64.2,"2088090015|GPICI_8721071_9,3300036799|Ga037667..."
2,2088090015|GPICI_8721071_9,3300036818|Ga0376688_0025672_2,63.3,"2088090015|GPICI_8721071_9,3300036818|Ga037668..."
3,2088090015|GPICI_8721071_9,3300036835|Ga0376027_0041501_1,62.4,"2088090015|GPICI_8721071_9,3300036835|Ga037602..."
4,2088090015|GPICI_8721071_9,3300049571|Ga0501034_0007545_2,51.8,"2088090015|GPICI_8721071_9,3300049571|Ga050103..."


In [6]:
# Collapse both orientations of an edge into one row carrying the mean weight.
edge_means = df.groupby('pairs')['weight'].mean().reset_index()

# Split the "idA,idB" key back into two endpoint columns (labelled 0 and 1 by
# `expand=True`) and put the averaged weight next to them.
endpoint_cols = edge_means['pairs'].str.split(',', expand=True)
agg_df = endpoint_cols.join(edge_means['weight'])

In [7]:
#agg_df.to_csv('MOB_types_reps_diamond_undup.csv', index=False)

In [8]:
# Get environment info

In [9]:
# Environment metadata, one row per taxon_oid (tab-separated, with header).
df_env = pd.read_csv("../../env_corr/taxon_countries.tsv", sep="\t")
# Store taxon_oid as text so it can be matched against string IDs later.
df_env = df_env.astype({'taxon_oid': str})

df_env.head()

Unnamed: 0,taxon_oid,Ecosystem Subtype,Latitude,Longitude,Origin,Isolation Country,Ecosystem Subtype Custom,Plasmid Count,soil_class,bdod (cg/cm³),...,soc (dg/kg),PTU Count,geometry,index_right,ECO_NAME,WWF_REALM,RealmMHT,WWF_REALM2,WWF_MHTNUM,WWF_MHTNAM
0,3300049023,Grasslands,38.53,-121.78,Meta,USA,Grasslands,36,Luvisols,152.0,...,335.0,27.0,POINT (-121.78 38.53),761.0,Great Central Valley,,NA12,Nearctic,12.0,"Mediterranean Forests, Woodlands and Scrub"
1,3300012840,Grasslands,43.07,-89.4,Meta,USA,Grasslands,21,Luvisols,,...,,18.0,POINT (-89.4 43.07),187.0,Prairie-Forest Border,,NA4,Nearctic,4.0,Temperate Broadleaf and Mixed Forests
2,3300039503,Unclassified,63.88,-149.23,Meta,USA,Unclassified,3,Cambisols,60.0,...,2409.0,2.0,POINT (-149.23 63.88),734.0,Alaska Range,,NA6,Nearctic,6.0,Boreal Forests/Taiga
3,3300042005,Rhizosphere,41.2,-97.94,Meta,USA,Rhizosphere,6,Kastanozems,135.0,...,301.0,6.0,POINT (-97.94 41.2),747.0,Central Mixed-Grass Prairie,,NA8,Nearctic,8.0,"Temperate Grasslands, Savannas and Shrublands"
4,3300049265,Agricultural land,38.55,-121.87,Meta,USA,Agricultural land,1,Vertisols,158.0,...,274.0,1.0,POINT (-121.87 38.55),761.0,Great Central Valley,,NA12,Nearctic,12.0,"Mediterranean Forests, Woodlands and Scrub"


In [140]:
# Lookup tables keyed by taxon_oid (if an ID repeats, the last row wins).
taxid_mapping = {
    oid: subtype
    for oid, subtype in zip(df_env['taxon_oid'], df_env['Ecosystem Subtype Custom'])
}
country_mapping = {
    oid: country
    for oid, country in zip(df_env['taxon_oid'], df_env['Isolation Country'])
}

In [162]:
# Build a country -> continent lookup from a two-column, tab-separated file
# whose first line is a header.
country_to_continent = {}
with open('countries_to_continent.tsv') as f:
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(f, None)
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no mapping and would break the two-field unpack below.
            continue
        country, continent = line.split("\t")
        country_to_continent[country] = continent

In [227]:
df_mob = pd.read_csv("MOB_types_derep_fixed.csv")
# taxon_oid must be text to match the string keys of the lookup dicts.
df_mob['taxon_oid'] = df_mob['taxon_oid'].astype(str)

# Annotate each record with the environment of its source taxon.
taxon_ids = df_mob['taxon_oid']
df_mob['Ecosystem Subtype Custom'] = taxon_ids.map(taxid_mapping)
# Collapse the two state-qualified US labels into plain 'USA'.
df_mob['Isolation Country'] = taxon_ids.map(country_mapping).replace({
    'USA: Nebraska': 'USA',
    'USA: North Carolina': 'USA',
})
# Countries missing from the lookup are reported as 'Unknown'.
df_mob['Isolation Continent'] = (
    df_mob['Isolation Country']
    .map(country_to_continent)
    .fillna('Unknown')
)
df_mob.head()

Unnamed: 0,Protein,Plasmid,MOB_type,taxon_oid,Ecosystem Subtype Custom,Isolation Country,Isolation Continent
0,2162886007|SwRhRL2b_contig_2129928_14,2162886007|SwRhRL2b_contig_2129928,MOB_B,2162886007,Rhizosphere,USA,North America
1,2162886007|SwRhRL2b_contig_2418591_32,2162886007|SwRhRL2b_contig_2418591,MOB_B,2162886007,Rhizosphere,USA,North America
2,3300000363|ICChiseqgaiiFebDRAFT_11478459_1,3300000363|ICChiseqgaiiFebDRAFT_11478459,MOB_B,3300000363,Grasslands,USA,North America
3,3300001979|JGI24740J21852_10002113_7,3300001979|JGI24740J21852_10002113,MOB_B,3300001979,Rhizosphere,USA,North America
4,3300001989|JGI24739J22299_10001307_5,3300001989|JGI24739J22299_10001307,MOB_B,3300001989,Rhizosphere,USA,North America


In [228]:
# Spot-check the annotations of a single protein record.
df_mob.loc[df_mob['Protein'].eq('3300054973|Ga0495425_000049_52')]

Unnamed: 0,Protein,Plasmid,MOB_type,taxon_oid,Ecosystem Subtype Custom,Isolation Country,Isolation Continent
11332,3300054973|Ga0495425_000049_52,3300054973|Ga0495425_000049,MOB_F,3300054973,Peat,Sweden,Europe


In [229]:
# Plasmid-to-host assignments (tab-separated, with a header row).
df_hosts = pd.read_csv(
    'plasmid_host.tsv',
    sep='\t',
)

In [230]:
# Keep the most frequent host classes (head() returns the first five counts)
# under their own label and lump every other class into 'Other'.
top_host_class = df_hosts['host_class'].value_counts().head()
is_top_class = df_hosts['host_class'].isin(top_host_class.index)
df_hosts['host_class_custom'] = df_hosts['host_class'].where(is_top_class, 'Other')

In [231]:
# Display the host table, which now includes the host_class_custom column.
df_hosts

Unnamed: 0,Plasmid,host_class,host_order,host_class_custom
0,2088090014|GPIPI_16557027,c__Alphaproteobacteria,o__Rhizobiales,c__Alphaproteobacteria
1,2088090014|GPIPI_16944593,c__Vicinamibacteria,o__Vicinamibacterales,Other
2,2088090015|GPICI_8721071,c__Thermoleophilia,o__Gaiellales,Other
3,2088090015|GPICI_8721071,c__Thermoleophilia,o__Solirubrobacterales,Other
4,2088090015|GPICI_8946463,c__Thermoleophilia,o__Gaiellales,Other
...,...,...,...,...
61212,Refsoil_NC_015221.1,c__Gammaproteobacteria,o__Burkholderiales,c__Gammaproteobacteria
61213,Refsoil_NC_020208.1,c__Bacilli,o__Lactobacillales,c__Bacilli
61214,Refsoil_NZ_CP006987.1,c__Alphaproteobacteria,o__Rhizobiales,c__Alphaproteobacteria
61215,Refsoil_NZ_CP006988.1,c__Alphaproteobacteria,o__Rhizobiales,c__Alphaproteobacteria


In [234]:
# Attach the coarse host class of each plasmid to the records in df_mob.
host_class_by_plasmid = df_hosts.set_index('Plasmid')['host_class_custom']
df_mob = (
    df_mob
    # Drop a host_class_custom column left over from a previous run of this
    # cell; without this, re-running raises "columns overlap" in join().
    .drop(columns='host_class_custom', errors='ignore')
    # A plasmid can have several rows in df_hosts, so the join may repeat
    # df_mob rows; exact repeats are removed right after.
    .join(host_class_by_plasmid, on='Plasmid')
    .drop_duplicates()
    # Note: this fills missing values in every column, not only the host class.
    .fillna('Unknown')
)

In [237]:
# Write the annotated table to disk; the row index is not written out.
df_mob.to_csv(
    'MOB_types_final.csv',
    index=False,
)