In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the IMG isolate metadata export. The file is tab-separated and its
# first column is used as the row index (shown as `taxon_oid` in the preview).
df = pd.read_csv(
    filepath_or_buffer="IMG_isolates.tsv",
    index_col=0,
    sep="\t",
)

# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0_level_0,Domain,Sequencing Status,Study Name,Genome Name / Sample Name,Sequencing Center,IMG Genome ID,Is Public,NCBI Assembly Accession,NCBI Bioproject Accession,NCBI Biosample Accession,...,Ecosystem Type,Specific Ecosystem,Geographic Location,Isolation Country,Latitude,Longitude,Genome Size * assembled,Gene Count * assembled,Predicted Plasmids,Unnamed: 24
taxon_oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
643348580,Archaea,Finished,Thermococcus onnurineus NA1,Thermococcus onnurineus NA1,Korea Institute of Ocean Science and Technolog...,643348580,Yes,GCA_000018365,PRJNA20773,SAMN02603476,...,Marine,Sediment,PACMANUS hydrothermal vent sediment at a depth...,Papua New Guinea,,,1847607,2027,0,
2504557010,Archaea,Permanent Draft,YNP Site 4 Thermoproteales Type I,Thermoproteales Type I,,2504557010,Yes,,,,...,Thermal springs,Unclassified,,,,,3908994,4730,0,
2693429869,Archaea,Permanent Draft,Genomic Encyclopedia of Archaeal and Bacterial...,Halobiforma haloterrestris DSM 13078,DOE Joint Genome Institute (JGI),2693429869,Yes,GCA_900112205.1,PRJNA331410,SAMN05444422,...,Soil,Unclassified,,,,,4495442,4405,0,
2531839487,Archaea,Permanent Draft,Halosimplex carlsbadense 2-9-1,Halosimplex carlsbadense 2-9-1,"University of California, Davis",2531839487,Yes,GCA_000337455.1,PRJNA174924,SAMN02471654,...,Geologic,Unclassified,"Carlsbad, New Mexico, USA",USA,32.42,-104.228,4694889,4505,4,
3002684235,Archaea,Permanent Draft,Insights into the genomics of deep subsurface ...,Thermococcus sp. BUBBLE BATH,Western Washington University,3002684235,Yes,GCA_012027595.1,PRJNA523072,SAMN10963467,...,Marine,Unclassified,Pacific Ocean,Pacific Ocean,,,2048798,2419,0,


In [3]:
# (rows, columns) of the table as loaded, before any filtering is applied.
df.shape

(102736, 24)

In [4]:
# Keep every isolate whose habitat metadata mentions soil or the rhizosphere.
# `terms` is a regex alternation; it is matched case-insensitively as a
# substring against each of the columns listed below, and a hit in ANY of
# them is enough to keep the row.
terms = "Soil|Rhizosphere"
soil_columns = [
    "Ecosystem Subtype",
    "Geographic Location",
    "Specific Ecosystem",
    "Study Name",
    "Ecosystem",
    "Ecosystem Category",
    "Ecosystem Type",
]

# na=False is applied to *every* column (previously "Ecosystem Subtype" was
# the one exception). Without it a missing value yields NaN in the mask, and
# NaN does not behave like False when OR-ed with the next mask: a row with a
# missing subtype could be dropped even though another column matched.
soil_mask = pd.Series(False, index=df.index)
for column in soil_columns:
    soil_mask |= df[column].str.contains(terms, case=False, na=False)

df = df[soil_mask]
df.shape

(8813, 24)

In [5]:
# Regex alternation of habitat terms that disqualify an isolate.
# The notebook author's own exclusion terms start after "Agave", i.e. from
# "Glacier" onward (marked below).
#
# NOTE(review): the list contains repeated entries (e.g. "Contaminated",
# "rock biofilm", "groundwater", "Phyllosphere", "Built environment"); they are
# kept so the resulting pattern string is unchanged.
# NOTE(review): the filter cell matches these as case-insensitive substrings,
# so short terms such as "Tar", "Air" or "Nest" also match inside longer
# words -- confirm this is intended.
# NOTE(review): "Interdital zone" looks like a misspelling of "Intertidal
# zone"; verify against the spelling used in the data before changing it.
exclusion_terms = "|".join(
    [
        "Rock-dwelling",
        "Deep subsurface",
        "Plant litter",
        "Geologic",
        "Oil reservoir",
        "Volcanic",
        "Contaminated",
        "Wetlands",
        "Aquifer",
        "Tar",
        "Sediment",
        "Fracking Water",
        "Soil crust",
        "wetland",
        "sediment",
        "acid mine",
        "cave wall surface",
        "mine tailings",
        "rock biofilm",
        "beach sand",
        "Petroleum",
        "Stalagmite",
        "Subsurface hydrocarbon microbial communities",
        "Vadose zone",
        "mud volcano",
        "Fumarolic",
        "enriched",
        "Composted filter cake",
        "Ice psychrophilic",
        "oil sands",
        "groundwater",
        "Contaminated",
        "rock biofilm",
        "Deep mine",
        "coal mine fire",
        "Hydrocarbon resource environments",
        "Marine",
        "enrichment",
        "groundwater",
        "mangrove",
        "saline desert",
        "Hydroxyproline",
        "Rifle",
        "coastal",
        "compost",
        "biocrust",
        "crust",
        "Creosote",
        "soil warming",
        "Testing DNA extraction",
        "Agave",
        # --- author's own additions start here ---
        "Glacier",
        "permafrost",
        "oil reservoir",
        "winter snow cover",
        "seasonal snow patch",
        "Interdital zone",
        "Aquatic",
        "Arthropoda",
        "Phyllosphere",
        "Nest",
        "Phylloplane",
        "Phyllosphere",
        "Endosphere",
        "Fossil",
        "Engineered",
        "Rhizoplane",
        "Digestive system",
        "Activated Sludge",
        "Cave",
        "Animal waste",
        "Tissue",
        "Industrial wastewater",
        "Sludge",
        "Nodule",
        "Biogas",
        "Built environment",
        "Mammals",
        "Cnidaria",
        "Built environment",
        "Fungi",
        "Solid waste",
        "Bioreactor",
        "Wastewater",
        "Air",
        "Food production",
        "Microbial",
    ]
)

In [6]:
# Drop every isolate whose habitat metadata mentions any exclusion term.
# A row is removed if ANY of the columns below matches `exclusion_terms`
# (case-insensitive substring / regex alternation); this is the De Morgan
# equivalent of requiring that no column matches.
exclusion_columns = [
    "Ecosystem Type",
    "Ecosystem Category",
    "Ecosystem Subtype",
    "Specific Ecosystem",
    "Geographic Location",
    "Study Name",
]

# na=False is applied to *every* column (previously "Study Name" was the one
# exception): a missing value then counts as "no match" instead of putting NaN
# into the mask, where negating it with `~` would raise a TypeError.
excluded_mask = pd.Series(False, index=df.index)
for column in exclusion_columns:
    excluded_mask |= df[column].str.contains(exclusion_terms, case=False, na=False)

df = df[~excluded_mask]

In [7]:
# (rows, columns) remaining after the exclusion-term filter.
df.shape

(6909, 24)

In [8]:
# Sanity check: list the distinct values left in each ecosystem column so any
# habitat that should still be excluded can be spotted by eye.
ecosystem_columns = (
    "Specific Ecosystem",
    "Ecosystem Subtype",
    "Ecosystem Category",
    "Ecosystem Type",
)
print(*(df[column].unique() for column in ecosystem_columns))

['Unclassified' nan 'Agricultural land' 'Soil' 'Agricultural soil'
 'Desert' 'Farm' 'Grasslands' 'Bulk soil' 'Creek' 'Mineral soil' 'Bog'
 'Humus' 'Forest' 'Forest soil' 'Orchard soil' 'Mine drainage' 'Pasture'
 'Mine'] ['Unclassified' nan 'Garden' 'Salt flat/Salt pan' 'Rhizosphere' 'Sand'
 'Desert' 'Greenhouse' 'Paddy field/soil' 'Loam' 'Forest' 'Clay'
 'Temperate forest' 'Peat' 'Tropical forest' 'Boreal forest/Taiga'
 'Agricultural land' 'Manure-fertilized' 'Geothermal field' 'Tundra'
 'Riverside' 'Silt' 'Grasslands' 'Pasture' 'Organic layer' 'Arable'] ['Terrestrial' nan 'Plants' 'Unclassified'] ['Soil' nan 'Roots' 'Unclassified' 'Agricultural field' 'Desert']


In [9]:
# Display the filtered table; the rendering is truncated to the first and last rows.
df

Unnamed: 0_level_0,Domain,Sequencing Status,Study Name,Genome Name / Sample Name,Sequencing Center,IMG Genome ID,Is Public,NCBI Assembly Accession,NCBI Bioproject Accession,NCBI Biosample Accession,...,Ecosystem Type,Specific Ecosystem,Geographic Location,Isolation Country,Latitude,Longitude,Genome Size * assembled,Gene Count * assembled,Predicted Plasmids,Unnamed: 24
taxon_oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2693429869,Archaea,Permanent Draft,Genomic Encyclopedia of Archaeal and Bacterial...,Halobiforma haloterrestris DSM 13078,DOE Joint Genome Institute (JGI),2693429869,Yes,GCA_900112205.1,PRJNA331410,SAMN05444422,...,Soil,Unclassified,,,,,4495442,4405,0,
2690315681,Archaea,Permanent Draft,Haloarcula sp. strain K1 genome sequencing,Haloarcula sp. K1,Institute of Microbial Technology,2690315681,Yes,GCA_001647155.1,PRJNA308270,SAMN04390187,...,Soil,Unclassified,"India: Tamilnadu, Kanyakumari",India,,,4331797,4443,3,
2617270902,Archaea,Permanent Draft,"Genomic Encyclopedia of Type Strains, Phase II...",Halogranum gelatinilyticum CGMCC 1.10119,DOE Joint Genome Institute (JGI),2617270902,Yes,GCA_900103715.1,PRJNA303422,SAMN04487949,...,,,,,,,3770187,3856,2,
2994846370,Archaea,Permanent Draft,Halostella limicola LT12 genome sequencing,Halostella limicola LT12,Jiangsu University,2994846370,Yes,GCA_003675875.1,PRJNA494990,SAMN10187727,...,Soil,Unclassified,China: XinJiang,China,,,4117014,4286,0,
2512564055,Archaea,Finished,Methanocella conradii HZ254,Methanocella conradii HZ254,China Agricultural University,2512564055,Yes,GCA_000251105.1,PRJNA73221,SAMN02603117,...,Soil,Agricultural land,An experimental farm at China National Rice Re...,China,30.0769,119.9103,2378438,2512,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2866325816,Bacteria,Permanent Draft,Micromonospora inaquosa LB39 Genome sequencing,Micromonospora inaquosa LB39,Newcastle University,2866325816,Yes,GCA_003857055.1,PRJNA472949,SAMN09254133,...,Soil,Desert,Chile: Lomas Bayas,,,,7748575,7399,1,
2643221851,Bacteria,Permanent Draft,Genome sequencing of Arabidopsis leaf and root...,Arthrobacter sp. Soil782,ETH Zurich,2643221851,Yes,GCA_001428435.1,PRJNA298127,SAMN04155675,...,Soil,Unclassified,Germany:Cologne,,50.9580,6.8560,3422365,3307,0,
2828427071,Bacteria,Permanent Draft,"Genomic Encyclopedia of Type Strains, Phase IV...",Sphingobium xanthum DSM 100901,DOE Joint Genome Institute (JGI),2828427071,Yes,,PRJNA546792,SAMN12025086,...,Soil,Unclassified,,,,,4060606,3990,0,
2619619027,Bacteria,Permanent Draft,Genomic Encyclopedia of Archaeal and Bacterial...,Actinopolyspora mzabensis DSM 45460,DOE Joint Genome Institute (JGI),2619619027,Yes,GCA_900101095.1,PRJNA303287,SAMN04487820,...,Soil,Unclassified,,,,,5004253,4512,0,


In [12]:
# Write the index value of every remaining isolate to a text file, one per
# line. Each value is wrapped in underscores: the '_' delimiters are for
# getting the exact scaffold from IMGPR.
with open("soil_isolates_oids.txt", "w") as f:
    f.writelines(f"_{oid}_\n" for oid in df.index)