# Download: 16S rRNA sequences

Since 16S shows more promising results for identifying the phylogeny of bacterial taxa, I will download a new set, looking specifically at entries under "popset" first. If there aren't enough sequences, I will expand the scope to include more sequences.

## Setup the environment

We will utilize the package [`BioPython`](https://biopython.org/wiki/Documentation) {cite}`Cock_2009` to interface with sequence data hosted on NCBI. We will also use `pandas` {cite}`pandasdevelopmentteam2023, McKinney2010` to store metadata in a dataframe.

In [1]:
from Bio import Entrez, SeqIO
from Bio.SeqFeature import SeqFeature, Reference
from Bio.SeqRecord import SeqRecord
import os
import logging
from Bio.Entrez.Parser import DictionaryElement
from pandas import DataFrame, notna
from Bio.Entrez import HTTPError
import numpy as np
from pandas import merge
from Bio.Seq import UndefinedSequenceError
import re


logging.basicConfig(level=logging.INFO)

In [2]:
# Contact email used for the Entrez queries made in this notebook
Entrez.email = "kaedeito@student.ubc.ca"
# This line sets the name of the tool that is making the queries
Entrez.tool = "download_16s.ipynb"

# Notebook-wide logger; the helper functions below report through it
logger = logging.getLogger("download_16s")

Create a folder to save our fasta and genbank file.

In [3]:
# Output folder for the CSV / GenBank / FASTA files, two levels above the working directory.
# The path is assembled with os.path.join so that the separators are correct on every OS;
# a hard-coded "..\\..\\" only works on Windows and is one literal directory name on POSIX.
dir_path = os.path.realpath(os.path.join("..", "..", "datasets", "full_analysis"))
os.makedirs(dir_path, exist_ok=True)

## Clean and wrangle tables

We need to clean the data to make it easier to work with.
We need to load the metadata into a tabular format to make it easier to find problems, or to find interesting patterns.

In [11]:
def clean_table(df: DataFrame, granularity:int):
  """
  Clean the dataframe by removing rows and cleaning string formatting.

  The cleaning is done on a copy, so `df` itself is left untouched. The frame
  must provide the `id`, `country`, `isolation_source`, `host`, `organism`,
  `product` and `db_xref` columns; `strain`, `colony` and `BioProject` are
  optional.

  :param `df`: The dataframe to clean.
  :param `granularity`: The number of characters to group the ids by.
  :return: The cleaned dataframe, indexed 0..n-1.
  """
  feat_df = df.copy()
  logger.debug(feat_df.columns)

  # Split the country column ("Country: locality") into two columns.
  # Only the first colon is used, and reindex guarantees that both parts exist
  # even when no value (or some value) has a different number of colons.
  country_parts = feat_df['country'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
  feat_df['country'] = country_parts[0]
  # Trim whitespace from the locality part
  feat_df['iso_source2'] = country_parts[1].map(lambda x: x.strip() if isinstance(x, str) else x)

  # Group the ids by the first n characters
  feat_df['group'] = feat_df['id'].str[:granularity]

  # An isolation_source that mentions a coral or sponge describes the host, not
  # a place: move it to the host column (if that is empty) and blank it out.
  # na=False makes missing isolation sources count as "no match".
  contains_coral = feat_df['isolation_source'].str.contains('coral', na=False)
  contains_sponge = feat_df['isolation_source'].str.contains('sponge', na=False)
  names_host = contains_coral | contains_sponge
  feat_df.loc[names_host & feat_df['host'].isnull(), 'host'] = feat_df['isolation_source']
  feat_df.loc[names_host, 'isolation_source'] = np.nan

  # Fill the isolation_source column with the iso_source2 column if it is null
  feat_df['isolation_source'] = feat_df['isolation_source'].fillna(feat_df['iso_source2'])

  # Manual location fix for the KC668 group (only matches when granularity is 5)
  is_kc668 = feat_df['group'] == 'KC668'
  feat_df.loc[is_kc668, 'country'] = 'Saudi Arabia'
  feat_df.loc[is_kc668, 'isolation_source'] = 'Southern Red Sea'

  # Clean the table by removing the rows that do not contain Endozoicomonas
  organism = feat_df['organism'].astype(str)
  wanted_organism = organism.str.contains('Endozoicomonas') | organism.str.contains('uncultured bacterium')
  feat_df = feat_df[notna(feat_df['organism']) & wanted_organism]

  # Clean the table by removing the rows that do not contain 16S.
  # The explicit copy avoids chained-assignment problems in the edits below.
  is_16s = feat_df['product'].astype(str).str.contains('16S')
  feat_df = feat_df[notna(feat_df['product']) & is_16s].copy()

  # Use the colony info as strain name where no strain was reported
  if 'colony' in feat_df.columns:
    if 'strain' in feat_df.columns:
      feat_df['strain'] = feat_df['strain'].fillna(feat_df['colony'])
    else:
      feat_df['strain'] = feat_df['colony']

  if 'BioProject' not in feat_df.columns:
    feat_df['BioProject'] = np.nan

  # Split the db_xref column ("name:value") and turn each name into a column.
  # Rows without a db_xref are left out of the pivot and re-attached by the
  # left merge, so they neither create a NaN-named column nor get dropped.
  xref_parts = feat_df['db_xref'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
  xref_long = DataFrame({'id': feat_df['id'], 'xref': xref_parts[0], 'value': xref_parts[1]})
  xref_long = xref_long.dropna(subset=['xref']).drop_duplicates(subset=['id'])
  xref_wide = xref_long.pivot(index='id', columns='xref', values='value').reset_index()
  feat_df = merge(feat_df, xref_wide, on="id", how="left")

  # remove the columns that are not needed
  feat_df = feat_df.drop(columns=['db_xref'])

  # Reset the index (reset_index returns a new frame, so the result must be kept)
  feat_df = feat_df.reset_index(drop=True)

  return feat_df

In [5]:
def save_file(file_name, records, rec_type):
  """
  Write sequence records into a file inside the output folder.

  A failed write is logged and not re-raised.

  `file_name`: Name of the target file, relative to `dir_path`.

  `records`: The sequence records to write.

  `rec_type`: The `SeqIO` format name to write the records in.
  """
  target = os.path.join(dir_path, file_name)
  with open(target, "w") as out_handle:
    try:
      written = SeqIO.write(records, out_handle, rec_type)
      logger.debug(f"{rec_type} {written} Saved")
    except Exception as err:
      # Covers UndefinedSequenceError as well; both cases are reported the same way
      logger.error(err)
      logger.error(f"Failed to write {rec_type} to file {file_name}")

In [6]:
def _collect_qualifiers(record):
  """
  Flatten the source and rRNA qualifiers of one record into a single dict.

  :return: The metadata dict, or None when the record has no rRNA feature or
    is a contig and therefore should not appear in the table.
  :raises ValueError: If the record has no source feature.
  """
  feats = record.features

  if record.id == 'KC668564.1':
    logger.debug(f"Printing for KC668564.1")
    logger.debug(record.annotations)
    for feat in feats:
      logger.debug(feat)

  # Take the first source feature; a missing one is reported with the record id
  # instead of failing with an opaque unpacking error.
  source = next((f for f in feats if f.type == "source"), None)
  if source is None:
    raise ValueError(f"No source feature in record {record.id}")

  # Copy the qualifiers and add the id
  # merge the rRNA qualifiers to the same dict
  qual = source.qualifiers.copy()
  qual["id"] = record.id

  # Filter for the rRNA features
  rRNA_filtered = [f for f in feats if f.type == "rRNA"]
  if not rRNA_filtered:
    logger.debug("No rRNA feature")
    return None
  if 'contig' in record.annotations:
    logger.debug("Is a contig")
    return None

  for rRNA in rRNA_filtered:
    try:
      qual.update(rRNA.qualifiers)
    except Exception as e:
      logger.error(rRNA)
      logger.exception(e)

  # Get the pubmed id list from the references
  reference_list = record.annotations.get('references', [])
  qual['pubmed_ids'] = [ref.pubmed_id for ref in reference_list if ref.pubmed_id]

  # Split each "name:value" dbxref into its own entry (split at the first colon only)
  for dbxref in record.dbxrefs:
    name, sep, value = dbxref.partition(":")
    if sep:
      qual[name] = value

  # Since all feature values are a list of string, (usually of length 1), convert to string
  for key in qual:
    qual_list = qual[key]
    if isinstance(qual_list, list):
      qual[key] = "; ".join(map(str, qual_list))

  # Extract the colony info (the text in parentheses) from the isolation_source
  iso_qual = qual.get('isolation_source') or ''
  if "Coral-associated microbial aggregate" in iso_qual:
    logger.debug(f"colony info found for {record.id}")
    colony = re.search(r'\((.*?)\)', iso_qual)
    # Without a parenthesised part there is nothing to extract, so the
    # isolation_source is left as it is.
    if colony:
      qual['isolation_source'] = None
      qual['colony'] = colony.group(1)

  if record.id == 'KC668564.1':
    logger.debug(f"Printing for KC668564.1")
    logger.debug(qual)

  return qual


def table_features(count: int, records: list[SeqRecord], rec_type: str, granularity=5, debug: bool = False):
  """
  1. Extracts the features (both source and rRNA)
  2. Based on cleaned up list of features, filter records for matching ids
  3. Save the records to a file

  :param `count`: Number of records found by the search (only used for logging).
  :param `records`: The sequence records to tabulate.
  :param `rec_type`: Label of the data source; used as prefix of the saved file names.
  :param `granularity`: The number of characters to group the ids by (passed to `clean_table`).
  :param `debug`: If True, return the raw table and records without cleaning or saving.
  :return: A tuple of the metadata dataframe and the list of kept records.
  :raises ValueError: If a record has no source feature.
  """
  logger.debug(f"Found {count} {rec_type}, sequences: {len(records)}")
  qualifiers: list[dict[str, str]] = []
  save_records = []

  for record in records:
    qual = _collect_qualifiers(record)
    if qual is None:
      continue

    # As a safe guard, check that the sequence is not empty
    if record.seq:
      save_records.append(record)

    qualifiers.append(qual)

  # Load features into a dataframe
  df1 = DataFrame(qualifiers)
  # Clean the table by replacing the literal strings "None" and "nan".
  # Exact matches only: a regex replacement would blank every value that merely
  # contains "nan" (for example a place name such as "Hainan").
  df1 = df1.replace(["None", "nan"], np.nan)

  # If not in debug mode, clean the table + save the files
  if debug:
    logger.debug(df1.head())
    return df1, save_records

  df2 = clean_table(df1, granularity)
  # Filter the records by the ids in the dataframe (the id set is built once)
  kept_ids = set(df2['id'].to_list())
  filtered_recs = [rec for rec in save_records if rec.id in kept_ids]

  logger.info(f"Records to save: {len(filtered_recs)}/{len(save_records)}")

  # Save the records to a file
  save_file(f"{rec_type}_seq.gb", filtered_recs, "genbank")
  save_file(f"{rec_type}_seq.fasta", filtered_recs, "fasta")

  return df2, filtered_recs


## NCBI: Popset

### Search in NCBI (Popset)

As noted by the NCBI's website, their `Popset database` is a collection of related sequences that is sourced from a single population/phylogenetic/mutation/ecosystem study. 

This is a useful grouping, as we would be able to extract the locational data from the metadata and use it to create a map of the distribution of the bacteria.

In [7]:
def search_popset(additional_ids: "list[str] | None" = None, *, limit: int = 20):
  """
  Search for the popset database for the coral and Endozoicomonas.

  :param `additional_ids`: Extra popset ids to fetch on top of the search hits.
  :param `limit`: The maximum number of search hits to request.
  :return: The number of fetched records, the list of ids and the records.
    On any error, or when there is nothing to fetch, `(0, [], [])` is returned.
  """
  # Parentheses are balanced so that the grouping of the query is unambiguous
  term = "((Endozoicomonas[Organism] OR Endozoicomonas[All Fields]) AND coral[All Fields])"
  extra_ids = list(additional_ids) if additional_ids else []
  try:
    logger.debug(term)
    handle = Entrez.esearch(db="popset",
      term=term,
      retmax=limit,
    )
    try:
      res = Entrez.read(handle)
    finally:
      # Release the connection even when parsing fails
      handle.close()

    if not isinstance(res, DictionaryElement):
      return 0, [], []

    # Search hits first, then the manual additions; duplicates are dropped
    # while the order is kept.
    ids: list[str] = list(dict.fromkeys([*res["IdList"], *extra_ids]))
    if not ids:
      # Nothing to fetch; skip the efetch request
      return 0, [], []

    handle = Entrez.efetch(db="popset", id=ids, rettype="gb")
    try:
      records = list(SeqIO.parse(handle, "gb"))
    finally:
      handle.close()
    return len(records), ids, records
  except HTTPError as http_e:
    logger.error(str(http_e))
    return 0, [], []
  except Exception as e:
    logger.error(e)
    return 0, [], []

### Summary of popset

After a review of the literature that came up during the inspection of the Nucleotide DB, I found some popsets that were worth adding manually.

This includes the popset coming out of the work of {cite:p}`Speck_2012` (`"227461503"`), and the work of {cite:p}`Bayer_2013` (`"510829312"`).

In [8]:
# Popset ids added by hand after the literature review
additional_ids_pop = [
  '227461503',  # Speck_2012
  '510829312'  # Bayer_2013
]

In [9]:
# Search Popset and fetch the GenBank records of the hits plus the manually added ids
count_pop, idlist_pop, records_pop = search_popset(additional_ids_pop)

Metadata table created for the data out of Popset database was saved to [features_of_popset_seq.csv](../../datasets/full_analysis/features_of_popset_seq.csv).

Genbank file was saved as [popset_seq.gb](../../datasets/full_analysis/popset_seq.gb).
Fasta file was saved as [popset_seq.fasta](../../datasets/full_analysis/popset_seq.fasta).

In [12]:
# Load the metadata, and save the records to a file
feat_df_pop0, recs_pop = table_features(count_pop, records_pop, "popset")

# Save the full cleaned metadata table into the output folder
feat_df_pop0.to_csv(os.path.join(dir_path, "features_of_popset_seq.csv"), index=False)

# Work on a copy restricted to the columns of interest for the overview below
feat_df_pop = feat_df_pop0.copy()
col_interest = ['id', 'group', 'organism', 'strain', 'host', 'country', 'taxon', 'BioProject', 'pubmed_ids', 'isolation_source', 'product']
feat_df_pop = feat_df_pop[col_interest]
feat_df_pop.head()

INFO:download_16s:Records to save: 2640/2930


Unnamed: 0,id,group,organism,strain,host,country,taxon,BioProject,pubmed_ids,isolation_source,product
0,JX488684.2,JX488,Endozoicomonas euniceicola,EF212,Eunicea fusca (gorgonian coral),USA,1234143,,23832969,Hillsboro Ledge,16S ribosomal RNA
1,JX488685.1,JX488,Endozoicomonas gorgoniicola,PS125,Plexaura sp. (gorgonian coral),Bahamas,1234144,,23832969,Tuna Alley (inside reef),16S ribosomal RNA
2,MG725756.1,MG725,uncultured bacterium,,Pocillopora verrucosa,,77133,,29468040,,16S ribosomal RNA
3,MG725757.1,MG725,uncultured bacterium,,Pocillopora verrucosa,,77133,,29468040,,16S ribosomal RNA
4,MG725758.1,MG725,uncultured bacterium,,Pocillopora verrucosa,,77133,,29468040,,16S ribosomal RNA


### Geographic details

In [13]:
# Number of sequences per country.
# NOTE: count() only tallies non-null values of the selected column, so the NaN
# group is always reported as 0; size() would give the number of rows instead.
feat_df_pop.groupby(['country'], dropna=False)['country'].count()

country
Bahamas            1
France             1
Malaysia           5
Saudi Arabia    1172
Taiwan            39
USA              784
NaN                0
Name: country, dtype: int64

In [14]:
# Number of sequences per country and isolation source.
# NOTE: count() only tallies non-null isolation_source values, so groups whose
# isolation_source is NaN are reported as 0; size() would give the number of rows.
feat_df_pop.groupby(['country', 'isolation_source'], dropna=False)['isolation_source'].count()

country       isolation_source        
Bahamas       Tuna Alley (inside reef)       1
France        NaN                            0
Malaysia      NaN                            0
Saudi Arabia  Southern Red Sea            1172
Taiwan        Green Island                  17
              Ken Ting                      21
              Yeh Liu                        1
USA           Hillsboro Ledge                1
              Summerland Key, Florida       25
              West Maui                    758
NaN           NaN                            0
Name: isolation_source, dtype: int64

In [15]:
# Number of sequences per host / country / isolation source combination,
# counted on the id column so that groups with missing keys are included
feat_df_pop.groupby(['host', 'country', 'isolation_source'], dropna=False)['id'].count()

host                                  country       isolation_source        
Acropora clathrata                    NaN           NaN                           1
Acropora humilis                      Saudi Arabia  Southern Red Sea            322
Corallium rubrum                      France        NaN                           1
Eunicea fusca (gorgonian coral)       USA           Hillsboro Ledge               1
Eunicella cavolini                    NaN           NaN                          20
Lobophytum compactum                  NaN           NaN                           1
Marine tunicates                      Malaysia      NaN                           5
Plexaura homomalla (gorgonian coral)  USA           Summerland Key, Florida      25
Plexaura sp. (gorgonian coral)        Bahamas       Tuna Alley (inside reef)      1
Pocillopora damicornis                Saudi Arabia  Southern Red Sea            496
Pocillopora verrucosa                 NaN           NaN                          58

## NCBI: Nucleotide database

While **1500+** results are certainly enough to work with, I will expand the scope to include more sequences by searching directly through the nucleotide database.

I will combine the popset dataset and the findings below, and then remove duplicates.

### Search in NCBI (Nucleotide)

The review of literature revealed one nucleotide sequence that was not included by the search using `search_nucleo(...)` function, which found a match to a coral species found in the Red Sea {cite}`Pogoreutz2022`. We will manually note this, and add it to our list of sequences to download. 


Interestingly, looking more into the referenced sequence revealed that the original study had a total of 412 16S Endozoicomonas sequences to work with {cite}`Bayer_2013`. This will be added by adding the accession numbers by Popset (above).

Additionally, they {cite}`Bayer_2013` also provided a supplementary phylogenetic tree. We will use this to add more accession ids to our list of sequences to download.

```{figure} ../../outputs/full_analysis/bayer_2013_fig_s3_1pg_rotated.png
---
name: parsimony-tree-bayer-2013
---
Tree showing the phylogenetic relationship of Endozoicomonas to other bacterial species.
```

These are the accession IDs mentioned in the tree above.
| Accession ID | Status | Host | DOI |
|--------------|------- |----- |---- |
| GU118644  | Probable | Montastraea faveolata (coral) | {cite}`Sunagawa_2010` |
| GU118168  | Probable | Diploria strigosa (coral) |{cite}`Sunagawa_2010` |
| GU118379  | Probable | Gorgonia ventalina (coral) |{cite}`Sunagawa_2010` |
| GU118072  | Probable | Acropora palmata (coral) |{cite}`Sunagawa_2010` |
| GU118404  | Probable | Gorgonia ventalina (coral) |{cite}`Sunagawa_2010` |
| GU118957  | Probable | Porites astreoides (coral) |{cite}`Sunagawa_2010` |
| GU784983  | Probable | Sponge Ianthella basta |{cite}`Luter_2010` |
| GU118966  | Probable | Porites astreoides (coral) | {cite}`Sunagawa_2010` |
| AB695088  | Probable | Haplosclerida gen. et sp. (purple sponge) | {cite}`Nishijima_2013` |
| AM259915  | Probable | Chondrilla nucula (sponge) | {cite}`Thiel_2007` |
| DQ884169  | Probable (Uncultured Gammaproteobacteria) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ884170  | Probable (Gammaproteobacteria) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ884160  | Probable (Uncultured Gammaproteobacteria) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` | 
| DQ917901  | Probable | Muricea elongata (Octocoral) | {cite}`Ranzer2007` |
| AY700600  | Probable | Pocillopora damicornis (coral) | {cite}`Bourne_2005` |
| AY700601  | Probable | Pocillopora damicornis (coral) | {cite}`Bourne_2005` |
| FJ202634  | Probable, (uncultured bacterium) | Montastraea faveolata (coral) | {cite}`Sunagawa_2009` |
| FJ347758  | Probable | Montipora aequituberculata (coral) | {cite}`Yang_2010` | 
| FJ930289  | Probable, (Uncultured bacterium) | Porites compressa (coral) | {cite}`Speck_2012` |

> `DQ884169  (C19)` `DQ884170 (C23)`, `DQ884160 (CRNA5)` noted with "colony sequences C19, C23 and CRNA5 [...] formed a group related to Endozoiciomonas elysicola (97.6–99.7% similarity), a bacterium isolated from the marine mollusc Elysia ornata (M. Kurahashi, unpublished) {cite}`MartinezGarcia2006`"


From above tree, but will not be included.
| Accession ID | Status | Host | DOI |
|--------------|------- |----- |---- |
| AM503093  | Very unlikely (Marinobacter guineae) | Antarctic environment |  |
| AM229315  | Very unlikely (Halomonas janggokensis) | saline water |  |
| GU291858  | Unlikely (due to distance on tree) | solar saltern | {cite}`Joung2010` |
| AB205011  | Unlikely, Spongiobacter nickelotolerans | marine sponge | |
| AB196667  | Probable, not included | Elysia ornata (sea slug) | {cite}`Kurahashi_2007` |
| DQ917830  | Unlikely (Spongiobacter) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ917877  | Unlikely (Spongiobacter) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ917879  | Unlikely (Spongiobacter) | Muricea elongata (Octocoral) | {cite}`Ranzer2007` |
| GQ853555  | Probable, not included | Loripes lacteus (clam), gill symbiont | {cite}`Mausz2010` |
| FM162182  | Probable, not included | Bathymodiolus brooksi (mussels) | {cite}`Zielinski_2009` |
| FM163188  | Probable, not included | Bathymodiolus brooksi (mussels) | {cite}`Zielinski_2009` |
| FJ154998  | Probable, (uncultured bacterium) not included | ocean water | | 
| EU884930  | Probable, not included | Sixbar angelfish | |

Here are other accession IDs that were found when examining the references of the papers above.
| Accession ID | DOI |
|--------------|-----|
| AB695089 | {cite}`Nishijima_2013` |

In [16]:
def search_nucleo(limit: int, additional_ids: "list[str] | None" = None):
  """
  Search for the nucleotide database for nucleotides that mention coral and Endozoicomonas.

  :param `limit`: The maximum number of results to return.
  :param `additional_ids`: Extra ids / accessions to fetch on top of the search hits.
  :return: The number of fetched records, the list of ids and the records.
    On any error, or when there is nothing to fetch, `(0, [], [])` is returned.
  """

  term = "(coral[All Fields] AND 16S[Title]) AND Endozoicomonas[Organism]"
  extra_ids = list(additional_ids) if additional_ids else []
  try:
    handle = Entrez.esearch(db="nucleotide",
      term=term,
      retmax=limit,
    )
    try:
      res = Entrez.read(handle)
    finally:
      # Release the connection even when parsing fails
      handle.close()

    if not isinstance(res, DictionaryElement):
      return 0, [], []

    # Search hits first, then the manual additions; duplicates are dropped
    # while the order is kept.
    ids: list[str] = list(dict.fromkeys([*res["IdList"], *extra_ids]))
    logger.debug(ids)
    if not ids:
      # Nothing to fetch; skip the efetch request
      return 0, [], []

    handle = Entrez.efetch(db="nucleotide", id=ids, rettype="gb")
    try:
      records = list(SeqIO.parse(handle, "gb"))
    finally:
      handle.close()
    return len(records), ids, records
  except HTTPError as http_e:
    # Log the response body as well; it usually carries the reason for the failure
    logger.error(http_e.read())
    logger.exception(http_e)
    return 0, [], []
  except Exception as e:
    # logger.exception already records the message together with the traceback
    logger.exception(e)
    return 0, [], []

In [17]:
# Accession found when examining the references of the papers above (Nishijima_2013)
additional_accessions = ['AB695089']
# Request up to 4000 search hits and fetch them together with the extra accession
count_nuc, idlist_nuc, records_nuc = search_nucleo(4000, additional_accessions)

### Summary of nucleotide database

We will save the metadata from the nucleotide db into [features_of_nucleotide_seq.csv](../../datasets/full_analysis/features_of_nucleotide_seq.csv).

Genbank file was saved as [nucleotide_seq.gb](../../datasets/full_analysis/nucleotide_seq.gb).
Fasta file was saved as [nucleotide_seq.fasta](../../datasets/full_analysis/nucleotide_seq.fasta).

In [18]:
from pandas import concat

# With DEBUG = True, table_features returns the raw table and skips cleaning and file saving
DEBUG = False
# Tabulate the nucleotide records, grouping ids by their first 5 characters
feat_df_nuc0, recs_nuc = table_features(count_nuc, records_nuc, "nucleotide", 5, DEBUG)
feat_df_nuc0.to_csv(os.path.join(dir_path, "features_of_nucleotide_seq.csv"), index=False)

INFO:download_16s:Records to save: 154/154


We will save the combined metadata to [feat_all_seq.csv](../../datasets/full_analysis/feat_all_seq.csv).

In [19]:
# Stack both metadata tables and keep one row per id; the nucleotide table comes
# first, so its row is the one kept when an id appears in both
feat_df_all0 = concat([feat_df_nuc0, feat_df_pop0]).drop_duplicates(subset="id")
feat_df_all0.to_csv(os.path.join(dir_path, "feat_all_seq.csv"), index=False)

In [20]:
def unique(to_combine: list[list[SeqRecord]], all_df: DataFrame):
    """
    Merge several record lists into one list without duplicated ids.

    Records are visited list by list, in order. The first record seen for an id
    is kept if that id is listed in `all_df`; later records with the same id
    are ignored.

    :param `to_combine`: The record lists to merge.
    :param `all_df`: Dataframe whose `id` column lists the ids to keep.
    :return: The kept records, in the order they were first seen.
    """
    # Sets give constant-time membership tests; the lists used before made the
    # loop quadratic in the number of records.
    wanted_ids = set(all_df['id'].to_list())
    seen_ids: set[str] = set()
    keep_list: list[SeqRecord] = []

    for batch in to_combine:
        for rec in batch:
            # Skip ids that were already handled in this or an earlier list
            if rec.id in seen_ids:
                continue
            seen_ids.add(rec.id)
            # Keep the record only if its id survived the metadata cleaning
            if rec.id in wanted_ids:
                keep_list.append(rec)

    return keep_list

The genbank and sequence files of all sequences will be saved to [all_seq.gb](../../datasets/full_analysis/all_seq.gb) and [all_seq.fasta](../../datasets/full_analysis/all_seq.fasta) respectively.

In [21]:
# Combine the records of both sources, keeping one record per id of the combined table
unique_recs = unique([recs_nuc, recs_pop], feat_df_all0)

# The two numbers should match: kept records vs. rows in the combined metadata table
logger.info(f"Unique records found: {len(unique_recs)} / {len(feat_df_all0['id'].to_list())}")
# Save the records to a file
save_file("all_seq.gb", unique_recs, "genbank")
save_file("all_seq.fasta", unique_recs, "fasta")

INFO:download_16s:Unique records found: 1854 / 1854


In [22]:
# The 'group' and 'iso_source2' columns are created by clean_table, which is
# skipped in debug mode, so they can only be selected from the cleaned table
if DEBUG:
  col_interest = ['id', 'organism', 'strain', 'host', 'country', 'pubmed_ids', 'product', 'isolation_source']
else:
  col_interest = ['id', 'group', 'organism', 'host', 'country', 'pubmed_ids', 'isolation_source', 'iso_source2', 'product']

feat_df_all = feat_df_all0[col_interest]
feat_df_all.head()

Unnamed: 0,id,group,organism,host,country,pubmed_ids,isolation_source,iso_source2,product
0,OM273416.1,OM273,Endozoicomonas euniceicola,Eunicea flexuosa,USA,,Florida Keys,Florida Keys,16S ribosomal RNA
1,OM273412.1,OM273,Endozoicomonas gorgoniicola,Eunicea flexuosa,USA,,Florida Keys,Florida Keys,16S ribosomal RNA
2,OL957540.1,OL957,uncultured Endozoicomonas sp.,Stylophora pistillata,Japan,,Okinawa,Okinawa,16S ribosomal RNA
3,OL957539.1,OL957,uncultured Endozoicomonas sp.,Stylophora pistillata,Japan,,Okinawa,Okinawa,16S ribosomal RNA
4,OL957538.1,OL957,uncultured Endozoicomonas sp.,Stylophora pistillata,Japan,,Okinawa,Okinawa,16S ribosomal RNA


### Geographic details

Here is the breakdown by country and by isolation source, as reported by the metadata.

In [23]:
# Number of sequences per country.
# NOTE: count() only tallies non-null values of the selected column, so the NaN
# group is always reported as 0; size() would give the number of rows instead.
feat_df_all.groupby(['country'], dropna=False)['country'].count()

country
Bahamas           1
France            1
Japan            40
Malaysia          5
Saudi Arabia    586
Taiwan           69
USA             786
NaN               0
Name: country, dtype: int64

In [24]:
# Number of sequences per country and isolation source.
# NOTE: count() only tallies non-null isolation_source values, so groups whose
# isolation_source is NaN are reported as 0; size() would give the number of rows.
feat_df_all.groupby(['country', 'isolation_source'], dropna=False)['isolation_source'].count()

country       isolation_source        
Bahamas       Tuna Alley (inside reef)      1
France        NaN                           0
Japan         Okinawa                      39
              Shizuoka, Numazu              1
Malaysia      NaN                           0
Saudi Arabia  Southern Red Sea            586
Taiwan        Green Island                 17
              Ken Ting                     21
              Kenting                      28
              Yeh Liu                       1
              NaN                           0
USA           Florida Keys                  2
              Hillsboro Ledge               1
              Summerland Key, Florida      25
              West Maui                   758
NaN           NaN                           0
Name: isolation_source, dtype: int64

In [25]:
# Number of sequences per country / isolation source / host combination,
# counted on the id column so that groups with missing keys are included
feat_df_all.groupby(['country', 'isolation_source', 'host'], dropna=False)['id'].count()

country       isolation_source          host                                                   
Bahamas       Tuna Alley (inside reef)  Plexaura sp. (gorgonian coral)                               1
France        NaN                       Corallium rubrum                                             1
Japan         Okinawa                   Stylophora pistillata                                       39
              Shizuoka, Numazu          Isolated from purple sponge (Haplosclerida gen. et sp.)      1
Malaysia      NaN                       Marine tunicates                                             5
Saudi Arabia  Southern Red Sea          Acropora humilis                                           161
                                        Pocillopora damicornis                                     248
                                        Stylophora pistillata                                      177
Taiwan        Green Island              Stylophora pistillata (coral)           

### Fixing isolation_source column

From the inspection above, we know that we need to make [a list](../../datasets/full_analysis/fix_list_gran_5.csv) of `isolation_source` rows to fix manually by reading the original literature each sequence was published in.

`OL957` based results can be traced back to this paper by {cite:p}`Wada_2022`.

In [26]:
def create_fix_csv(df: DataFrame, csv_file_name="fix_list.csv"):
  """
  Write one representative row per (country, isolation_source, group) to a CSV.

  For each combination the first id is picked and looked up again in `df`.
  Only the grouping columns and the id are written to `csv_file_name` inside
  the output folder.

  :param `df`: The metadata table to take the representatives from.
  :param `csv_file_name`: Name of the CSV file to write.
  :return: The representative rows with all columns of `df`.
  """
  group_keys = ['country', 'isolation_source', 'group']
  manual_fix = group_keys + ['id']

  # Only the id of each group survives here: the grouping keys sit in the
  # index after first() and are thrown away by reset_index(drop=True)
  representatives = (
    df[manual_fix]
    .copy()
    .groupby(group_keys, dropna=False)
    .first()
    .reset_index(drop=True)
  )

  # Bring back every column of the chosen rows through their id
  fix_table = merge(representatives, df, on=['id'], how='left')

  out_path = os.path.join(dir_path, csv_file_name)
  fix_table[manual_fix].to_csv(out_path, index=False, index_label=False, na_rep="")

  return fix_table

# Build the manual-fix list from the combined (nucleotide + popset) table
feat_df_nuc2 = create_fix_csv(feat_df_all, "fix_list_gran_5.csv")
