# Download: 16S rRNA sequences

Since 16S shows more promising results of identifying phylogeny of bacterial taxa, I will download a new set, but specifically look at entries under "popset" first. If there aren't enough sequences, I will expand the scope to include more sequences.

## Setup the environment

We will utilize the package [`BioPython`](https://biopython.org/wiki/Documentation) {cite}`Cock_2009` to interface with sequence data hosted on NCBI. We will also use `pandas` {cite}`pandasdevelopmentteam2023, McKinney2010` to store metadata in a dataframe.

In [1]:
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, Reference
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, UndefinedSequenceError
from Bio.Entrez.Parser import DictionaryElement
from Bio.Entrez import HTTPError

import os
import logging
import re

from pandas import DataFrame, notna, merge, concat
import numpy as np

logging.basicConfig(level=logging.DEBUG)

In [2]:
from Bio import Entrez
# Contact address attached to the queries
Entrez.email = "kaedeito@student.ubc.ca"
# This line sets the name of the tool that is making the queries
Entrez.tool = "download_16s.ipynb"

# When True, query NCBI again; when False, re-use the GenBank files already
# saved in the dataset folder.
SEARCH_NEW = False
# Number of leading id characters used to group records
# (passed to `table_features` as `granularity`).
GRANULARITY = 5
# Logger shared by every cell in this notebook
logger = logging.getLogger("download_16s")

Create a folder to save our FASTA and GenBank files.

In [3]:
# Build the path from its components so the notebook runs on any OS
# (hard-coded "\\" separators are only understood on Windows).
dir_path = os.path.realpath(os.path.join("..", "..", "datasets", "full_analysis"))
os.makedirs(dir_path, exist_ok=True)

## Clean and wrangle tables

We need to clean the data to make it easier to work with.
We need to load the metadata into a tabular format to make it easier to find problems, or to find interesting patterns.

In [4]:
def clean_table(df: DataFrame, granularity:int):
  """
  Clean the dataframe by removing rows and cleaning string formatting.

  Only rows whose `organism` mentions "Endozoicomonas" or "uncultured bacterium"
  and whose `product` mentions "16S" are kept. The input is not modified; the
  returned frame has a fresh 0..n-1 index.

  :param `df`: The dataframe to clean. Needs the columns `id`, `country`,
    `organism`, `product` and `db_xref`; `isolation_source`, `strain`,
    `colony` and `BioProject` are optional.
  :param `granularity`: The number of characters to group the ids by.
  :return: The cleaned dataframe, with the extra columns `iso_source2`,
    `group`, `BioProject` (if missing) and one column per `db_xref` key.
  """
  feat_df = df.copy()

  # Split "Country: locality" at the first colon only, and force exactly two
  # columns so the assignment works whether or not any value contains a colon.
  country_parts = feat_df['country'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
  feat_df['country'] = country_parts[0]
  # Trim whitespace from the locality part
  feat_df['iso_source2'] = country_parts[1].map(lambda x: x.strip() if isinstance(x, str) else x)

  # Group the ids by the first n characters
  feat_df['group'] = feat_df['id'].str[:granularity]

  # Fill the isolation_source column with the iso_source2 column if it is null
  if 'isolation_source' not in feat_df.columns:
    feat_df['isolation_source'] = None
  feat_df['isolation_source'] = feat_df['isolation_source'].fillna(feat_df['iso_source2'])

  # Manual correction for the KC668 group
  is_kc668 = feat_df['group'] == 'KC668'
  feat_df.loc[is_kc668, 'country'] = 'Saudi Arabia'
  feat_df.loc[is_kc668, 'isolation_source'] = 'Southern Red Sea'

  # Clean the table by removing the rows that do not contain Endozoicomonas
  organism = feat_df['organism'].astype(str)
  keep_organism = notna(feat_df['organism']) & (
    organism.str.contains('Endozoicomonas') | organism.str.contains('uncultured bacterium')
  )
  feat_df = feat_df[keep_organism]

  # Clean the table by removing the rows that do not contain 16S
  keep_product = notna(feat_df['product']) & feat_df['product'].astype(str).str.contains('16S')
  # Copy so the column assignments below act on an independent frame, not a slice
  feat_df = feat_df[keep_product].copy()

  # Use the colony label as the strain when no strain is recorded
  if 'colony' in feat_df.columns:
    if 'strain' in feat_df.columns:
      feat_df['strain'] = feat_df['strain'].fillna(feat_df['colony'])
    else:
      feat_df['strain'] = feat_df['colony']

  if 'BioProject' not in feat_df.columns:
    feat_df['BioProject'] = np.nan

  # Split the db_xref column ("key:value") and turn each key into its own column
  xref_parts = feat_df['db_xref'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
  xref_long = DataFrame({'id': feat_df['id'], 'xref': xref_parts[0], 'value': xref_parts[1]})
  # One cross-reference per id; rows without any cross-reference add no column
  xref_long = xref_long.drop_duplicates(subset=['id']).dropna(subset=['xref'])
  xref_wide = xref_long.pivot(index='id', columns='xref', values='value').reset_index()
  # Left merge so records without a cross-reference are kept
  feat_df = merge(feat_df, xref_wide, on='id', how='left')

  # replace every "Ken Ting" with "Kenting"
  # (assigned back: an in-place replace on a selected column may act on a copy)
  feat_df['isolation_source'] = feat_df['isolation_source'].replace('Ken Ting', 'Kenting')

  # remove the columns that are not needed
  feat_df = feat_df.drop(columns=['db_xref'])

  # Reset the index
  return feat_df.reset_index(drop=True)

In [5]:
def unique(to_combine: list[list[SeqRecord]], all_df: DataFrame):
    """
    Merge several record lists, dropping duplicates and records absent from `all_df`.

    :param `to_combine`: Lists of records; earlier lists win when an id repeats.
    :param `all_df`: Dataframe whose `id` column lists the ids to keep.
    :return: The kept records, in first-seen order.
    """
    # Sets give constant-time membership tests; the lists used before made
    # this quadratic in the number of records.
    wanted_ids = set(all_df['id'])
    seen_ids: set[str] = set()
    keep_list: list[SeqRecord] = []

    for batch in to_combine:
        for rec in batch:
            if rec.id in seen_ids:
                continue
            seen_ids.add(rec.id)
            if rec.id in wanted_ids:
                keep_list.append(rec)

    return keep_list

In [6]:
def save_file(file_name, records, rec_type):
  """
  Write `records` to `file_name` inside `dir_path`.

  Failures are logged rather than raised, so the caller always continues.

  `file_name`: The name of the file to save to.

  `records`: The records to save.

  `rec_type`: The format name handed to `SeqIO.write` (e.g. "genbank", "fasta").
  """
  target = os.path.join(dir_path, file_name)
  with open(target, "w") as sink:
    try:
      written = SeqIO.write(records, sink, rec_type)
      logger.debug(f"{rec_type} {written} Saved")
    except Exception as err:
      # One handler is enough: UndefinedSequenceError is an ordinary Exception
      # subclass and was handled the same way as every other error.
      logger.error(err)
      logger.error(f"Failed to write {rec_type} to file {file_name}")

In [7]:
def table_features(count: int, records: list[SeqRecord], rec_type: str, granularity=5, debug: bool = False):
  """
  Build a metadata table from GenBank records and, unless `debug`, save the matching records.

  1. Extracts the features (both source and rRNA)
  2. Based on cleaned up list of features, filter records for matching ids
  3. Save the records to a file

  Records without an rRNA feature, or carrying a `contig` annotation, are skipped.
  Each record's `description` is truncated at " 16S" as a side effect.

  :param `count`: Number of records reported by the caller (only logged).
  :param `records`: The parsed GenBank records.
  :param `rec_type`: Label used in the log messages and as the output file prefix.
  :param `granularity`: Passed on to `clean_table`.
  :param `debug`: If true, return the raw table without cleaning it or saving files.
  :return: `(dataframe, records)`: raw table and all usable records in debug
    mode, otherwise the cleaned table and the records whose id survived cleaning.
  :raises Exception: If a record has no `source` feature.
  """
  logger.debug(f"Found {count} {rec_type}, sequences: {len(records)}")
  qualifiers: list[dict[str, str]] = []

  save_records: list[SeqRecord] = []

  # Text inside the first pair of round brackets
  bracket_re = re.compile(r"(?<=\().+?(?=\))")

  for record in records:
    feats: list[SeqFeature] = record.features

    # Use the first source feature. Unpacking the filter result raised a bare
    # ValueError before the intended check below could ever run.
    source = next((f for f in feats if f.type == "source"), None)
    if source is None:
      raise Exception("No source feature")

    # Copy the qualifiers and add the id
    # merge the rRNA qualifiers to the same dict
    qual = source.qualifiers.copy()
    qual["id"] = record.id

    seq_desc = record.description.split(" 16S")
    qual["sequence_name"] = seq_desc[0]
    record.description = seq_desc[0]

    # Filter for the rRNA features
    rRNA_filtered = [f for f in feats if f.type == "rRNA"]
    if not rRNA_filtered:
      logger.debug("No rRNA feature")
      continue
    if 'contig' in record.annotations:
      logger.debug("Is a contig")
      continue

    for rRNA in rRNA_filtered:
      try:
        qual.update(rRNA.qualifiers)
      except Exception as e:
        logger.error(rRNA)
        logger.exception(e)

    reference_list: list[Reference] = record.annotations.get('references', [])
    # Get the pubmed id list from the references
    qual['pubmed_ids'] = [ref.pubmed_id for ref in reference_list if ref.pubmed_id]

    # Split the dbxref into a dict
    for dbxref in record.dbxrefs:
      split = dbxref.split(":")
      if len(split) > 1:
        qual[split[0]] = split[1]

    # Since all feature values are a list of string, (usually of length 1), convert to string
    for key in qual:
      if isinstance(qual[key], list):
        qual[key] = "; ".join(map(str, qual[key]))

    # Extract the colony info from the isolation_source
    qual_iso_source = qual.get('isolation_source') or ''
    if "Coral-associated microbial aggregate" in qual_iso_source:
      colony_hits = bracket_re.findall(qual_iso_source)
      if colony_hits:
        qual['colony'] = colony_hits[0]
        qual['isolation_source'] = None
      else:
        # No bracketed colony label: keep the isolation source as it is
        logger.warning(f"No colony label in brackets for {record.id}: {qual_iso_source!r}")
    # If the isolation_source mentions the host, move to the info to host column instead
    elif 'coral' in qual_iso_source or 'sponge' in qual_iso_source:
      qual['host'] = qual_iso_source
      qual['isolation_source'] = None

    # clean up host info
    qual_host: str = qual.get('host') or ''
    brk_results = bracket_re.findall(qual_host)
    if brk_results:
      # for cases like "<Species name> (gorgorian coral)", remove trailing text in the brackets
      if "coral" in brk_results[0]:
        qual_host = qual_host.split(" (")[0]
      # For cases like "<details> (Species name)", extract the species name
      else:
        qual_host = brk_results[0]
    if qual_host:
      qual_host = qual_host.strip().capitalize()
      qual['host'] = 'Coral' if qual_host == 'Sea coral' else qual_host
    else:
      # Missing or empty host
      qual['host'] = np.nan

    # As a safe guard, check that the sequence is not empty
    if record.seq:
      qual["sequence_length"] = len(record.seq)
      save_records.append(record)

    qualifiers.append(qual)

  # Load features into a dataframe
  df1 = DataFrame(qualifiers)
  # Turn the literal strings "None" and "nan" into real missing values. The
  # pattern is anchored: an unanchored regex would blank every cell that merely
  # contains "nan" (e.g. "Hainan").
  df1 = df1.replace(r"^(?:None|nan)$", value=np.nan, regex=True)

  # If not in debug mode, clean the table + save the files
  if debug:
    logger.debug(df1.head())
    return df1, save_records

  df2 = clean_table(df1, granularity)
  # Filter the records by the ids in the dataframe (set built once)
  keep_ids = set(df2['id'])
  filtered_recs = [rec for rec in save_records if rec.id in keep_ids]

  logger.info(f"Records to save: {len(filtered_recs)}/{len(save_records)}")

  # Save the records to a file
  save_file(f"{rec_type}_seq.gb", filtered_recs, "genbank")
  save_file(f"{rec_type}_seq.fasta", filtered_recs, "fasta")

  return df2, filtered_recs


## NCBI: Popset

### Search in NCBI (Popset)

As noted by the NCBI's website, their `Popset database` is a collection of related sequences that is sourced from a single population/phylogenetic/mutation/ecosystem study. 

This is a useful grouping, as we would be able to extract the locational data from the metadata and use it to create a map of the distribution of the bacteria.

In [8]:
def search_popset(additional_ids: list[str] = [], limit: int = 20):
  """
  Search for the popset database for the coral and Endozoicomonas.

  Any HTTP or parsing error is logged and reported as an empty result.

  :param `additional_ids`: Extra popset ids to fetch on top of the search hits
    (read only, never modified).
  :param `limit`: The maximum number of search hits to request.
  :return: The number of downloaded records, the list of popset ids, and the records.
  """
  # Closing bracket added: the opening "((" was left unbalanced
  term = "((Endozoicomonas[Organism] OR Endozoicomonas[All Fields]) AND coral[All Fields])"
  try:
    logger.debug(term)
    handle = Entrez.esearch(db="popset",
      term=term,
      retmax=limit,
    )
    try:
      res = Entrez.read(handle)
    finally:
      handle.close()
    if not isinstance(res, DictionaryElement):
      return 0, [], []

    # Build a new list so neither the parsed result nor the argument is mutated
    ids: list[str] = [*res["IdList"], *additional_ids]
    handle = Entrez.efetch(db="popset", id=ids, rettype="gb")
    try:
      records: list[SeqRecord] = list(SeqIO.parse(handle, "gb"))
    finally:
      handle.close()
    return len(records), ids, records
  except HTTPError as http_e:
    logger.error(str(http_e))
    return 0, [], []
  except Exception as e:
    logger.error(e)
    return 0, [], []

After the literature review carried out while inspecting the Nucleotide database, I found some popsets that were worth adding manually.

This includes the popset coming out of the work of {cite:p}`Speck_2012` (`"227461503"`), and the work of {cite:p}`Bayer_2013` (`"510829312"`).

In [9]:
# Popset ids added by hand, on top of the search results
additional_ids_pop = [
  '227461503',  # Speck_2012
  '510829312',  # Bayer_2013
]

In [10]:
# Either query NCBI again or re-use the GenBank file saved by an earlier run.
if SEARCH_NEW:
  count_pop, idlist_pop, records_pop = search_popset(additional_ids_pop)
else:


  # Offline mode: the ids come from the saved records themselves
  records_pop = list(SeqIO.parse(os.path.join(dir_path, "popset_seq.gb"), "gb"))
  count_pop = len(records_pop)
  idlist_pop = [record.id for record in records_pop]

### Summary of popset

Metadata table created for the data out of Popset database was saved to [features_of_popset_seq.csv](../../datasets/full_analysis/features_of_popset_seq.csv).

Genbank file was saved as [popset_seq.gb](../../datasets/full_analysis/popset_seq.gb).
Fasta file was saved as [popset_seq.fasta](../../datasets/full_analysis/popset_seq.fasta).

In [11]:
# Load the metadata, and save the records to a file
feat_df_pop0, recs_pop = table_features(count_pop, records_pop, "popset", granularity=GRANULARITY, debug=False)

feat_df_pop0.to_csv(os.path.join(dir_path, "features_of_popset_seq.csv"), index=False)

feat_df_pop = feat_df_pop0.copy()
col_interest = ['id', 'sequence_name', 'group', 'organism', 'strain', 'host', 'country', 'taxon', 'BioProject', 'pubmed_ids', 'isolation_source', 'product']
feat_df_pop = feat_df_pop[col_interest]

DEBUG:download_16s:Found 2640 popset, sequences: 2640
INFO:download_16s:Records to save: 2640/2640
DEBUG:download_16s:genbank 2640 Saved
DEBUG:download_16s:fasta 2640 Saved


### Geographic details

In [12]:
# Number of records per (country, isolation source, host); dropna=False keeps groups with missing values
feat_df_pop.groupby(['country', 'isolation_source','host'], dropna=False)['id'].count()

country       isolation_source          host                  
Bahamas       Tuna Alley (inside reef)  Plexaura sp.                1
France        NaN                       Corallium rubrum            1
Malaysia      NaN                       Coral reef ecosystems       5
Saudi Arabia  Southern Red Sea          Acropora humilis          322
                                        Pocillopora damicornis    496
                                        Stylophora pistillata     354
Taiwan        Green Island              Stylophora pistillata      17
              Kenting                   Stylophora pistillata      21
              Yeh Liu                   Stylophora pistillata       1
USA           Hillsboro Ledge           Eunicea fusca               1
              Summerland Key, Florida   Plexaura homomalla         25
              West Maui                 Porites compressa         758
NaN           NaN                       Coral                      58
                           

## NCBI: Nucleotide database

While **+1500** results is certainly enough to work with, I will expand the scope to include more sequences by searching directly through the nucleotide database.

I will combine the popset dataset and the findings below, and then remove duplicates.

### Search in NCBI (Nucleotide)

The review of literature revealed one nucleotide sequence that was not included by the search using `search_nucleo(...)` function, which found a match to a coral species found in the Red Sea {cite}`Pogoreutz2022`. We will manually note this, and add it to our list of sequences to download. 


Interestingly, looking more into the referenced sequence revealed that the original study had a total of 412 16S Endozoicomonas sequences to work with {cite}`Bayer_2013`. This will be added by adding the accession numbers by Popset (above).

Additionally, they {cite}`Bayer_2013` also provided a supplementary phylogenetic tree. We will use this to add more accession ids to our list of sequences to download.

```{figure} ../../outputs/full_analysis/bayer_2013_fig_s3_1pg_rotated.png
---
name: parsimony-tree-bayer-2013
---
Tree showing the phylogenetic relationship of Endozoicomonas to other bacterial species.
```

These are the accession IDs mentioned in the tree above.
| Accession ID | Status | Host | DOI |
|--------------|------- |----- |---- |
| GU118644  | Probable | Montastraea faveolata (coral) | {cite}`Sunagawa_2010` |
| GU118168  | Probable | Diploria strigosa (coral) |{cite}`Sunagawa_2010` |
| GU118379  | Probable | Gorgonia ventalina (coral) |{cite}`Sunagawa_2010` |
| GU118072  | Probable | Acropora palmata (coral) |{cite}`Sunagawa_2010` |
| GU118404  | Probable | Gorgonia ventalina (coral) |{cite}`Sunagawa_2010` |
| GU118957  | Probable | Porites astreoides (coral) |{cite}`Sunagawa_2010` |
| GU784983  | Probable | Sponge Ianthella basta |{cite}`Luter_2010` |
| GU118966  | Probable | Porites astreoides (coral) | {cite}`Sunagawa_2010` |
| AB695088  | Probable | Haplosclerida gen. et sp. (purple sponge) | {cite}`Nishijima_2013` |
| AM259915  | Probable | Chondrilla nucula (sponge) | {cite}`Thiel_2007` |
| DQ884169  | Probable (Uncultured Gammaproteobacteria) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ884170  | Probable (Gammaproteobacteria) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ884160  | Probable (Uncultured Gammaproteobacteria) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` | 
| DQ917901  | Probable | Muricea elongata (Octocoral) | {cite}`Ranzer2007` |
| AY700600  | Probable | Pocillopora damicornis (coral) | {cite}`Bourne_2005` |
| AY700601  | Probable | Pocillopora damicornis (coral) | {cite}`Bourne_2005` |
| FJ202634  | Probable, (uncultured bacterium) | Montastraea faveolata (coral) | {cite}`Sunagawa_2009` |
| FJ347758  | Probable | Montipora aequituberculata (coral) | {cite}`Yang_2010` | 
| FJ930289  | Probable, (Uncultured bacterium) | Porites compressa (coral) | {cite}`Speck_2012` |

> `DQ884169  (C19)` `DQ884170 (C23)`, `DQ884160 (CRNA5)` noted with "colony sequences C19, C23 and CRNA5 [...] formed a group related to Endozoiciomonas elysicola (97.6–99.7% similarity), a bacterium isolated from the marine mollusc Elysia ornata (M. Kurahashi, unpublished) {cite}`MartinezGarcia2006`"


From above tree, but will not be included.
| Accession ID | Status | Host | DOI |
|--------------|------- |----- |---- |
| AM503093  | Very unlikely (Marinobacter guineae) | Antarctic environment |  |
| AM229315  | Very unlikely (Halomonas janggokensis) | saline water |  |
| GU291858  | Unlikely (due to distance on tree) | solar saltern | {cite}`Joung2010` |
| AB205011  | Unlikely, Spongiobacter nickelotolerans | marine sponge | |
| AB196667  | Probable, not included | Elysia ornata (sea slug) | {cite}`Kurahashi_2007` |
| DQ917830  | Unlikely (Spongiobacter) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ917877  | Unlikely (Spongiobacter) | Cystodytes dellechiajei (tunicate) | {cite}`MartinezGarcia2006` |
| DQ917879  | Unlikely (Spongiobacter) | Muricea elongata (Octocoral) | {cite}`Ranzer2007` |
| GQ853555  | Probable, not included | Loripes lacteus (clam), gill symbiont | {cite}`Mausz2010` |
| FM162182  | Probable, not included | Bathymodiolus brooksi (mussels) | {cite}`Zielinski_2009` |
| FM163188  | Probable, not included | Bathymodiolus brooksi (mussels) | {cite}`Zielinski_2009` |
| FJ154998  | Probable, (uncultured bacterium) not included | ocean water | | 
| EU884930  | Probable, not included | Sixbar angelfish | |

Here are other accession IDs that were found when examining the references of the papers above.
| Accession ID | DOI |
|--------------|-----|
| AB695089 | {cite}`Nishijima_2013` |

In [13]:
def search_nucleo(limit: int, additional_ids: list[str] = []):
  """
  Search for the nucleotide database for nucleotides that mention coral and Endozoicomonas.

  Any HTTP or parsing error is logged and reported as an empty result.

  :param `limit`: The maximum number of results to return.
  :param `additional_ids`: Extra ids/accessions to fetch on top of the search
    hits (read only, never modified).
  :return: The number of downloaded records, the list of ids, and the records.
  """

  term = "(coral[All Fields] AND 16S[Title]) AND Endozoicomonas[Organism]"
  try:
    handle = Entrez.esearch(db="nucleotide",
      term=term,
      retmax=limit,
    )
    try:
      res = Entrez.read(handle)
    finally:
      handle.close()
    if not isinstance(res, DictionaryElement):
      return 0, [], []

    # Build a new list so neither the parsed result nor the argument is mutated
    ids: list[str] = [*res["IdList"], *additional_ids]
    logger.debug(ids)
    handle = Entrez.efetch(db="nucleotide", id=ids, rettype="gb")
    try:
      records: list[SeqRecord] = list(SeqIO.parse(handle, "gb"))
    finally:
      handle.close()
    return len(records), ids, records
  except HTTPError as http_e:
    # The response body usually carries NCBI's explanation of the failure
    logger.error(http_e.read())
    logger.exception(http_e)
    return 0, [], []
  except Exception as e:
    logger.error(e)
    logger.exception(e)
    return 0, [], []

In [14]:
# Accessions added by hand, on top of the search results
additional_accessions = ['AB695089', 'HE818343']
if SEARCH_NEW:
  count_nuc, idlist_nuc, records_nuc = search_nucleo(4000, additional_accessions)
else:
  # Offline mode: re-use the GenBank file saved by an earlier run
  records_nuc = list(SeqIO.parse(os.path.join(dir_path, "nucleotide_seq.gb"), "gb"))
  count_nuc = len(records_nuc)
  # Ids of the nucleotide records (this previously iterated over records_pop)
  idlist_nuc = [record.id for record in records_nuc]

### Summary of nucleotide database

We will save the...
| Type of data | File name |
|--------------|-----------|
| Metadata | [features_of_nucleotide_seq.csv](../../datasets/full_analysis/features_of_nucleotide_seq.csv) |
| Genbank | [nucleotide_seq.gb](../../datasets/full_analysis/nucleotide_seq.gb) |
| Fasta | [nucleotide_seq.fasta](../../datasets/full_analysis/nucleotide_seq.fasta) |

#### Everything combined (nucleotide + popset)

We will save the...
| Type of data | File name |
|--------------|-----------|
| Metadata | [feat_all_seq.csv](../../datasets/full_analysis/feat_all_seq.csv) |
| Genbank | [all_seq.gb](../../datasets/full_analysis/all_seq.gb) |
| Fasta | [all_seq.fasta](../../datasets/full_analysis/all_seq.fasta) |

In [15]:
from pandas import read_excel

# Hand-written coordinates, stored in the dataset folder with the other files.
# `dir_path` already points at datasets/full_analysis, so no "../.." detour is needed.
latlng_file = read_excel(os.path.join(dir_path, "handwritten_latlng.xlsx"), sheet_name="Sheet1")
latlng_file['country'] = latlng_file['country'].astype(str)
# switch every "nan" to np.nan
latlng_file.replace("nan", value=np.nan, inplace=True)

In [16]:
def annotate_groups(df0: DataFrame):
  """
  Add hand-curated location details to the combined metadata table.

  Works on a copy of `df0`. Applies the per-study corrections below, drops the
  West Maui rows except clone `C7-A01c` and three `NR_` reference records, and
  finally fills `doi`, `country`, `collection_date`, `latlng` and
  `isolation_source` from the module-level `latlng_file` table (columns
  `group`, `doi`, `country`, `date`, `latlng`, `source`).

  :param `df0`: Table with at least the columns `id`, `country`, `host`,
    `clone` and `isolation_source`.
  :return: The annotated copy, with the added columns `latlng`, `clone_num`,
    `doi` and `collection_date`.
  """
  df = df0.copy()

  # First, saudi arabia: both hosts were sampled at the same reef
  in_saudi = df['country'] == 'Saudi Arabia'
  is_acropora_h = df['host'] == 'Acropora humilis'
  is_pocillopora_d = df['host'] == 'Pocillopora damicornis'
  at_brown_reef = in_saudi & (is_acropora_h | is_pocillopora_d)
  df.loc[at_brown_reef, 'latlng'] = "19°52'24.48'N, 40°04'46.14'E"
  df.loc[at_brown_reef, 'isolation_source'] = "Red Sea, Brown reef"

  # Fix groups KC668 / KC669: the site number is embedded in the clone name
  is_stylopora = df['host'] == 'Stylophora pistillata'
  is_kc669_668 = df['id'].str.contains('KC669') | df['id'].str.contains('KC668')
  df.loc[is_kc669_668, 'country'] = 'Saudi Arabia'

  # NOTE(review): sites 05 and 12 carry identical coordinates; confirm this is
  # intended and not a copy-paste slip.
  site_latlng = {
    '05': "18°40'30.36'N, 40°44'21.18'E",
    '12': "18°40'30.36'N, 40°44'21.18'E",
    '14': "19°53'52.74'N, 40° 0'53.46'E",
    '15': "19°53'15.42'N, 40°9'23.94'E",
    '17': "20°8'58.38'N, 40°14'7.50'E",
  }
  red_sea_stylophora = is_kc669_668 & is_stylopora
  # Applied in order, so a later site wins if a clone name matches several codes
  for site_code, site_ltlng in site_latlng.items():
    # na=False: records without a clone name belong to no site
    at_site = df['clone'].str.contains(site_code, na=False)
    df.loc[red_sea_stylophora & at_site, 'latlng'] = site_ltlng
  df.loc[red_sea_stylophora, 'isolation_source'] = "Southern Red Sea"

  # remove extra maui: keep only clone C7-A01c
  is_Maui = df['isolation_source'] == 'West Maui'
  has_c7 = df['clone'] == 'C7-A01c'
  df = concat([df.loc[~is_Maui], df.loc[is_Maui & has_c7]])

  # deal with australia
  is_km3604 = df['id'].str.contains('KM3604')
  df.loc[is_km3604, 'country'] = 'Australia'
  df.loc[is_km3604, 'isolation_source'] = 'Orpheus Island Great Barrier Reef'

  # deal with taiwan
  is_taiwan_group = df['id'].str.contains('JN635')
  is_green_island = df['isolation_source'] == 'Green Island'
  green_island_taiwan = is_taiwan_group & is_green_island
  # Split the clone column; the number follows the letter "U"
  clone_parts = df.loc[green_island_taiwan, 'clone'].str.split('U', expand=True)
  # Guard against an empty selection or clone names without "U"; float keeps
  # missing clone names as NaN instead of failing the conversion.
  df['clone_num'] = clone_parts[1].astype(float) if 1 in clone_parts.columns else np.nan
  # if the clone num is less than 1000, it's a Kunguan group
  is_kunguan_group = df['clone_num'] < 1000
  # otherwise it's a Chaikou, Green Island
  is_chaikou_group = df['clone_num'] >= 1000
  df.loc[green_island_taiwan & is_kunguan_group, 'isolation_source'] = "Kunguan, Green Island"
  df.loc[green_island_taiwan & is_chaikou_group, 'isolation_source'] = "Chaikou, Green Island"

  # Remove some of the extra taiwan
  to_remove = ['NR_116609.1', 'NR_158127.1', 'NR_169415.1']
  df = df.loc[~df['id'].isin(to_remove)]

  # Fill in the hand-written details, one spreadsheet row at a time
  id_text = df['id'].astype(str)
  for _, row in latlng_file.iterrows():
    group = row['group']
    if not isinstance(group, str):
      # Blank spreadsheet row: nothing to match against
      continue
    # NOTE(review): `group` is matched as a regular expression anywhere in the id
    id_is_group = id_text.str.contains(group)
    df.loc[id_is_group, 'doi'] = row['doi']

    # Only fill the country where the record has none
    if isinstance(row['country'], str):
      df.loc[id_is_group & (df['country'].isna()), 'country'] = row['country']

    if group != 'JN635':
      df.loc[id_is_group, 'collection_date'] = row['date']
      df.loc[id_is_group, 'latlng'] = row['latlng']
      df.loc[id_is_group, 'isolation_source'] = row['source']
    else:
      # JN635 spans several sites: match on the isolation source as well
      # (the source itself already equals row['source'] for these rows)
      find_matching_taiwan = id_is_group & (df['isolation_source'] == row['source'])
      df.loc[find_matching_taiwan, 'collection_date'] = row['date']
      df.loc[find_matching_taiwan, 'latlng'] = row['latlng']

  # clean up: placeholder strings become real missing values. The pattern is
  # anchored because an unanchored regex would blank any cell merely
  # containing "nan" (e.g. "Hainan").
  df = df.replace(r"^(?:None|nan|NaN|)$", value=np.nan, regex=True)
  return df


In [17]:
# Build the cleaned metadata table for the nucleotide records (this also writes nucleotide_seq.gb / .fasta)
feat_df_nuc0, recs_nuc = table_features(count_nuc, records_nuc, "nucleotide", granularity=GRANULARITY, debug=False)
feat_df_nuc0.to_csv(os.path.join(dir_path, "features_of_nucleotide_seq.csv"), index=False)

DEBUG:download_16s:Found 155 nucleotide, sequences: 155
INFO:download_16s:Records to save: 155/155
DEBUG:download_16s:genbank 155 Saved
DEBUG:download_16s:fasta 155 Saved


In [26]:
# Stack both tables, nucleotide first so its row wins when an id appears in both.
# ignore_index gives every row a unique label; the two inputs both start at 0.
feat_df_all = concat([feat_df_nuc0, feat_df_pop0], ignore_index=True).drop_duplicates(subset="id")
logger.info(f"Records found: {len(feat_df_all['id'].to_list())}")
feat_df_all = annotate_groups(feat_df_all)
logger.info(f"Records found: {len(feat_df_all['id'].to_list())}")
feat_df_all.to_csv(os.path.join(dir_path, "feat_all_seq.csv"), index=False)

col_interest = ['id', 'country', 'group', 'host', 'isolation_source', 'sequence_length', 'clone', 'collection_date', 'latlng']
# Ids are unique here, so sorting by id fixes the row order completely; the
# earlier sort by sequence_length was overridden by this one and had no effect.
# NOTE(review): if the longest sequences were meant to come first within each
# group, that ordering has to be applied where the groups are sampled.
feat_df_all = feat_df_all[col_interest].sort_values(by=['id']).reset_index(drop=True)

INFO:download_16s:Records found: 1855


INFO:download_16s:Records found: 1095


In [27]:
# Merge both record lists, keeping one record per id that is still present in the combined table
unique_recs = unique([recs_nuc, recs_pop], feat_df_all)
logger.info(f"Unique records found: {len(unique_recs)} / {len(feat_df_all['id'].to_list())}")
# Save the records to a file
save_file("all_seq.gb", unique_recs, "genbank")
save_file("all_seq.fasta", unique_recs, "fasta")

INFO:download_16s:Unique records found: 1095 / 1095
DEBUG:download_16s:genbank 1095 Saved
DEBUG:download_16s:fasta 1095 Saved


### Geographic details

The following shows the breakdown by country and by isolation source, as reported in the metadata.

In [24]:
# Number of records per country; dropna=False keeps records without a country as their own group
feat_df_all.groupby(['country'], dropna=False)['id'].count()

country
Australia         3
Bahamas           1
France           21
Japan            40
Kuwait            1
Malaysia          5
Portugal          1
Saudi Arabia    924
Taiwan           70
USA              29
Name: id, dtype: int64

In [25]:
# Number of records per (country, isolation source). Count the ids, as in the
# other summaries: counting `isolation_source` itself skips missing values, so
# a group with no isolation source would be reported as 0.
feat_df_all.groupby(['country', 'isolation_source'], dropna=False)['id'].count()

country       isolation_source                   
Australia     Orpheus Island Great Barrier Reef        3
Bahamas       Bimini, Tuna Alley                       1
France        Bay of Villefranche-sur-Mer             20
              Porticcio, Corsica Island                1
Japan         Okinawa                                 39
              Shizuoka, Numazu                         1
Kuwait        Qit'at Bnaider                           1
Malaysia      Bidong Island, Terengganu                5
Portugal      Algarve, Gale Alta, Armacao de Pera      1
Saudi Arabia  Al-Fahal reef                           58
              Red Sea                                  2
              Red Sea, Brown reef                    409
              Southern Red Sea                       455
Taiwan        Chaikou, Green Island                   10
              Kenting                                 52
              Kunguan, Green Island                    7
              Yeh Liu                 

In [21]:
# Find all where latlng is nan (records that still lack coordinates)
feat_df_all.loc[feat_df_all['latlng'].isna()]


Unnamed: 0,id,country,group,host,isolation_source,sequence_length,clone,collection_date,latlng


In [22]:
# Columns that define one sampling group; reused below when picking sequences to align
grouping = ['country', 'group', 'host', 'isolation_source']
feat_df_all.groupby(grouping, dropna=False)['id'].count()

country       group  host                       isolation_source                   
Australia     KM360  Coral mucus                Orpheus Island Great Barrier Reef        3
Bahamas       JX488  Plexaura sp.               Bimini, Tuna Alley                       1
France        JQ691  Eunicella cavolini         Bay of Villefranche-sur-Mer             20
              KT964  Corallium rubrum           Porticcio, Corsica Island                1
Japan         AB695  Haplosclerida gen. et sp.  Shizuoka, Numazu                         1
              OL957  Stylophora pistillata      Okinawa                                 39
Kuwait        HM804  Coral mucus                Qit'at Bnaider                           1
Malaysia      MG896  Coral reef ecosystems      Bidong Island, Terengganu                5
Portugal      HE818  Marine sponge              Algarve, Gale Alta, Armacao de Pera      1
Saudi Arabia  KC668  Acropora humilis           Red Sea, Brown reef                    161
      

In [32]:
# Keep at most the first 10 rows of every group, in the table's current row order
to_align_df = feat_df_all.groupby(grouping, dropna=False).head(10)
to_align_df

Unnamed: 0,id,country,group,host,isolation_source,sequence_length,clone,collection_date,latlng
0,AB695089.1,Japan,AB695,Haplosclerida gen. et sp.,"Shizuoka, Numazu",1468,,2001-06-01,"35°03'51.8""N, 138°49'04.8""E"
1,FJ347758.1,Taiwan,FJ347,Coral,Kenting,1464,,2009-08-01,"21°54'25.9""N, 120°46'09.0""E"
2,FJ930289.1,USA,FJ930,Porites compressa,West Maui patch reef,1412,C7-A01c,2009-04-01,"20°48.399'N, 156° 36.064'W"
3,HE818343.1,Portugal,HE818,Marine sponge,"Algarve, Gale Alta, Armacao de Pera",1406,,2010-06-16,"37°04'09.6''N, 8°19'52.1''W"
4,HM804435.1,Kuwait,HM804,Coral mucus,Qit'at Bnaider,445,,2008-07-01,"28°47'52.9""N, 48°18'59.4""E"
...,...,...,...,...,...,...,...,...,...
1061,OL957509.1,Japan,OL957,Stylophora pistillata,Okinawa,416,OS2-3-2,2017-10-01,"26˚37'42.2''N, 127˚51'35.6''E"
1062,OL957510.1,Japan,OL957,Stylophora pistillata,Okinawa,419,OS2-4-1,2017-10-01,"26˚37'42.2''N, 127˚51'35.6''E"
1063,OL957511.1,Japan,OL957,Stylophora pistillata,Okinawa,419,OS2-4-2,2017-10-01,"26˚37'42.2''N, 127˚51'35.6''E"
1093,OM273412.1,USA,OM273,Eunicea flexuosa,"Florida Keys, Florida",1284,,2019-10-01,"25°11.9''N, 80°30.2''W"


In [33]:
# Build the id set once instead of rebuilding a list for every record
to_align_ids = set(to_align_df['id'])
to_align = [rec for rec in unique_recs if rec.id in to_align_ids]

save_file("to_align.fasta", to_align, "fasta")

DEBUG:download_16s:fasta 140 Saved


## Align the 16S sequences

Use muscle to align the sequences.

```bash
muscle5.1 -align ./datasets/full_analysis/to_align.fasta -output ./outputs/full_analysis/muscle/aligned_top_140.fasta
```