# Download: 16S rRNA sequences

Since 16S shows more promising results for identifying the phylogeny of bacterial taxa, I will download a new set, but specifically look at entries under "popset" first. If there aren't enough sequences, I will expand the scope to include more sequences.

## Setup the environment

We will utilize the package [`BioPython`](https://biopython.org/wiki/Documentation) {cite}`Cock_2009` to interface with sequence data hosted on NCBI. We will also use `pandas` {cite}`pandasdevelopmentteam2023, McKinney2010` to store metadata in a dataframe.

In [2]:
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, Reference
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, UndefinedSequenceError
from Bio.Entrez.Parser import DictionaryElement
from Bio.Entrez import HTTPError

import os
import logging
import re

from pandas import DataFrame, notna, merge, concat, read_csv, to_datetime, read_excel
import numpy as np
from lat_lon_parser import parse
from datetime import date

logging.basicConfig(level=logging.DEBUG)

In [3]:
from Bio import Entrez
# Contact e-mail attached to the Entrez queries made by this notebook
Entrez.email = "kaedeito@student.ubc.ca"
# This line sets the name of the tool that is making the queries
Entrez.tool = "download_16s.ipynb"

# When True the popset cell queries NCBI; when False it re-reads the GenBank file saved earlier
SEARCH_NEW = False
# Number of leading id characters used to build the `group` column (passed to table_features)
GRANULARITY = 5
# Logger shared by every helper in this notebook
logger = logging.getLogger("download_16s")

Create a folder to save our fasta and genbank files.

In [4]:
# Build the path from its components so it resolves on every OS: a literal
# "..\\..\\datasets\\full_analysis" is a single (oddly named) directory on POSIX systems.
dir_path = os.path.realpath(os.path.join("..", "..", "datasets", "full_analysis"))
os.makedirs(dir_path, exist_ok=True)

## Clean and wrangle tables

We need to clean the data to make it easier to work with.
We need to load the metadata into a tabular format to make it easier to find problems, or to find interesting patterns.

In [5]:
def clean_table(df: DataFrame, granularity:int):
  """
  Clean the dataframe by removing rows and cleaning string formatting.

  The input is not modified; all work happens on a copy. As a side effect, the rows whose
  organism is neither "Endozoicomonas" nor "uncultured bacterium" are written to
  `not_endo.csv` inside `dir_path` (they are only exported, not removed from the result).

  :param `df`: The dataframe to clean. The code reads the columns `country`, `id`,
    `isolation_source`, `organism`, `product`, `strain` and `db_xref`.
  :param `granularity`: The number of characters to group the ids by.
  :return: The cleaned dataframe, restricted to rows whose `product` mentions 16S,
    with one extra column per `db_xref` key and a fresh 0..n-1 index.
  """
  feat_df = df.copy()

  # Split "country: locality" at the FIRST colon only. `reindex` guarantees exactly two
  # columns, so the assignment works whether no value or every value contains a colon.
  country_parts = feat_df['country'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
  feat_df['country'] = country_parts[0]
  # Trim whitespace from the locality part
  feat_df['iso_source2'] = country_parts[1].map(lambda x: x.strip() if isinstance(x, str) else x)

  # Group the ids by the first n characters
  feat_df['group'] = feat_df['id'].str[:granularity]

  # Fill the isolation_source column with the iso_source2 column if it is null
  feat_df['isolation_source'] = feat_df['isolation_source'].fillna(feat_df['iso_source2'])

  # Hard-coded correction for the KC668 accession group
  is_kc668 = feat_df['group'] == 'KC668'
  feat_df.loc[is_kc668, 'country'] = 'Saudi Arabia'
  feat_df.loc[is_kc668, 'isolation_source'] = 'Southern Red Sea'

  # Export (but do not drop) the rows that are neither Endozoicomonas nor "uncultured bacterium"
  is_expected_taxon = feat_df['organism'].astype(str).str.contains('Endozoicomonas|uncultured bacterium', regex=True)
  not_endo_df = feat_df[notna(feat_df['organism']) & ~is_expected_taxon]
  not_endo_df.to_csv(os.path.join(dir_path, 'not_endo.csv'), index=False)

  # Keep only the rows that mention 16S; copy so the writes below hit an independent frame
  is_16s = notna(feat_df['product']) & feat_df['product'].astype(str).str.contains('16S')
  feat_df = feat_df[is_16s].copy()

  # Use the colony name as strain when no strain is given
  if 'colony' in feat_df.columns:
    feat_df['strain'] = feat_df['strain'].fillna(feat_df['colony'])

  if 'BioProject' not in feat_df.columns:
    feat_df['BioProject'] = np.nan

  # Split the db_xref column ("key:value") and turn each key into its own column
  xref_parts = feat_df['db_xref'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
  feat_df_nuc_xref = feat_df[['id', 'db_xref']].copy()
  feat_df_nuc_xref['xref'] = xref_parts[0]
  feat_df_nuc_xref['value'] = xref_parts[1]

  feat_df_nuc_xref = feat_df_nuc_xref.drop_duplicates(subset=['id']).reset_index(drop=True)
  feat_df_nuc_xref = feat_df_nuc_xref.pivot(index=['id'], columns='xref', values='value')
  feat_df = merge(feat_df, feat_df_nuc_xref, on="id")

  # replace every "Ken Ting" with "Kenting"; assign the result back because an inplace
  # replace on a selected column is not guaranteed to reach the parent frame
  feat_df['isolation_source'] = feat_df['isolation_source'].replace('Ken Ting', 'Kenting')

  # remove the columns that are not needed
  feat_df = feat_df.drop(columns=['db_xref'])

  # Reset the index (the result of reset_index must be used, it does not act in place)
  return feat_df.reset_index(drop=True)

In [6]:
def unique(to_combine: list[list[SeqRecord]], all_df: DataFrame):
    """
    Merge several record lists, dropping repeated ids and ids missing from the table.

    :param `to_combine`: Lists of records; they are scanned in the given order.
    :param `all_df`: Dataframe whose `id` column lists the ids that may be kept.
    :return: The first record seen for every id that also appears in `all_df['id']`,
      in order of first appearance.
    """
    # Sets give constant-time membership tests; the previous list lookups were quadratic
    wanted_ids = set(all_df['id'].to_list())
    seen_ids: set[str] = set()
    keep_list: list[SeqRecord] = []

    for batch in to_combine:
        for rec in batch:
            if rec.id in seen_ids:
                continue
            seen_ids.add(rec.id)
            if rec.id in wanted_ids:
                keep_list.append(rec)

    return keep_list

In [7]:
def save_file(file_name, records, rec_type):
  """
  Write sequence records to a file located in `dir_path`.

  A failed write is logged and swallowed, so the caller never sees an exception
  from `SeqIO.write`.

  `file_name`: Name of the output file (joined onto `dir_path`).

  `records`: The records handed to `SeqIO.write`.

  `rec_type`: The `SeqIO` format name, also used in the log messages.
  """
  destination = os.path.join(dir_path, file_name)
  with open(destination, "w") as out_handle:
    try:
      written = SeqIO.write(records, out_handle, rec_type)
      logger.debug(f"{rec_type} {written} Saved")
    except Exception as err:
      # Both original handlers (UndefinedSequenceError and Exception) logged the same
      # two messages, so a single handler is used here.
      logger.error(err)
      logger.error(f"Failed to write {rec_type} to file {file_name}")

In [8]:
def table_features(count: int, records: list[SeqRecord], rec_type: str, granularity=5, debug: bool = False):
  """
  Build a metadata table from GenBank records and, unless `debug` is set, save the kept records.

  1. Extracts the features (both source and rRNA)
  2. Based on cleaned up list of features, filter records for matching ids
  3. Save the records to a file

  Records without an rRNA feature, and records whose annotations contain `contig`, are skipped.
  Note that `record.description` is truncated at " 16S" on every record that is visited.

  :param `count`: Number of records reported by the caller; only used in the log message.
  :param `records`: The records to tabulate.
  :param `rec_type`: Label used in log messages and as prefix of the output file names.
  :param `granularity`: Forwarded to `clean_table`.
  :param `debug`: If True, return the uncleaned table and all records with a sequence,
    without calling `clean_table` or writing any file.
  :return: A `(dataframe, records)` tuple.
  :raises ValueError: If a record has no source feature.
  """
  logger.debug(f"Found {count} {rec_type}, sequences: {len(records)}")
  qualifiers: list[dict[str, str]] = []
  save_records: list[SeqRecord] = []

  # Text between the first "(" and the next ")"
  bracket_pattern = re.compile(r"(?<=\().+?(?=\))")

  for record in records:
    feats: list[SeqFeature] = record.features

    # Unpacking `[source] = filter(...)` raised a bare unpacking error before the intended
    # message could be produced; look the feature up explicitly instead.
    source = next((f for f in feats if f.type == "source"), None)
    if source is None:
      raise ValueError("No source feature")

    # Copy the qualifiers and add the id
    # merge the rRNA qualifiers to the same dict
    qual = source.qualifiers.copy()
    qual["id"] = record.id

    seq_desc = record.description.split(" 16S")
    qual["sequence_name"] = seq_desc[0]
    record.description = seq_desc[0]

    # Filter for the rRNA features
    rRNA_filtered = [f for f in feats if f.type == "rRNA"]
    if not rRNA_filtered:
      logger.debug("No rRNA feature")
      continue
    if 'contig' in record.annotations:
      logger.debug("Is a contig")
      continue

    for rRNA in rRNA_filtered:
      try:
        qual.update(rRNA.qualifiers)
      except Exception as e:
        logger.error(rRNA)
        logger.exception(e)

    reference_list: list[Reference] = record.annotations.get('references', [])
    # Get the pubmed id list from the references
    qual['pubmed_ids'] = [ref.pubmed_id for ref in reference_list if ref.pubmed_id]

    # Split the dbxref into a dict
    for dbxref in record.dbxrefs:
      split = dbxref.split(":")
      if len(split) > 1:
        qual[split[0]] = split[1]

    # Since all feature values are a list of string, (usually of length 1), convert to string
    for key in qual:
      qual_list = qual[key]
      if isinstance(qual_list, list):
        qual[key] = "; ".join(qual_list)

    # Extract the colony info from the isolation_source
    qual_iso_source = qual.get('isolation_source', '')
    if "Coral-associated microbial aggregate" in qual_iso_source:
      # Take the first bracketed text as colony name; when there is none the
      # isolation_source is left untouched instead of raising an unpacking error.
      colony_match = bracket_pattern.search(qual_iso_source)
      if colony_match:
        qual['isolation_source'] = None
        qual['colony'] = colony_match.group(0)
    # If the isolation_source mentions the host, move to the info to host column instead
    elif 'coral' in qual_iso_source or 'sponge' in qual_iso_source:
      qual['host'] = qual_iso_source
      qual['isolation_source'] = None

    # clean up host info
    qual_host: str = qual.get('host', '')
    brk_results = bracket_pattern.findall(qual_host)
    if brk_results:
      # for cases like "<Species name> (gorgorian coral)", remove trailing text in the brackets
      if "coral" in brk_results[0]:
        qual['host'] = qual_host.split(" (")[0]
      # For cases like "<details> (Species name)", extract the species name
      else:
        qual['host'] = brk_results[0]

    # Missing or empty host becomes NaN; anything else is trimmed and capitalised.
    # (The former `!= np.nan` test was always true, because NaN never compares equal.)
    host = qual.get('host')
    if not host:
      qual['host'] = np.nan
    else:
      host = host.strip().capitalize()
      qual['host'] = 'Coral' if host == 'Sea coral' else host

    # As a safe guard, check that the sequence is not empty
    if record.seq:
      sequence: Seq = record.seq
      qual["sequence_length"] = len(sequence)
      save_records.append(record)

    qualifiers.append(qual)

  # Load features into a dataframe
  df1 = DataFrame(qualifiers)
  # Clean the table by replacing "None" and "nan".
  # (The former `replace(np.NaN, np.nan)` step was a no-op and `np.NaN` no longer exists in NumPy 2.)
  df1.replace("None", value=np.nan, inplace=True)
  # NOTE(review): with regex=True the pattern is unanchored, so a cell that merely
  # contains "nan" may be blanked as well - confirm whether r"^nan$" was intended.
  df1.replace("nan", value=np.nan, regex=True, inplace=True)

  # If not in debug mode, clean the table + save the files
  if debug:
    logger.debug(df1.head())
    return df1, save_records

  df2 = clean_table(df1, granularity)
  # Filter the records by the ids in the dataframe (set built once, not once per record)
  kept_ids = set(df2['id'].to_list())
  filtered_recs = [rec for rec in save_records if rec.id in kept_ids]

  logger.info(f"Records to save: {len(filtered_recs)}/{len(save_records)}")

  # Save the records to a file
  save_file(f"{rec_type}_seq.gb", filtered_recs, "genbank")
  save_file(f"{rec_type}_seq.fasta", filtered_recs, "fasta")

  return df2, filtered_recs


## NCBI: Popset

### Search in NCBI (Popset)

As noted by the NCBI's website, their `Popset database` is a collection of related sequences that is sourced from a single population/phylogenetic/mutation/ecosystem study. 

This is a useful grouping, as we would be able to extract the locational data from the metadata and use it to create a map of the distribution of the bacteria.

In [9]:
def search_popset(additional_ids: list[str] = []):
  """
  Search for the popset database for the coral and Endozoicomonas.

  At most 20 search hits are requested. Any error is logged and reported as an empty result.

  :param `additional_ids`: Extra popset ids to fetch on top of the search hits
    (the argument itself is never modified).
  :return: The number of fetched records, the list of ids and the list of records;
    `(0, [], [])` when the search or the download fails.
  """
  # The previous term opened two parentheses but closed only one
  term = "(Endozoicomonas[Organism] OR Endozoicomonas[All Fields]) AND coral[All Fields]"
  try:
    logger.debug(term)
    handle = Entrez.esearch(db="popset",
      term=term,
      retmax=20,
    )
    try:
      res = Entrez.read(handle)
    finally:
      # Release the connection even when parsing fails
      handle.close()

    if not isinstance(res, DictionaryElement):
      return 0, [], []

    ids: list[str] = list(res["IdList"])
    ids.extend(additional_ids)
    handle = Entrez.efetch(db="popset", id=ids, rettype="gb")
    try:
      records: list[SeqRecord] = list(SeqIO.parse(handle, "gb"))
    finally:
      handle.close()
    return len(records), ids, records
  except Exception as e:
    # HTTPError and every other failure were handled identically: log and return nothing
    logger.error(e)
    return 0, [], []

After the literature review done while inspecting the Nucleotide DB, I found some popsets that were worth adding manually.

This includes the popset coming out of the work of {cite:p}`Speck_2012` (`"227461503"`), and the work of {cite:p}`Bayer_2013` (`"510829312"`).

In [10]:
# Popset ids added by hand on top of the search results
additional_ids_pop = [
  '227461503',  # Speck_2012
  '510829312',  # Bayer_2013
]

In [11]:
if not SEARCH_NEW:
  # Re-use the GenBank file stored in dir_path instead of querying NCBI
  records_pop = list(SeqIO.parse(os.path.join(dir_path, "popset_seq.gb"), "gb"))
  count_pop = len(records_pop)
  idlist_pop = [record.id for record in records_pop]
else:
  # Fresh search, including the manually added popset ids
  count_pop, idlist_pop, records_pop = search_popset(additional_ids_pop)

### Summary of popset

Metadata table created for the data out of Popset database was saved to [features_of_popset_seq.csv](../../datasets/full_analysis/features_of_popset_seq.csv).

Genbank file was saved as [popset_seq.gb](../../datasets/full_analysis/popset_seq.gb).
Fasta file was saved as [popset_seq.fasta](../../datasets/full_analysis/popset_seq.fasta).

In [12]:
# Tabulate the popset metadata; with debug=False this call also writes the
# popset_seq.gb / popset_seq.fasta files
feat_df_pop0, recs_pop = table_features(
  count_pop,
  records_pop,
  "popset",
  granularity=GRANULARITY,
  debug=False,
)

feat_df_pop0.to_csv(os.path.join(dir_path, "features_of_popset_seq.csv"), index=False)

# Narrow an independent copy of the table down to the columns of interest
col_interest = ['id', 'sequence_name', 'group', 'organism', 'strain', 'host', 'country', 'taxon', 'BioProject', 'pubmed_ids', 'isolation_source', 'product']
feat_df_pop = feat_df_pop0.copy()[col_interest]

DEBUG:download_16s:Found 2640 popset, sequences: 2640


INFO:download_16s:Records to save: 2640/2640
DEBUG:download_16s:genbank 2640 Saved
DEBUG:download_16s:fasta 2640 Saved


### Geographic details

In [13]:
# Number of popset records per (country, isolation_source, host); missing values form their own groups
feat_df_pop.groupby(['country', 'isolation_source','host'], dropna=False)['id'].count()

country       isolation_source          host                  
Bahamas       Tuna Alley (inside reef)  Plexaura sp.                1
France        NaN                       Corallium rubrum            1
Malaysia      NaN                       Coral reef ecosystems       5
Saudi Arabia  Southern Red Sea          Acropora humilis          322
                                        Pocillopora damicornis    496
                                        Stylophora pistillata     354
Taiwan        Green Island              Stylophora pistillata      17
              Kenting                   Stylophora pistillata      21
              Yeh Liu                   Stylophora pistillata       1
USA           Hillsboro Ledge           Eunicea fusca               1
              Summerland Key, Florida   Plexaura homomalla         25
              West Maui                 Porites compressa         758
NaN           NaN                       Coral                      58
                           

## NCBI: Nucleotide database

While **1500+** results are more than enough to work with, I will expand the scope to include more sequences by searching directly through the nucleotide database.

I will combine the popset dataset and the findings below, and then remove duplicates.

### Search in NCBI (Nucleotide)

The review of literature revealed one nucleotide sequence that was not included by the search using `search_nucleo(...)` function, which found a match to a coral species found in the Red Sea {cite}`Pogoreutz2022`. We will manually note this, and add it to our list of sequences to download. 


Interestingly, looking more into the referenced sequence revealed that the original study had a total of 412 16S Endozoicomonas sequences to work with {cite}`Bayer_2013`. This will be added by adding the accession numbers by Popset (above).

Additionally, they {cite}`Bayer_2013` also provided a supplementary phylogenetic tree. We will use this to add more accession ids to our list of sequences to download.

```{figure} ../../outputs/full_analysis/bayer_2013_fig_s3_1pg_rotated.png
---
name: parsimony-tree-bayer-2013
---
Tree showing the phylogenetic relationship of Endozoicomonas to other bacterial species.
```

These are the accession IDs mentioned in the tree above.
```{csv-table}
   :file: ../../datasets/full_analysis/extra_accessions.csv
   :header-rows: 1
```

> `DQ884169  (C19)` `DQ884170 (C23)`, `DQ884160 (CRNA5)` noted with "colony sequences C19, C23 and CRNA5 [...] formed a group related to Endozoiciomonas elysicola (97.6–99.7% similarity), a bacterium isolated from the marine mollusc Elysia ornata (M. Kurahashi, unpublished) {cite}`MartinezGarcia2006`"

In [14]:
def search_nucleo(limit: int, additional_ids: list[str] = []):
  """
  Search for the nucleotide database for nucleotides that mention coral and Endozoicomonas.

  Any error is logged (with traceback) and reported as an empty result.

  :param `limit`: The maximum number of results to return.
  :param `additional_ids`: Extra ids/accessions to fetch on top of the search hits
    (the argument itself is never modified).
  :return: The number of fetched records, the list of ids and the list of records;
    `(0, [], [])` when the search or the download fails.
  """

  term = "(coral[All Fields] AND 16S[Title]) AND Endozoicomonas[Organism]"
  try:
    handle = Entrez.esearch(db="nucleotide",
      term=term,
      retmax=limit,
    )
    try:
      res = Entrez.read(handle)
    finally:
      # Release the connection even when parsing fails
      handle.close()

    if not isinstance(res, DictionaryElement):
      return 0, [], []

    ids: list[str] = list(res["IdList"])
    ids.extend(additional_ids)
    logger.debug(ids)
    handle = Entrez.efetch(db="nucleotide", id=ids, rettype="gb")
    try:
      records: list[SeqRecord] = list(SeqIO.parse(handle, "gb"))
    finally:
      handle.close()
    return len(records), ids, records
  except HTTPError as http_e:
    # The response body usually carries NCBI's explanation of the failure
    logger.error(http_e.read())
    logger.exception(http_e)
    return 0, [], []
  except Exception as e:
    # logger.exception already records the message together with the traceback
    logger.exception(e)
    return 0, [], []

In [15]:

# Accessions to add to / exclude from the search results, both kept in CSV files
more_ids = read_csv(os.path.realpath("../../datasets/full_analysis/extra_accessions.csv"), header=0)
remove_ids_df = read_csv(os.path.realpath("../../datasets/full_analysis/excluded_accessions.csv"))
remove_ids = remove_ids_df['Accession ID'].to_list()

additional_accessions = more_ids['Accession ID'].to_list()
# NOTE: this overrides the SEARCH_NEW flag set during setup, so NCBI is always queried here
SEARCH_NEW = True
if SEARCH_NEW:
  count_nuc, idlist_nuc, records_nuc = search_nucleo(4000, additional_accessions)
else:
  records_nuc: list[SeqRecord] = list(SeqIO.parse(os.path.join(dir_path, "nucleotide_seq.gb"), "gb"))

# Drop the excluded accessions (set lookup instead of scanning the list for every record)
remove_id_set = set(remove_ids)
records_nuc = [rec for rec in records_nuc if rec.id not in remove_id_set]
count_nuc = len(records_nuc)
# Ids of the nucleotide records (this used to be built from records_pop by mistake)
idlist_nuc = [record.id for record in records_nuc]

DEBUG:download_16s:['2178380722', '2178380718', '2168685149', '2168685148', '2168685147', '2168685146', '2168685145', '2168685144', '2168685143', '2168685142', '2168685141', '2168685140', '2168685139', '2168685138', '2168685137', '2168685136', '2168685135', '2168685134', '2168685133', '2168685132', '2168685131', '2168685130', '2168685129', '2168685128', '2168685127', '2168685126', '2168685125', '2168685124', '2168685123', '2168685122', '2168685121', '2168685120', '2168685119', '2168685118', '2168685117', '2168685116', '2168685115', '2168685114', '2168685113', '2168685112', '2168685111', '2168685110', '2168685109', '2168685108', '2168685107', '2168685106', '2168685105', '2168685104', '2168685103', '2168685102', '2168685101', '2168685100', '2168685099', '2168685098', '2168685097', '2168685096', '2168685095', '2168685094', '2168685093', '2168685092', '2168685091', '2168685090', '2168685089', '2168685088', '2168685087', '2168685086', '2168685085', '2168685084', '2168685083', '1967466870', 

### Summary of nucleotide database

We will save the...
| Type of data | File name |
|--------------|-----------|
| Metadata | [features_of_nucleotide_seq.csv](../../datasets/full_analysis/features_of_nucleotide_seq.csv) |
| Genbank | [nucleotide_seq.gb](../../datasets/full_analysis/nucleotide_seq.gb) |
| Fasta | [nucleotide_seq.fasta](../../datasets/full_analysis/nucleotide_seq.fasta) |

## Everything combined (nucleotide + popset)

We will save the...
| Type of data | File name |
|--------------|-----------|
| Metadata | [feat_all_seq.csv](../../datasets/full_analysis/feat_all_seq.csv) |
| Genbank | [all_seq.gb](../../datasets/full_analysis/all_seq.gb) |
| Fasta | [all_seq.fasta](../../datasets/full_analysis/all_seq.fasta) |

In [16]:

def load_latlng():
  """
  Load the hand-written coordinate sheet and add decimal coordinates.

  Reads sheet "Sheet1" of `handwritten_latlng.xlsx`, splits the `latlng` column at the
  comma into `lat` and `long`, and converts both with `parse` into `lat_dec` / `long_dec`.

  :return: The loaded dataframe with the added `lat`, `long`, `lat_dec` and `long_dec` columns.
  """
  latlng_file = read_excel(os.path.join(dir_path, "../../datasets/full_analysis/handwritten_latlng.xlsx"), sheet_name="Sheet1")
  latlng_file['country'] = latlng_file['country'].astype(str)
  # switch every "nan" to np.nan
  latlng_file.replace("nan", value=np.nan, inplace=True)
  latlng_file[['lat','long']] = latlng_file['latlng'].str.split(',',expand=True)
  # str.strip() returns a new Series; the result has to be stored to have any effect
  latlng_file['lat'] = latlng_file['lat'].str.strip()
  latlng_file['long'] = latlng_file['long'].str.strip()
  latlng_file['lat_dec'] = latlng_file['lat'].map(parse)
  latlng_file['long_dec'] = latlng_file['long'].map(parse)
  return latlng_file


In [17]:
# Read the hand-written coordinate sheet; annotate_groups reads this module-level table
latlng_file = load_latlng()

def annotate_groups(df0: DataFrame):
  """
  Add hand-curated location, date and host annotations to the combined metadata table.

  Works on a copy of `df0`. Hard-coded corrections are applied first (Saudi Arabia groups
  KC668/KC669, West Maui, group KM3604, Taiwan group JN635, a few removed ids); afterwards
  every row of the module-level `latlng_file` table is applied to the records whose id
  contains that row's `group` value.

  :param `df0`: Metadata table; the code reads the columns `id`, `country`, `host`,
    `clone` and `isolation_source`.
  :return: The annotated copy, with `collection_date` converted by `to_datetime`.
  """
  df = df0.copy()
  # First, saudi arabia
  # Fix group KC669

  is_kc669_668 = df['id'].str.contains('KC669') | df['id'].str.contains('KC668')
  df.loc[is_kc669_668, 'country'] = 'Saudi Arabia'
  is_acropora_h = (df['country'] == 'Saudi Arabia') & (df['host'] == 'Acropora humilis')
  is_pocillopora_d = (df['country'] == 'Saudi Arabia') & (df['host'] == 'Pocillopora damicornis')

  df.loc[is_acropora_h, 'latlng'] = "19°52'24.48'N, 40°04'46.14'E"
  df.loc[is_acropora_h, 'isolation_source'] = "Red Sea, Brown reef"


  df.loc[is_pocillopora_d, 'latlng'] = "19°52'24.48'N, 40°04'46.14'E"
  df.loc[is_pocillopora_d, 'isolation_source'] = "Red Sea, Brown reef"
  df.loc[is_pocillopora_d | is_acropora_h, 'collection_date'] = date(2010, 7, 1)

  is_stylopora = df['host'] == 'Stylophora pistillata'

  # Site number taken from the clone name; na=False keeps the masks strictly boolean
  # for rows without a clone value
  is_site_5 = df['clone'].str.contains('05', na=False)
  is_site_12 = df['clone'].str.contains('12', na=False)
  is_site_14 = df['clone'].str.contains('14', na=False)
  is_site_15 = df['clone'].str.contains('15', na=False)
  is_site_17 = df['clone'].str.contains('17', na=False)

  # NOTE(review): `|` stamps this date on every Stylophora pistillata row and on every
  # KC668/KC669 row, including the Acropora/Pocillopora rows dated just above; the lines
  # below combine the same masks with `&` - confirm which one was intended.
  df.loc[is_kc669_668 | is_stylopora, 'collection_date'] = date(2009, 6, 1)
  df.loc[is_kc669_668 & is_stylopora & is_site_5, 'latlng'] = "18°40'30.36'N, 40°44'21.18'E"
  df.loc[is_kc669_668 & is_stylopora & is_site_12, 'latlng'] = "18°40'30.36'N, 40°44'21.18'E"
  df.loc[is_kc669_668 & is_stylopora & is_site_14, 'latlng'] = "19°53'52.74'N, 40° 0'53.46'E"
  df.loc[is_kc669_668 & is_stylopora & is_site_15, 'latlng'] = "19°53'15.42'N, 40°9'23.94'E"
  df.loc[is_kc669_668 & is_stylopora & is_site_17, 'latlng'] = "20°8'58.38'N, 40°14'7.50'E"
  df.loc[is_kc669_668 & is_stylopora, 'isolation_source'] = "Southern Red Sea"
  df.replace(r"nan",np.nan, regex=True, inplace=True)
  # set lat and long by splitting the latlng column in saudi_group
  is_saudi = df['country'] == 'Saudi Arabia'
  latlng_saudi = df.loc[is_saudi, 'latlng'].str.split(',', n=2, expand=True)
  df.loc[is_saudi, 'lat'] = latlng_saudi[0]
  df.loc[is_saudi, 'long'] = latlng_saudi[1]
  df.loc[is_saudi, 'lat_dec'] = df.loc[is_saudi, 'lat'].map(parse)
  df.loc[is_saudi, 'long_dec'] = df.loc[is_saudi, 'long'].map(parse)

  # remove extra maui: of all West Maui rows only clone C7-A01c is kept
  is_Maui = df['isolation_source'] == 'West Maui'
  has_c7 = df['clone'] == 'C7-A01c'
  keep_df = df.loc[is_Maui & has_c7].copy()
  df = df.loc[~is_Maui]
  df = concat([df, keep_df])

  # deal with australia
  df.loc[df['id'].str.contains('KM3604'), 'country'] = 'Australia'
  df.loc[df['id'].str.contains('KM3604'), 'isolation_source'] = 'Orpheus Island Great Barrier Reef'

  # deal with taiwan
  is_taiwan_group = df['id'].str.contains('JN635')
  is_green_island = df['isolation_source'] == 'Green Island'
  # Split the clone column
  df['clone_num'] = df.loc[is_taiwan_group & is_green_island]['clone'].str.split('U', expand=True)[1].astype(int)
  # if the clone num is less than 1000, it's a Kunguan group
  is_kunguan_group = df['clone_num'] < 1000
  # otherwise it's a Chaikou, Green Island
  is_chaikou_group = df['clone_num'] >= 1000
  df.loc[is_taiwan_group & is_green_island & is_kunguan_group, 'isolation_source'] = "Kunguan, Green Island"
  df.loc[is_taiwan_group & is_green_island & is_chaikou_group, 'isolation_source'] = "Chaikou, Green Island"

  # Remove some of the extra taiwan; copy so the assignments below act on an independent frame
  to_remove = ['NR_116609.1', 'NR_158127.1', 'NR_169415.1']
  df = df.loc[~df['id'].isin(to_remove)].copy()

  # if the host is any of "Coral", or "Coral mucus" or "Marine sponge" or "Coral reef ecosystems" set to nan
  df.loc[df['host'].isin(['Coral', 'Coral mucus', 'Marine sponge', 'Coral reef ecosystems']), 'host'] = np.nan

  # replace "gut" in isolation_source with np.nan; assign the result back because an
  # inplace replace on a selected column is not guaranteed to reach the parent frame
  df['isolation_source'] = df['isolation_source'].replace('gut', np.nan)

  # set isolation_source to np.nan where id = "FJ202634"
  df.loc[df['id'] == 'FJ202634.1', 'isolation_source'] = np.nan

  # Apply the hand-written sheet, one row per accession group
  for _, row in latlng_file.iterrows():
    group = row['group']

    id_is_group = df['id'].str.contains(group)
    df.loc[id_is_group, 'doi'] = row['doi']


    # Country and host from the sheet only fill gaps, they never overwrite
    if isinstance(row['country'], str):
      df.loc[id_is_group & (df['country'].isna()), 'country'] = row['country']

    if isinstance(row['host'], str):
      df.loc[id_is_group & (df['host'].isna()), 'host'] = row['host']

    if group != 'JN635':
      df.loc[id_is_group, 'collection_date'] = row['date']
      df.loc[id_is_group, 'latlng'] = row['latlng']
      df.loc[id_is_group, 'lat_dec'] = row['lat_dec']
      df.loc[id_is_group, 'long_dec'] = row['long_dec']

      if isinstance(row['source'], str) and row['source'] != '':
        df.loc[id_is_group & (df['isolation_source'].isna()), 'isolation_source'] = row['source']

    else:
      # JN635 rows are matched per isolation source
      find_matching_taiwan = id_is_group & (df['isolation_source'] == row['source'])
      df.loc[find_matching_taiwan, 'collection_date'] = row['date']
      df.loc[find_matching_taiwan, 'latlng'] = row['latlng']

      # NOTE(review): the decimal coordinates are written to the WHOLE group rather than to
      # `find_matching_taiwan`, so the last JN635 sheet row wins - confirm this is intended.
      df.loc[id_is_group, 'lat_dec'] = row['lat_dec']
      df.loc[id_is_group, 'long_dec'] = row['long_dec']

  df['collection_date'] = to_datetime(df['collection_date'])

  return df


In [18]:
# Tabulate the nucleotide metadata; with debug=False this call also writes the
# nucleotide_seq.gb / nucleotide_seq.fasta files
feat_df_nuc0, recs_nuc = table_features(count_nuc, records_nuc, "nucleotide", granularity=GRANULARITY, debug=False)
# Store the metadata table next to the sequence files
feat_df_nuc0.to_csv(os.path.join(dir_path, "features_of_nucleotide_seq.csv"), index=False)

DEBUG:download_16s:Found 187 nucleotide, sequences: 187
INFO:download_16s:Records to save: 187/187
DEBUG:download_16s:genbank 187 Saved
DEBUG:download_16s:fasta 187 Saved


In [19]:
# Stack both tables and keep the first row of every id (nucleotide rows win over popset rows).
# ignore_index=True gives every row its own label; both inputs are numbered from 0, and
# annotate_groups relies on label-aligned assignments that need unique labels.
feat_df_all = concat([feat_df_nuc0, feat_df_pop0], ignore_index=True).drop_duplicates(subset="id")

logger.info(f"Records found: {len(feat_df_all['id'].to_list())}")
feat_df_all = annotate_groups(feat_df_all)

logger.info(f"Records found: {len(feat_df_all['id'].to_list())}")

col_interest = ['id', 'country', 'group', 'host', 'isolation_source', 'sequence_length', 'sequence_name', 'collection_date', 'latlng', 'lat_dec', 'long_dec']
# alternative column set: ['id', 'clone', 'sequence_name', 'group', 'organism', 'host', 'country', 'pubmed_ids', 'isolation_source','sequence_length']
# Order by id; sequence_length (longest first) only breaks ties. A single multi-key sort is
# used because the order produced by a first sort is not guaranteed to survive a second one.
feat_df_all = (
  feat_df_all[col_interest]
  .sort_values(by=['id', 'sequence_length'], ascending=[True, False])
  .reset_index(drop=True)
)
feat_df_all.to_csv(os.path.join(dir_path, "feat_all_seq.csv"), index=False)

INFO:download_16s:Records found: 1883


INFO:download_16s:Records found: 1123


In [20]:
# Sanity check: show the rows of the combined table that have no collection_date
feat_df_all.loc[feat_df_all['collection_date'].isna()]

Unnamed: 0,id,country,group,host,isolation_source,sequence_length,sequence_name,collection_date,latlng,lat_dec,long_dec


In [21]:
# Sanity check: show the rows of the combined table that have no decimal latitude
feat_df_all.loc[feat_df_all['lat_dec'].isna()]

Unnamed: 0,id,country,group,host,isolation_source,sequence_length,sequence_name,collection_date,latlng,lat_dec,long_dec


In [22]:
# Sanity check: show the rows of the combined table that have no decimal longitude
feat_df_all.loc[feat_df_all['long_dec'].isna()]

Unnamed: 0,id,country,group,host,isolation_source,sequence_length,sequence_name,collection_date,latlng,lat_dec,long_dec


In [23]:
# Merge both record lists, keeping one record per id that is present in the combined table
unique_recs = unique([recs_nuc, recs_pop], feat_df_all)
logger.info(f"Unique records found: {len(unique_recs)} / {len(feat_df_all['id'].to_list())}")
# Write the merged records once per output format
for out_name, out_format in (("all_seq.gb", "genbank"), ("all_seq.fasta", "fasta")):
  save_file(out_name, unique_recs, out_format)
print(feat_df_all.columns)


INFO:download_16s:Unique records found: 1123 / 1123


DEBUG:download_16s:genbank 1123 Saved
DEBUG:download_16s:fasta 1123 Saved


Index(['id', 'country', 'group', 'host', 'isolation_source', 'sequence_length',
       'sequence_name', 'collection_date', 'latlng', 'lat_dec', 'long_dec'],
      dtype='object')


### Geographic details

Below is the breakdown by country and by isolation source, as reported by the metadata.

In [24]:
# Keep at most the first two rows of every (country, isolation_source) combination
country_source = feat_df_all.groupby(['country', 'isolation_source'], dropna=False).head(2)
# Export country, isolation source and id of the sampled rows, index included
country_source[['country', 'isolation_source', 'id']].to_csv(os.path.join(dir_path, "country_source.csv"), index=True)
# Number of sampled rows
country_source['id'].count()

56

In [25]:
# Columns that define one sampling group; `grouping` is reused when the alignment subset is picked
grouping = ['country', 'group', 'host', 'isolation_source']
# Number of records per group; missing values form their own groups
feat_df_all.groupby(grouping, dropna=False)['id'].count()

country         group  host                               isolation_source                                                    
Atlantic Ocean  FM162  Bathymodiolus puteoserpentis       Mid-Atlantic Ridge, Logatchev hydrothermal vent field, IRINA II site      1
Australia       AY700  Pocillopora damicornis             Great Barrier Reef                                                        2
                GU784  Ianthella basta                    Masig Island, central Torres Strait                                       1
                KM360  Lobophytum compactum               Orpheus Island Great Barrier Reef                                         3
Bahamas         JX488  Plexaura sp.                       Tuna Alley (inside reef)                                                  1
Brazil          GU183  Mucus of apparently healthy coral  Preta Beach, Sao Sebastiao Channel, Sao Paulo                             1
                KJ372  Arenosclera brasiliensis           João Fernan

We must load the outgroup sequences.

In [60]:
# Outgroup sequences and their metadata live in the other_taxa_16S folder inside dir_path.
# The paths are joined from components so the cell runs on every OS.
outgroup_dir = os.path.join(dir_path, "other_taxa_16S")
# list files that end with .fna in other_taxa_16S; sorted because os.listdir gives no order guarantee
list_fna = sorted(f for f in os.listdir(outgroup_dir) if f.endswith('.fna'))
# load the fasta files
other_taxa : list[SeqRecord] = []
for fna in list_fna:
  other_taxa.extend(SeqIO.parse(os.path.join(outgroup_dir, fna), "fasta"))

# load the metadata from other_taxa.csv in the same folder
other_taxa_df = read_csv(os.path.join(outgroup_dir, "other_taxa.csv"))
# split latlng into lat and long
other_taxa_df[['lat','long']] = other_taxa_df['latlng'].str.split(' ',expand=True)
# read collection_date as date
other_taxa_df['collection_date'] = to_datetime(other_taxa_df['collection_date'])
# parse lat and long into decimal
other_taxa_df['lat_dec'] = other_taxa_df['lat'].map(parse)
other_taxa_df['long_dec'] = other_taxa_df['long'].map(parse)

# the textual halves are no longer needed once the decimal values exist
other_taxa_df = other_taxa_df.drop(columns=['lat', 'long'])

In [61]:
# Preview the first rows of the outgroup metadata
other_taxa_df.head()

Unnamed: 0,id,doi,collection_date,latlng,lat_dec,long_dec
0,NZ_JAEVHF010000042.1:129-1670,10.1099/ijsem.0.005055,2007-04-01,"37°24'06.0""N 130°13'36.4""E",37.401667,130.226778
1,NZ_PZJW01000013.1:348-1887,10.1099/ijsem.0.003061,2010-11-11,"40°39'32""N 8°48'40""W",40.658889,-8.811111
2,NZ_FWPT01000026.1:38-1347,10.1016/j.syapm.2017.11.004,2017-03-24,"50°34'08""N 8°40'22""E",50.568889,8.672778
3,NZ_SMME01000314.1:1-536,10.1099/ijsem.0.002781,2018-07-09,24.0776°N 74.476°W,24.0776,-74.476
4,NZ_QWBW01000001.1:22479-24016,10.1007/s00284-019-01674-z,2015-07-01,"17°57'48""S 38°41'30""Z",-17.963333,38.691667


In [56]:
def save_tip_dates(df0: DataFrame, file_name: str):
  """
  Write a headerless, tab-separated file of record ids and collection dates.

  The file is placed in `../../outputs/full_analysis` relative to `dir_path`;
  dates are formatted as YYYY-MM-DD. `df0` itself is left unchanged.

  :param `df0`: Table with the columns `id`, `latlng` and a datetime `collection_date`.
  :param `file_name`: Name of the output file.
  """
  tip_dates = df0.drop(['latlng'], axis=1)
  tip_dates['collection_date_str'] = tip_dates['collection_date'].dt.strftime('%Y-%m-%d')
  out_path = os.path.join(dir_path, "../../outputs/full_analysis", file_name)
  export = tip_dates[['id', 'collection_date_str']]
  export.to_csv(out_path, index=False, sep="\t", header=False)

In [42]:
def save_trait_values(df0: DataFrame, file_name: str):
  """
  Write a tab-separated file with the columns `id`, `lat` and `long`.

  `lat` / `long` are the `lat_dec` / `long_dec` columns of `df0`. The file is placed in
  `../../outputs/full_analysis` relative to `dir_path`, and the number of rows is printed.
  `df0` itself is left unchanged.

  :param `df0`: Table with the columns `id`, `lat_dec` and `long_dec`.
  :param `file_name`: Name of the output file.
  """
  # Only the three exported columns are needed. The date string that used to be computed
  # here was never written and only made the function fail without a datetime column.
  traits = df0[['id', 'lat_dec', 'long_dec']].rename(columns={'lat_dec': 'lat', 'long_dec': 'long'})

  print(len(traits))
  traits.to_csv(os.path.join(dir_path, "../../outputs/full_analysis", file_name), index=False, sep="\t")

In [64]:
# Keep at most the first five rows of every sampling group
to_align_df = feat_df_all.groupby(grouping, dropna=False).head(5)

# Build the id lookup once instead of converting the column to a list for every record
to_align_ids = set(to_align_df['id'])
to_align = [rec for rec in unique_recs if rec.id in to_align_ids]
# concatenate other_taxa and to_align
all_with_other = to_align + other_taxa

all_with_other_df = concat([to_align_df, other_taxa_df])
print(len(all_with_other_df))

117


In [65]:
# Export coordinates and collection dates; the row count is part of both file names
save_trait_values(all_with_other_df, f"{len(all_with_other_df)}_trait_values.txt")
save_tip_dates(all_with_other_df, f"{len(all_with_other_df)}_tip_dates.txt")
# NOTE(review): the fasta export is disabled, yet the muscle command below reads a
# "<n>_to_align.fasta" file - presumably it was written by an earlier run; confirm.
# save_file(f"{len(all_with_other)}_to_align.fasta", all_with_other, "fasta")

117


## Align the 16S sequences

Use muscle to align the sequences.

```bash
muscle5.1 -align ./datasets/full_analysis/117_to_align.fasta -output ./outputs/full_analysis/muscle/aligned_top_117.fasta
```

Rename the description in the fasta file to make using the file in BEAST easier.

In [31]:
# Read aligned fasta file
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

# Folder holding the muscle output; joined from components so the cell runs on every OS
# (the read path used backslashes while the write path used forward slashes)
muscle_dir = os.path.realpath(os.path.join("..", "..", "outputs", "full_analysis", "muscle"))

alignment = AlignIO.read(os.path.join(muscle_dir, f"aligned_top_{len(all_with_other)}.fasta"), "fasta")
if isinstance(alignment, MultipleSeqAlignment):
  print(f"Alignment length {alignment.get_alignment_length()}")
  print(f"Number of records {len(alignment)}")
  # Replace every description by the bare id
  for seq in alignment:
    seq.description = seq.id

  # Save the alignment to a file
  AlignIO.write(alignment, os.path.join(muscle_dir, f"{len(all_with_other)}_aligned_name_replaced.fasta"), "fasta")


Alignment length 1713
Number of records 96
