# North American 2022 EV-D68 outbreak analyses

In [15]:
# import packages
import scrubbing
import pandas as pd
import datetime_funcs

## EvD68 analyses of Only PHO samples for WGS, VP1 and VP1's Hyper-Variable Region

* Remove non-PHO sequences from the MAFFT-aligned FASTA file of NCBI and PHO WGS sequences.

In [None]:
# Going by the helper's name, this keeps only records whose tag starts with
# 'PHL' (argument roles inferred from the paths: source alignment, then
# destination file) -- confirm against the scrubbing module.
scrubbing.keep_if_tag_startswith(
    'WGS_analysis/WGS_aligned.fasta',
    'PHO_only_analysis/PHO_WGS_alignment.fasta',
    'PHL',
)

* Producing metadata.

In [2]:
# Read the WGS metadata table and replace its 'date' column with parsed
# datetimes in a single chained expression.
metadata = (
    pd.read_csv('WGS_analysis/WGS_metadata_updated.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)


In [11]:
# Keep only the rows whose Outbreak label is 'Canada 7-10/2022', then show the
# earliest and latest sample dates that remain.
metadata = metadata.loc[metadata['Outbreak'].eq('Canada 7-10/2022')]
print(metadata['date'].min(), 'to', metadata['date'].max())

2022-08-31 00:00:00 to 2022-10-06 00:00:00


In [None]:
# Add a real 'name' column mirroring 'Accession'.
# The previous `metadata.name = ...` attribute assignment does not create a
# DataFrame column (pandas only sets a Python attribute and warns), and
# assigning a column directly on the filtered frame triggers
# SettingWithCopyWarning. `assign` returns a fresh frame and avoids both.
metadata = metadata.assign(name=metadata['Accession'])

# Accession/date view that a later cell writes out as the tab-separated file.
metadata_TempEst = metadata[['Accession', 'date']]



In [None]:
# Tab-separated Accession/date table.
metadata_TempEst.to_csv(
    'PHO_only_analysis/PHO_metadata.tsv',
    sep='\t',
    index=False,
)

# Comma-separated name/date table; this is the file the `augur refine`
# cells in this section receive via --metadata.
metadata.loc[:, ['name', 'date']].to_csv(
    'PHO_only_analysis/PHO_metadata.csv',
    index=False,
)


### Creating a data set of only PHO sequences with younger duplicate sequences removed

In [None]:
# Ask the scrubbing helper for the IDs of duplicated sequences in the PHO-only
# WGS alignment; the bare name on the last line displays the result.
ids_of_duplicated_sequences = scrubbing.get_ids_of_duplicated_sequences(
    'PHO_only_analysis/PHO_WGS_alignment.fasta'
)
ids_of_duplicated_sequences

There are no duplicated sequences

### Building tree

In [None]:
%%bash
# Build a tree from the PHO-only WGS alignment; substitution model and thread
# count are both left on "auto".
augur tree \
    --alignment PHO_only_analysis/PHO_WGS_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output PHO_only_analysis/PHO_WGS_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the PHO WGS tree into a time tree using the name/date metadata CSV,
# reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment PHO_only_analysis/PHO_WGS_alignment.fasta \
    --tree PHO_only_analysis/PHO_WGS_tree.subs.nwk \
    --metadata PHO_only_analysis/PHO_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree PHO_only_analysis/PHO_WGS_tree.time.nwk \
    --output-node-data PHO_only_analysis/PHO_WGS_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2020.692 |
| Evolutionary rate | 3.378e-3 subs/site/year |

## VP1 gene

* Remove non-PHO sequences from the MAFFT-aligned FASTA file of NCBI and PHO VP1 sequences.

In [None]:
# Same 'PHL' tag-prefix filter as for the WGS alignment, applied to the VP1
# alignment (helper semantics inferred from its name -- confirm in scrubbing).
scrubbing.keep_if_tag_startswith(
    'VP1_analysis/VP1_alignment.fasta',
    'PHO_only_analysis/PHO_VP1_alignment.fasta',
    'PHL',
)

### Building tree

In [None]:
%%bash
# Build a tree from the PHO-only VP1 alignment; substitution model and thread
# count are both left on "auto".
augur tree \
    --alignment PHO_only_analysis/PHO_VP1_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output PHO_only_analysis/PHO_VP1_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the PHO VP1 tree into a time tree using the name/date metadata CSV,
# reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment PHO_only_analysis/PHO_VP1_alignment.fasta \
    --tree PHO_only_analysis/PHO_VP1_tree.subs.nwk \
    --metadata PHO_only_analysis/PHO_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree PHO_only_analysis/PHO_VP1_tree.time.nwk \
    --output-node-data PHO_only_analysis/PHO_VP1_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.3032 |
| Evolutionary rate | 1.5755e-2 subs/site/year |

## VP1 gene Hyper-Variable Region (H-VR)

Venkata has kindly trimmed the VP1 aligned file further to this region. We need to select the PHO sequences from it.

In [None]:
# Same 'PHL' tag-prefix filter, applied to the VP1 hyper-variable-region
# alignment (helper semantics inferred from its name -- confirm in scrubbing).
scrubbing.keep_if_tag_startswith(
    'VP1_HVR_analysis/VP1_HVR_alignment.fasta',
    'PHO_only_analysis/PHO_VP1_H-VR_alignment.fasta',
    'PHL',
)

* The VP1 metadata files from before are available. Therefore, only building tree and refining it are needed.

### Building tree

In [None]:
%%bash
# Build a tree from the PHO-only VP1 H-VR alignment; substitution model and
# thread count are both left on "auto".
augur tree \
    --alignment PHO_only_analysis/PHO_VP1_H-VR_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output PHO_only_analysis/PHO_VP1_H-VR_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the PHO VP1 H-VR tree into a time tree using the name/date metadata
# CSV, reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment PHO_only_analysis/PHO_VP1_H-VR_alignment.fasta \
    --tree PHO_only_analysis/PHO_VP1_H-VR_tree.subs.nwk \
    --metadata PHO_only_analysis/PHO_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree PHO_only_analysis/PHO_VP1_H-VR_tree.time.nwk \
    --output-node-data PHO_only_analysis/PHO_VP1_H-VR_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.6409 |
| Evolutionary rate | 5.1533e-3 subs/site/year |

## EvD68 analyses of Only JHL 2022 samples for WGS, VP1 and VP1's Hyper-Variable Region

## Selecting JHL 2022 sequences

* JHL 2022 metadata


In [16]:
# Reload the full WGS metadata table (the previous section filtered it) and
# replace its 'date' column with parsed datetimes in one chained expression.
metadata = (
    pd.read_csv('WGS_analysis/WGS_metadata_updated.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [17]:
# Number of rows in the freshly loaded metadata table.
metadata.shape[0]

1134

In [18]:
# Keep only the rows whose Outbreak label is 'USA 7-9/2022', then show the
# earliest and latest sample dates that remain.
metadata = metadata.loc[metadata['Outbreak'].eq('USA 7-9/2022')]
print(metadata['date'].min(), 'to', metadata['date'].max())


2022-07-08 00:00:00 to 2022-09-12 00:00:00


In [19]:
# Pass the latest sample date to the project helper; per its name it returns
# the date as a fractional year (the recorded output is a decimal year).
datetime_funcs.date_to_year_fraction(
    metadata.loc[:, 'date'].max()
)

2022.6957762557079

In [None]:
# Add a real 'name' column holding the accession.
# The previous `metadata.name = ...` attribute assignment does not create a
# DataFrame column (pandas only sets a Python attribute and warns), so the
# ['name', 'date'] selection below failed with a KeyError unless the CSV
# already carried a 'name' column. `assign` creates (or overwrites) the column
# on a fresh frame, which also avoids SettingWithCopyWarning on the filtered
# metadata.
metadata = metadata.assign(name=metadata['Accession'])

# Tab-separated Accession/date table.
metadata_TempEst = metadata[['Accession', 'date']]
metadata_TempEst.to_csv('JHL_only_analysis/JHL_metadata.tsv', index=False, sep='\t')

# Comma-separated name/date table; this is the file the `augur refine` cells
# in this section receive via --metadata.
metadata[['name', 'date']].to_csv('JHL_only_analysis/JHL_metadata.csv', index=False)


* Remove non-JHL sequences from the MAFFT-aligned FASTA file of NCBI and JHL WGS sequences.

In [None]:
# Accessions of the rows that survived the outbreak filter; reused by the VP1
# and H-VR cells further down in this section.
accessions_to_keep = metadata['Accession'].tolist()

# Going by the helper's name, drops every record whose accession is not in the
# list (argument roles inferred from the paths) -- confirm in scrubbing.
scrubbing.remove_if_accession_not_in_list(
    'WGS_analysis/WGS_aligned.fasta',
    'JHL_only_analysis/JHL_WGS_alignment.fasta',
    accessions_to_keep,
)

### Creating a data set of only JHL sequences with younger duplicate sequences removed

In [None]:
# Ask the scrubbing helper for the IDs of duplicated sequences in the JHL-only
# WGS alignment; the bare name on the last line displays the result.
ids_of_duplicated_sequences = scrubbing.get_ids_of_duplicated_sequences(
    'JHL_only_analysis/JHL_WGS_alignment.fasta'
)
ids_of_duplicated_sequences

There are no duplicated sequences

## WGS Work


### Building tree

In [None]:
%%bash
# Build a tree from the JHL-only WGS alignment; substitution model and thread
# count are both left on "auto".
augur tree \
    --alignment JHL_only_analysis/JHL_WGS_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output JHL_only_analysis/JHL_WGS_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the JHL WGS tree into a time tree using the name/date metadata CSV,
# reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment JHL_only_analysis/JHL_WGS_alignment.fasta \
    --tree JHL_only_analysis/JHL_WGS_tree.subs.nwk \
    --metadata JHL_only_analysis/JHL_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree JHL_only_analysis/JHL_WGS_tree.time.nwk \
    --output-node-data JHL_only_analysis/JHL_WGS_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.2436 |
| Evolutionary rate | 2.0072e-2 subs/site/year |

## VP1 gene

* Remove non-JHL sequences from the MAFFT-aligned FASTA file of NCBI and JHL VP1 sequences.

In [None]:
# Apply the accession list built for the WGS step to the VP1 alignment
# (helper semantics inferred from its name -- confirm in scrubbing).
scrubbing.remove_if_accession_not_in_list(
    'VP1_analysis/VP1_alignment.fasta',
    'JHL_only_analysis/JHL_VP1_alignment.fasta',
    accessions_to_keep,
)

### Building tree

In [None]:
%%bash
# Build a tree from the JHL-only VP1 alignment; substitution model and thread
# count are both left on "auto".
augur tree \
    --alignment JHL_only_analysis/JHL_VP1_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output JHL_only_analysis/JHL_VP1_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the JHL VP1 tree into a time tree using the name/date metadata CSV,
# reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment JHL_only_analysis/JHL_VP1_alignment.fasta \
    --tree JHL_only_analysis/JHL_VP1_tree.subs.nwk \
    --metadata JHL_only_analysis/JHL_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree JHL_only_analysis/JHL_VP1_tree.time.nwk \
    --output-node-data JHL_only_analysis/JHL_VP1_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.3367 |
| Evolutionary rate | 4.3955e-2 subs/site/year |

## VP1 gene Hyper-Variable Region (H-VR)

Venkata has kindly trimmed the VP1 aligned file further to this region. We need to select the JHL 2022 sequences from it.

In [None]:
# Apply the accession list built for the WGS step to the VP1
# hyper-variable-region alignment (helper semantics inferred from its name --
# confirm in scrubbing).
scrubbing.remove_if_accession_not_in_list(
    'VP1_HVR_analysis/VP1_HVR_alignment.fasta',
    'JHL_only_analysis/JHL_VP1_H-VR_alignment.fasta',
    accessions_to_keep,
)

* The VP1 metadata files from before are available. Therefore, only building tree and refining it are needed.

### Building tree

In [None]:
%%bash
# Build a tree from the JHL-only VP1 H-VR alignment; substitution model and
# thread count are both left on "auto".
augur tree \
    --alignment JHL_only_analysis/JHL_VP1_H-VR_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output JHL_only_analysis/JHL_VP1_H-VR_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the JHL VP1 H-VR tree into a time tree using the name/date metadata
# CSV, reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment JHL_only_analysis/JHL_VP1_H-VR_alignment.fasta \
    --tree JHL_only_analysis/JHL_VP1_H-VR_tree.subs.nwk \
    --metadata JHL_only_analysis/JHL_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree JHL_only_analysis/JHL_VP1_H-VR_tree.time.nwk \
    --output-node-data JHL_only_analysis/JHL_VP1_H-VR_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.1805 |
| Evolutionary rate | 3.86798e-2 subs/site/year |

# EvD68 analyses of PHO and JHL 2022 samples for WGS, VP1 and VP1's Hyper-Variable Region

## Selecting JHL and PHO 2022 sequences

* Load metadata


In [None]:
# Reload the full WGS metadata table (earlier sections filtered it).
metadata = pd.read_csv('WGS_analysis/WGS_metadata_updated.csv')

# Parse 'date' exactly as the PHO-only and JHL-only sections do, so the
# metadata files exported for the combined analysis carry dates in the same
# format as the per-lab exports instead of the raw CSV strings.
metadata['date'] = pd.to_datetime(metadata['date'])


In [None]:
# How many rows carry each Outbreak label.
metadata['Outbreak'].value_counts()

In [None]:
# Keep the rows from both 2022 outbreak labels: 'USA 7-9/2022' and
# 'Canada 7-10/2022'.
metadata = metadata.loc[
    metadata['Outbreak'].isin(['USA 7-9/2022', 'Canada 7-10/2022'])
]



In [None]:
# Add a real 'name' column holding the accession.
# The previous `metadata.name = ...` attribute assignment does not create a
# DataFrame column (pandas only sets a Python attribute and warns), so the
# ['name', 'date'] selection below failed with a KeyError unless the CSV
# already carried a 'name' column. `assign` creates (or overwrites) the column
# on a fresh frame, which also avoids SettingWithCopyWarning on the filtered
# metadata.
metadata = metadata.assign(name=metadata['Accession'])

# Tab-separated Accession/date table.
metadata_TempEst = metadata[['Accession', 'date']]
metadata_TempEst.to_csv('North_America_2022_analysis/PHO_and_JHL_metadata.tsv', index=False, sep='\t')

# Comma-separated name/date table; this is the file the `augur refine` cells
# in this section receive via --metadata.
metadata[['name', 'date']].to_csv('North_America_2022_analysis/PHO_and_JHL_metadata.csv', index=False)


* Select only the PHO and JHL 2022 sequences from the MAFFT-aligned FASTA file of NCBI and PHO WGS sequences.

In [None]:
# Accessions of the rows kept by the two-outbreak filter; reused by the VP1
# and H-VR cells further down in this section.
accessions_to_keep = metadata['Accession'].tolist()

# Going by the helper's name, drops every record whose accession is not in the
# list (argument roles inferred from the paths) -- confirm in scrubbing.
scrubbing.remove_if_accession_not_in_list(
    'WGS_analysis/WGS_aligned.fasta',
    'North_America_2022_analysis/PHO_and_JHL_WGS_alignment.fasta',
    accessions_to_keep,
)

### Creating a set with younger duplicated sequences removed

In [None]:
# Ask the scrubbing helper for the IDs of duplicated sequences in the combined
# PHO + JHL WGS alignment; the bare name on the last line displays the result.
ids_of_duplicated_sequences = scrubbing.get_ids_of_duplicated_sequences(
    'North_America_2022_analysis/PHO_and_JHL_WGS_alignment.fasta'
)
ids_of_duplicated_sequences

There are no duplicated sequences.

## WGS Work


### Building tree

In [None]:
%%bash
# Build a tree from the combined PHO + JHL WGS alignment; substitution model
# and thread count are both left on "auto".
augur tree \
    --alignment North_America_2022_analysis/PHO_and_JHL_WGS_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output North_America_2022_analysis/PHO_and_JHL_WGS_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the combined WGS tree into a time tree using the name/date metadata
# CSV, reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment North_America_2022_analysis/PHO_and_JHL_WGS_alignment.fasta \
    --tree North_America_2022_analysis/PHO_and_JHL_WGS_tree.subs.nwk \
    --metadata North_America_2022_analysis/PHO_and_JHL_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree North_America_2022_analysis/PHO_and_JHL_WGS_tree.time.nwk \
    --output-node-data North_America_2022_analysis/PHO_and_JHL_WGS_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2021.0939 |
| Evolutionary rate | 3.3251e-3 subs/site/year |

## VP1 gene

* Keep only the PHO and JHL 2022 sequences from the MAFFT-aligned VP1 FASTA file (`VP1_analysis/VP1_alignment.fasta`).

In [None]:
# Apply the accession list built for the WGS step to the VP1 alignment
# (helper semantics inferred from its name -- confirm in scrubbing).
scrubbing.remove_if_accession_not_in_list(
    'VP1_analysis/VP1_alignment.fasta',
    'North_America_2022_analysis/PHO_and_JHL_VP1_alignment.fasta',
    accessions_to_keep,
)

### Building tree

In [None]:
%%bash
# Build a tree from the combined PHO + JHL VP1 alignment; substitution model
# and thread count are both left on "auto".
augur tree \
    --alignment North_America_2022_analysis/PHO_and_JHL_VP1_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output North_America_2022_analysis/PHO_and_JHL_VP1_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the combined VP1 tree into a time tree using the name/date metadata
# CSV, reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment North_America_2022_analysis/PHO_and_JHL_VP1_alignment.fasta \
    --tree North_America_2022_analysis/PHO_and_JHL_VP1_tree.subs.nwk \
    --metadata North_America_2022_analysis/PHO_and_JHL_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree North_America_2022_analysis/PHO_and_JHL_VP1_tree.time.nwk \
    --output-node-data North_America_2022_analysis/PHO_and_JHL_VP1_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.0177 |
| Evolutionary rate | 1.1934e-2 subs/site/year |

## VP1 gene Hyper-Variable Region (H-VR)

Venkata has kindly trimmed the VP1 aligned file further to this region. We need to select the PHO and JHL 2022 sequences from it.

In [None]:
# Apply the accession list built for the WGS step to the VP1
# hyper-variable-region alignment (helper semantics inferred from its name --
# confirm in scrubbing).
scrubbing.remove_if_accession_not_in_list(
    'VP1_HVR_analysis/VP1_HVR_alignment.fasta',
    'North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_alignment.fasta',
    accessions_to_keep,
)

* The VP1 metadata files from before are available. Therefore, only building tree and refining it are needed.

### Building tree

In [None]:
%%bash
# Build a tree from the combined PHO + JHL VP1 H-VR alignment; substitution
# model and thread count are both left on "auto".
augur tree \
    --alignment North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_alignment.fasta \
    --substitution-model auto \
    --nthreads auto \
    --output North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_tree.subs.nwk

### Refining Tree

* Refining Tree with regards to date. 

In [None]:
%%bash
# Refine the combined VP1 H-VR tree into a time tree using the name/date
# metadata CSV, reporting divergence in mutations and keeping the existing root.
augur refine \
    --alignment North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_alignment.fasta \
    --tree North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_tree.subs.nwk \
    --metadata North_America_2022_analysis/PHO_and_JHL_metadata.csv \
    --timetree \
    --divergence-units mutations \
    --keep-root \
    --output-tree North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_tree.time.nwk \
    --output-node-data North_America_2022_analysis/PHO_and_JHL_VP1_H-VR_tree.node.json

### Table of TempEst analyses

| Statistic | Value |
|-----------|-------|
| Time of Most Recent Common Ancestor (decimal year) | 2022.6766 |
| Evolutionary rate | 1.4242e-2 subs/site/year |