Documentation Updates

HallemLab · Oct 13, 2020 · f1e71c0 · f1e71c0
1 parent 2025ea2
commit f1e71c0
Show file tree

Hide file tree

Showing 29 changed files with 4,224 additions and 4,679 deletions.
diff --git a/Static/ceusage.csv → Data/Ce_usage_counts.csv b/Static/ceusage.csv → Data/Ce_usage_counts.csv
diff --git a/Data/Sr_top50_usage_counts.csv b/Data/Sr_top50_usage_counts.csv
@@ -0,0 +1,62 @@
+AA,Codon,Count
+Ala,GCA,156
+Ala,GCC,165
+Ala,GCG,1
+Ala,GCT,609
+Cys,TGT,185
+Cys,TGC,60
+Asp,GAC,139
+Asp,GAT,450
+Glu,GAA,586
+Glu,GAG,100
+Phe,TTC,215
+Phe,TTT,197
+Gly,GGA,675
+Gly,GGC,23
+Gly,GGG,8
+Gly,GGT,358
+His,CAC,91
+His,CAT,127
+Ile,ATA,67
+Ile,ATC,171
+Ile,ATT,425
+Lys,AAA,660
+Lys,AAG,244
+Leu,CTA,7
+Leu,CTC,60
+Leu,CTG,3
+Leu,CTT,413
+Leu,TTA,123
+Leu,TTG,97
+Met,ATG,237
+Asn,AAC,157
+Asn,AAT,310
+Pro,CCA,680
+Pro,CCC,11
+Pro,CCG,3
+Pro,CCT,81
+Gln,CAA,391
+Gln,CAG,13
+Arg,AGA,218
+Arg,CGA,10
+Arg,AGG,10
+Arg,CGC,29
+Arg,CGG,1
+Arg,CGT,283
+Ser,AGC,31
+Ser,TCA,220
+Ser,TCC,55
+Ser,AGT,86
+Ser,TCG,13
+Ser,TCT,320
+Thr,ACA,237
+Thr,ACC,119
+Thr,ACG,7
+Thr,ACT,317
+Val,GTA,139
+Val,GTC,117
+Val,GTG,6
+Val,GTT,489
+Trp,TGG,98
+Tyr,TAC,136
+Tyr,TAT,217
diff --git a/Data/calculate_codon_usage_rules.R b/Data/calculate_codon_usage_rules.R
@@ -0,0 +1,52 @@
+# This script takes species-specific counts of codon occurrences and calculates the frequency with which each codon "i" encodes amino acid "AA"
+# These values will be passed to the quantification of relative adaptiveness, the first step for calculating codon adaptation index.
+
+# Generate tibble with stop codon codes; each stop codon is labelled "*" and
+# assigned a frequency of 0.
+# NOTE: factor("*", "*", "*") would treat the 2nd and 3rd "*" as the `levels`
+# and `labels` arguments and return a length-1 factor (which only works through
+# tibble recycling), so the three values are supplied as one explicit vector.
+stop_cdns <- tibble(AA = factor(c("*", "*", "*")),
+                    Codon = c("TAA", "TAG", "TGA"),
+                    Frequency = c(0, 0, 0))
+
+# Load S. ratti count data
+# Source: Mitreva et al 2006; counts taken from 50 most common expressed sequence tag clusters (putative genes)
+Sr.dat <- read_csv('Sr_top50_usage_counts.csv',
+                   quote = "",
+                   col_types = 'fcd')
+
+# Convert codon counts into the percentage of each amino acid's total count
+# contributed by each codon, then append the stop codon rows.
+Sr.codon.freq <- Sr.dat %>%
+    # Recode amino acid labels with seqinr::a() before sorting
+    dplyr::mutate(AA = seqinr::a(AA)) %>%
+    dplyr::arrange(AA, Codon) %>%
+    dplyr::mutate(AA = factor(AA)) %>%
+    # Percentages are computed within each amino acid
+    dplyr::group_by(AA) %>%
+    dplyr::mutate(Frequency = signif(Count / sum(Count) * 100, digits = 9)) %>%
+    # Drop the grouping so it does not leak into the joins below
+    dplyr::ungroup() %>%
+    # Stop codon rows match nothing in the count data, so they are appended
+    dplyr::full_join(stop_cdns, by = c("AA", "Codon", "Frequency")) %>%
+    dplyr::rename("Sr_optimal" = "Frequency") %>%
+    dplyr::select(!Count)
+
+# Load C. elegans count data
+# Source: Sharp and Bradnam, 1997; https://www.ncbi.nlm.nih.gov/books/NBK20194/
+Ce.dat <- read_csv('Ce_usage_counts.csv', 
+                   quote = "", 
+                   col_types = 'ccd'
+)
+
+# Convert codon counts into the percentage of each amino acid's total count
+# contributed by each codon, then append the stop codon rows.
+Ce.codon.freq <- Ce.dat %>%
+    # Recode amino acid labels with seqinr::a() before sorting
+    dplyr::mutate(AA = seqinr::a(AA)) %>%
+    dplyr::arrange(AA, Codon) %>%
+    dplyr::mutate(AA = factor(AA)) %>%
+    # Percentages are computed within each amino acid
+    dplyr::group_by(AA) %>%
+    dplyr::mutate(Frequency = signif(Count / sum(Count) * 100, digits = 9)) %>%
+    # Drop the grouping so it does not leak into the joins below
+    dplyr::ungroup() %>%
+    # Stop codon rows match nothing in the count data, so they are appended
+    dplyr::full_join(stop_cdns, by = c("AA", "Codon", "Frequency")) %>%
+    dplyr::rename("Ce_optimal" = "Frequency") %>%
+    dplyr::select(!Count)
+
+# Combine the S. ratti and C. elegans frequency tables into a single chart,
+# matching rows on amino acid and codon.
+codon_usage_chart <- dplyr::full_join(Sr.codon.freq, 
+                                      Ce.codon.freq,
+                                      by = c("AA", "Codon")
+)
+
+# Save the combined chart. The output file is passed positionally because
+# readr 1.4.0 renamed this argument from `path` to `file`; the positional form
+# works with both older and newer readr versions.
+write_csv(codon_usage_chart,
+          "codon_usage_chart.csv")
diff --git a/Offline Analysis/Ce_chemoreceptors.csv b/Offline Analysis/Ce_chemoreceptors.csv
diff --git a/Offline Analysis/Ce_chemosensory_cDNA_list.csv b/Offline Analysis/Ce_chemosensory_cDNA_list.csv
diff --git a/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.Rmd b/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.Rmd
@@ -1,13 +1,13 @@
 ---
 title: "Codon Usage Chemoreceptor Adaptiveness"
-date: "10/2/2020"
+date: "10/7/2020"
 output:
-  html_document:
-    code_folding: hide
+  pdf_document:
     df_print: paged
-    toc: yes
-  html_notebook:
-    toc: yes
+    toc: true
+    toc_depth: 3
+    number_sections: true
+
 ---
 
 # Introduction  
@@ -19,7 +19,7 @@ Full lists of CDS sequences for *S. stercoralis*, *S. ratti*, *S. papillosus*, *
 
 For user-defined genes-of-interest, a list of *S. stercoralis* chemoreceptor genes was used as input to the *Strongyloides* Codon Adapter App; the generated excel report is uploaded below.  
 
-```{r setup, include=FALSE}
+```{r setup, echo=FALSE}
 suppressPackageStartupMessages({
   library(knitr)
   library(rmarkdown)
@@ -31,22 +31,23 @@ suppressPackageStartupMessages({
   library(biomaRt)
   library(ggplot2)
 })
-knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
 ```
 
 ## Parse Species-specific lists of chemoreceptors 
 Cleaning and data wrangling for chemoreceptor gene lists; necessary preprocessing step before running lists of genes through the *Strongyloides* Codon Adapter App. Gene lists were downloaded from a Shiny app provided for this purpose by [Wheeler *et al* 2020](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3000723). Species for which we have lists of chemoreceptors are: *C. elegans*, *S. stercoralis*, *S. ratti*, and *S. venezuelensis*. Data are saved as 2-column .csv files containing geneIDs and cDNA sequences; these files can be used as inputs to the *Strongyloides* Codon Adapter App.     
 
 ```{r cleanGeneLists, eval = F}
 # C. elegans ----
-temp <- c(Ce = '../Data/Ce_chemoreceptors.csv')
+temp <- c(Ce = 'Data/Ce_chemoreceptors.csv')
 genelist.Ce <- suppressWarnings(read.csv(temp, 
                                          header = TRUE, 
                                          colClasses = "character", 
                                          strip.white = T)) %>%
   as_tibble() 
 
-## For C. elegans, match wormbase gene sequence name to the wbsp transcript id, and pull the cDNA sequence
+## For C. elegans, match wormbase gene sequence name to the wbps transcript id, 
+## and pull the cDNA sequence
 Ce.tr.seq <- getBM(attributes=c('wbps_transcript_id', 'cdna'),
                    # grab the cDNA sequences for the given genes from WormBase Parasite
                    mart = useMart(biomart="parasite_mart",
@@ -63,18 +64,23 @@ Ce.tr.seq <- getBM(attributes=c('wbps_transcript_id', 'cdna'),
   dplyr::rename(geneID = wbps_transcript_id, cDNA = cdna)
 Ce.tr.seq$cDNA <- tolower(Ce.tr.seq$cDNA)
 
-write.table(Ce.tr.seq, file = "./Ce_chemosensory_cDNA_list.csv", sep = ",", col.names = FALSE, row.names = FALSE)
+write.table(Ce.tr.seq, 
+            file = "./Ce_chemosensory_cDNA_list.csv", 
+            sep = ",", 
+            col.names = FALSE, 
+            row.names = FALSE)
 
 # S. ratti ----
-temp <- c(Sr = '../Data/Sr_chemoreceptors.csv')
+temp <- c(Sr = 'Data/Sr_chemoreceptors.csv')
 
 genelist.Sr <- suppressWarnings(read.csv(temp, 
                                          header = TRUE, 
                                          colClasses = "character", 
                                          strip.white = T)) %>%
   as_tibble() 
 
-## For ratti, match wormbase gene sequence name to the wbsp transcript id, and pull the cDNA sequence
+## For ratti, match wormbase gene sequence name to the wbps transcript id, 
+## and pull the cDNA sequence
 Sr.tr.seq <- getBM(attributes=c('wbps_transcript_id', 'cdna'),
                    # grab the cDNA sequences for the given genes from WormBase Parasite
                    mart = useMart(biomart="parasite_mart",
@@ -91,18 +97,23 @@ Sr.tr.seq <- getBM(attributes=c('wbps_transcript_id', 'cdna'),
   dplyr::rename(geneID = wbps_transcript_id, cDNA = cdna)
 Sr.tr.seq$cDNA <- tolower(Sr.tr.seq$cDNA)
 
-write.table(Sr.tr.seq, file = "./Sr_chemosensory_cDNA_list.csv", sep = ",", col.names = FALSE, row.names = FALSE)
+write.table(Sr.tr.seq, 
+            file = "./Sr_chemosensory_cDNA_list.csv", 
+            sep = ",", 
+            col.names = FALSE, 
+            row.names = FALSE)
 
 # S. venezuelensis ----
-temp <- c(Sv = '../Data/Sv_chemoreceptors.csv')
+temp <- c(Sv = 'Data/Sv_chemoreceptors.csv')
 
 genelist.Sv <- suppressWarnings(read.csv(temp, 
                                          header = TRUE, 
                                          colClasses = "character", 
                                          strip.white = T)) %>%
   as_tibble() 
 
-## For ratti, match wormbase gene sequence name to the wbsp transcript id, and pull the cDNA sequence
+## For venezuelensis, match wormbase gene sequence name to the wbps transcript id,
+##  and pull the cDNA sequence
 Sv.tr.seq <- getBM(attributes=c('wbps_transcript_id', 'cdna'),
                    # grab the cDNA sequences for the given genes from WormBase Parasite
                    mart = useMart(biomart="parasite_mart",
@@ -119,7 +130,11 @@ Sv.tr.seq <- getBM(attributes=c('wbps_transcript_id', 'cdna'),
   dplyr::rename(geneID = wbps_transcript_id, cDNA = cdna)
 Sv.tr.seq$cDNA <- tolower(Sv.tr.seq$cDNA)
 
-write.table(Sv.tr.seq, file = "./Sv_chemosensory_cDNA_list.csv", sep = ",", col.names = FALSE, row.names = FALSE)
+write.table(Sv.tr.seq, 
+            file = "./Sv_chemosensory_cDNA_list.csv", 
+            sep = ",", 
+            col.names = FALSE, 
+            row.names = FALSE)
 ```
 
 
@@ -286,7 +301,10 @@ cai_plot <- ggplot(tbl, aes(Sr_CAI, Ce_CAI, species)) +
   geom_point(dat.GoI.df, mapping = aes(Sr_CAI, Ce_CAI, color = species),
              show.legend = F,
              shape = 1, size = 2, alpha = 1) +
-  scale_color_manual(values = c("seagreen4", "coral4", "darkgoldenrod4", "darkorchid4"))+
+  scale_color_manual(values = c("seagreen4", 
+                                "coral4", 
+                                "darkgoldenrod4", 
+                                "darkorchid4"))+
   
   geom_hline(yintercept = 0.5, color = "grey", linetype = 2) +
   geom_vline(xintercept = 0.5, color = "grey", linetype = 2) +
@@ -298,11 +316,14 @@ cai_plot <- ggplot(tbl, aes(Sr_CAI, Ce_CAI, species)) +
   #             shape = 1, size = 2, alpha = 1, color = "coral4") +
   facet_grid(~species) +
   labs(title = "Species-specific codon adaptiveness",
-       subtitle = "colored icons = species-specific chemoreceptors; black icons = all coding sequences
-         ",
+       subtitle = paste("colored icons = species-specific chemoreceptors;",
+       "black icons = all coding sequences
+         "),
        x = "Codon bias relative to \n S. ratti usage rules (CAI)",
        y = "Codon Bias relative to \n C. elegans usage rules (CAI)",
-       caption = "Blue line/shading = linear regression \n w/ 95% confidence regions; \n formula = y ~ x") +
+       caption = "Blue line/shading = linear regression
+                       w/ 95% confidence regions;
+                       formula = y ~ x") +
   coord_equal(xlim = c(0,1), ylim = c(0,1)) +
   theme_bw() +
   theme(plot.title.position = "plot",

diff --git a/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.html b/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.html
diff --git a/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.nb.html b/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.nb.html
diff --git a/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.pdf b/Offline Analysis/Chemoreceptor_Codon_Adaptiveness.pdf
diff --git a/Offline Analysis/ChemosensoryCodonUsagePlot.pdf b/Offline Analysis/ChemosensoryCodonUsagePlot.pdf
diff --git a/Offline Analysis/CodonAdaptivenessDistributions.pdf b/Offline Analysis/CodonAdaptivenessDistributions.pdf