-
Notifications
You must be signed in to change notification settings - Fork 0
/
Genomic_protein_composition_analysis_publication.sh
102 lines (64 loc) · 2.85 KB
/
Genomic_protein_composition_analysis_publication.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# scripts to run composition analysis using as input protein and nucleotide fasta sequences
# Run analysis using the R environment
# upload library
library(seqinr)
############# calculate aminoacid frequencies and pI (theoretical isoelectric point)
# Get the list of all *.txt files in the current directory
prot_files <- list.files(pattern = "_prot.faa")
# Loop through the list of files
for (file in prot_files) {
# Read the file into a variable
myProts <- read.fasta(file = file, seqtype = "AA")
# Get stats from all amino acid sequences
all_results <- lapply(myProts, AAstat, plot=FALSE)
# Get amino acid proportions
all_proportions <- lapply(all_results, function(x) x$Prop)
table <- do.call(rbind, all_proportions)
# Print composition of all proteins
all_composition <- lapply(all_results, function(x) x$Compo)
table2 <- do.call(rbind, all_composition)
# Get protein length
lengths <- sapply(myProts, getLength)
# Calculate the pI (theoretical isoelectric point) for each protein sequence
pIs <- sapply(myProts, computePI)
# Combine the tables
table_all_combined <- cbind(table,table2,lengths,pIs)
# Write the output to a CSV file
write.csv(table_all_combined, file = paste(file, "_aminoacid_stats.csv", sep=""))
}
############# calculate codon frequencies and codon usage bias
# upload library
library(seqinr)
# Get the list of all *.txt files in the current directory
seq_files <- list.files(pattern = "_nucl.fna")
# Loop through the list of files
for (file in seq_files) {
# Read the file into a variable
sequences <- read.fasta(file = file, seqtype = "DNA")
codons_rscu <- lapply(sequences, uco, index="rscu", NA.rscu = 0)
table_codons_rscu <- do.call(rbind, codons_rscu)
# Write the output to a CSV file
write.csv(table_codons_rscu, file = paste(file, "_table_codons_rscu.csv", sep=""))
}
############# calculate Purine loading-index
# upload library
library(seqinr)
seq_files <- list.files(pattern = "_nucl.fna$") # whole CDSs, make case specific $
# Loop through the list of files
for (file in seq_files) {
# Read the file into a variable
sequences <- read.fasta(file = file, seqtype = "DNA")
A_count <- sapply(sequences, function(x) sum(toupper(x) == "A"))
T_count <- sapply(sequences, function(x) sum(toupper(x) == "T"))
G_count <- sapply(sequences, function(x) sum(toupper(x) == "G"))
C_count <- sapply(sequences, function(x) sum(toupper(x) == "C"))
N_length <- sapply(sequences, length)
purines <- A_count+G_count
pyrimidines <- C_count+T_count
dW <- (A_count - T_count) / N_length *1000
dS <- (G_count - C_count) / N_length *1000
PLI_purine_loading_index <- dW + dS
purr <- cbind(A_count,T_count,G_count,C_count,purines,pyrimidines,PLI_purine_loading_index)
# Write the output to a CSV file
write.csv(purr, file = paste(file, "_purine_loading_index_PLI.csv", sep=""))
}