# Workflow: GC, AA, and pI

Workflow for compiling GC content, amino acid proportions, and isoelectric points of predicted genes and proteins, as used in the study *'Genomic markers show the proliferation of DNA viruses is constrained by adaptation to environmental niche'*.

Method: 

- Per sample:
  - Predict genes and protein sequence via DRAMv (prodigal-gv)
  - Calculate amino acid proportions per predicted protein
  - Generate summaries for AA types including acidic, basic, polar, nonpolar, and charged amino acids (defined as per pepstats (EMBOSS v6.6.0))
  - Identify gene GC content
  - Output summary table of results
  - Also calculate protein isoelectric points (pI)

***

## Waiwera estuary data

Note re: study design

- This was developed based on metagenome assemblies from water and sediment samples at nine sites spanning a salinity gradient in the Waiwera estuary, New Zealand.
- Samples were labelled S1-S9 for Filter and sediment samples (e.g. S1.Filter; S1.sediment)
- Gene prediction and annotation was conducted via DRAMv for all putative viral contigs identified from each assembly
- This workflow uses the following output files generated by DRAMv: `genes.faa`; `final-viral-combined-for-dramv.gbk`; `annotations.tsv`

### Protein AA proportions and gene GC content

In [None]:
# Load the Python module and start an interactive Python session
# (the Python code that follows is entered into this session, which is closed by quit())
module purge
module load Python/3.8.2-gimkl-2020a
python3

### Import required libraries
import pandas as pd
import numpy as np
from Bio.SeqIO.FastaIO import SimpleFastaParser
import re
import os
from Bio import SeqIO

# Single-residue proportion columns and the one-letter code counted for each,
# in the column order of the output table.
aa_residue_columns = {
    'aa_prop_Glutamic_acid': 'E',
    'aa_prop_Aspartic_acid': 'D',
    'aa_prop_Arginine': 'R',
    'aa_prop_Lysine': 'K',
    'aa_prop_Histidine': 'H',
    'aa_prop_Glycine': 'G',
    'aa_prop_Serine': 'S',
    'aa_prop_Tyrosine': 'Y',
    'aa_prop_Cysteine': 'C',
    'aa_prop_Glutamine': 'Q',
    'aa_prop_Asparagine': 'N',
    'aa_prop_Threonine': 'T',
    'aa_prop_Phenylalanine': 'F',
    'aa_prop_Leucine': 'L',
    'aa_prop_Tryptophan': 'W',
    'aa_prop_Proline': 'P',
    'aa_prop_Isoleucine': 'I',
    'aa_prop_Methionine': 'M',
    'aa_prop_Valine': 'V',
    'aa_prop_Alanine': 'A',
}

# Amino acid type summaries: each value lists the one-letter codes whose
# proportions are added together (in the listed order) for that column.
# Note that the acidic summary counts E and D only.
aa_class_columns = {
    'aa_prop_Acidic': 'ED',
    'aa_prop_Basic': 'RKH',
    'aa_prop_Polar': 'DEHKNQRSTZ',
    'aa_prop_Nonpolar': 'ACFGILMPVWY',
    'aa_prop_charged': 'BDEHKRZ',
}

# Every one-letter code that needs counting (the 20 residues plus B and Z).
aa_codes = sorted(set(''.join(aa_residue_columns.values()) + ''.join(aa_class_columns.values())))


def aa_proportions(seq):
    """Return {column name: proportion} for one protein sequence.

    Each single-residue proportion is count(code) / len(seq); each type
    summary is the sum of its members' proportions. An empty sequence gives
    NaN for every column instead of raising ZeroDivisionError, so one empty
    FASTA record cannot abort the whole run.
    """
    seq_length = len(seq)
    if seq_length == 0:
        return dict.fromkeys(list(aa_residue_columns) + list(aa_class_columns), float('nan'))
    # Count each code once and reuse the value for the type summaries.
    fraction = {code: seq.count(code)/seq_length for code in aa_codes}
    proportions = {column: fraction[code] for column, code in aa_residue_columns.items()}
    for column, members in aa_class_columns.items():
        total = 0.0
        for code in members:
            total += fraction[code]
        proportions[column] = total
    return proportions


# Gene IDs are '<contigID>_<gene number>'; raw strings keep '\d' a regex token
# rather than an (invalid) string escape.
gene_number_suffix = re.compile(r'_\d+$')
gene_number_capture = re.compile(r'.*_(\d+)$')

# One list per output column, filled in step with each other (one entry per gene).
table = {column: [] for column in
         ['sampleID', 'contigID', 'geneID', 'gene_count', 'aa_seq', 'aa_seq_length']
         + list(aa_residue_columns) + list(aa_class_columns)}

# Loop through each sample genes.faa file to extract values for each variable and append to lists
# Also calculate proportions for amino acids and summaries of aa types.
# NOTE(review): aa_seq_length is len(seq) exactly as stored in genes.faa; if the
# sequences carry a terminal '*' it is included in the length - confirm.
for i in range(1, 10):
    for j in ['Filter', 'sediment']:
        sample_id = 'S'+str(i)+'.'+j
        with open('8.DRAMv/persample/'+sample_id+'/dramv_annotation/genes.faa', 'r') as read_fasta:
            for name, seq in SimpleFastaParser(read_fasta):
                # The gene ID is the first space-delimited field of the FASTA header.
                gene_id = name.split(' ')[0].strip()
                record = {
                    'sampleID': sample_id,
                    'contigID': gene_number_suffix.sub('', gene_id),
                    'geneID': gene_id,
                    'gene_count': gene_number_capture.sub(r'\1', gene_id),
                    'aa_seq': seq,
                    'aa_seq_length': len(seq),
                }
                record.update(aa_proportions(seq))
                for column, value in record.items():
                    table[column].append(value)

# Column order follows the insertion order of `table`.
df = pd.DataFrame(table)


# For every sample, gather the contig ID, gene ID and GC content of each CDS
# feature in the DRAMv GenBank file, then join them (outer join on geneID) to
# the start, end and vogdb columns of the DRAMv annotations table.
annot_dfs = []
for site in range(1, 10):
    for sample_type in ['Filter', 'sediment']:
        sample_name = 'S'+str(site)+'.'+sample_type
        annotation_dir = '8.DRAMv/persample/'+sample_name+'/dramv_annotation/'

        # One (contigID, geneID, gene_gc) row per CDS feature
        cds_rows = []
        with open(annotation_dir+'genbank/final-viral-combined-for-dramv.gbk', 'r') as genbank_infile:
            for seq_record in SeqIO.parse(genbank_infile, "genbank"):
                cds_rows.extend(
                    (seq_record.id, feature.qualifiers['gene'][0], feature.qualifiers['gc_cont'][0])
                    for feature in seq_record.features
                    if feature.type == 'CDS'
                )
        gbk_df = pd.DataFrame(cds_rows, columns=['contigID', 'geneID', 'gene_gc'])

        # The unnamed first column of annotations.tsv holds the gene ID
        annotations = pd.read_csv(annotation_dir+'annotations.tsv', sep='\t')
        annotations = annotations.rename(
            columns={'Unnamed: 0':'geneID', 'start_position': 'start','end_position': 'end'})
        vogdb_columns = [label for label in annotations.columns if 'vogdb' in label]
        annotations = annotations[['geneID', 'start', 'end'] + vogdb_columns]

        sample_annot = pd.merge(gbk_df, annotations, how='outer', on='geneID')
        sample_annot['sampleID'] = sample_name
        annot_dfs.append(sample_annot)

# Stack all samples into one table with a fresh 0..n-1 index
annot_df_full = pd.concat(annot_dfs).reset_index(drop=True)

# Attach start/end, GC content and vogdb annotation columns to the per-gene
# amino acid table (left join: every gene in df is kept).
full_df = pd.merge(df, annot_df_full, how='left', on=['sampleID','contigID', 'geneID'])
full_df = full_df.rename(columns={'start': 'nt_start', 'end': 'nt_end', 'gene_gc': 'gc_content'})
# Gene coordinates are 1-based and inclusive (Prodigal convention, carried
# through to the DRAMv annotations), so the length is end - start + 1.
full_df['nt_seq_length'] = full_df['nt_end'] - full_df['nt_start'] + 1

# write out summary table
full_df.to_csv('AA_and_GC.summary_table.tsv', sep='\t', index=False)

quit()


### Protein isoelectric point (pI)

Run pepstats

In [None]:
# Load EMBOSS (provides pepstats)
module load EMBOSS/6.6.0-gimkl-2020a

# Run pepstats on the predicted proteins (genes.faa) of each sample;
# one output file per sample, S<i>.<Filter|sediment>.pepstats, is written to the current directory
for i in {1..9}; do
    for j in Filter sediment; do
        echo "Sample: S${i}.${j}"
        pepstats "8.DRAMv/persample/S${i}.${j}/dramv_annotation/genes.faa" -outfile "S${i}.${j}.pepstats"
    done
done


Summarise pepstats pI results

In [None]:
# Load the Python module and start an interactive Python session
# (the Python code that follows is entered into this session)
module purge
module load Python/3.8.2-gimkl-2020a
python3

### Import required libraries
import pandas as pd
import numpy as np
from Bio.SeqIO.FastaIO import SimpleFastaParser
import re
import os
from Bio import SeqIO
from itertools import groupby, chain

def get_gene_pepstats(file):
    """Yield the lines of each per-sequence report in a pepstats output file.

    A report starts at a line whose first non-blank text is "PEPSTATS of" and
    runs up to (not including) the next such line or the end of the file.
    Lines before the first report header are ignored.

    Each report is read completely before it is yielded, so reports stay
    intact even if the caller collects several before reading them. A header
    with no following lines (including one at the very end of the file) is
    yielded as a one-line report.

    Parameters
    ----------
    file : path of the pepstats output file to read.

    Yields
    ------
    A one-shot iterator over the report's lines (header line first), with
    line endings kept.
    """
    report = None
    with open(file) as f:
        for line in f:
            if line.lstrip().startswith("PEPSTATS of"):
                # A new header closes the report collected so far.
                if report is not None:
                    yield iter(report)
                report = [line]
            elif report is not None:
                report.append(line)
    if report is not None:
        yield iter(report)

# Build one geneID -> isoelectric point table per sample from its pepstats
# output, tag it with the sample ID, then stack all samples and write them out.
iep_dfs = []
for site in range(1, 10):
    for sample_type in ['Filter', 'sediment']:
        sampleID = "S"+str(site)+"."+sample_type
        iep_dict = {}
        for report in get_gene_pepstats(sampleID+".pepstats"):
            report_lines = list(report)
            header_line = [ln for ln in report_lines if 'PEPSTATS of' in ln][0]
            pi_line = [ln for ln in report_lines if 'Isoelectric Point' in ln][0]
            # Header: third space-delimited field is the gene ID;
            # pI line: fourth space-delimited field is the value.
            gene_name = header_line.split(' ')[2]
            iep_dict[gene_name] = pi_line.split(' ')[3].rstrip()
        sample_table = pd.DataFrame(iep_dict.items(), columns=['geneID', 'isoelectric_point'])
        sample_table['sampleID'] = sampleID
        iep_dfs.append(sample_table)

stacked = pd.concat(iep_dfs)
iep_df = stacked.reset_index(drop=True)
iep_df.to_csv('pepstats.summaryTable.tsv', sep='\t', index=False)


***

## IMG/VR high-quality viruses

Note:

- Gene prediction for all high-quality viruses in the IMG/VR database (v7.1) was conducted via prodigal-gv
- Gene annotation was conducted via hmmsearch of predicted proteins from the IMG/VR dataset against proteins in the viral orthologous groups (VOG) database
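- This workflow uses the following files generated by prodigal-gv and hmmsearch: `prodigal_gv.faa`; `final-viral-combined-for-dramv.gbk`; `annotations.tsv` (note: the code in this section reads only `prodigal_gv.faa`, and the pepstats output derived from it)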

### Protein AA proportions and gene GC content

In [None]:
# Load the Python module and start an interactive Python session
# (the Python code that follows is entered into this session, which is closed by quit())
module purge
module load Python/3.8.2-gimkl-2020a
python3

### Import required libraries
import pandas as pd
import numpy as np
from Bio.SeqIO.FastaIO import SimpleFastaParser
import re
import os
from Bio import SeqIO

# Single-residue proportion columns and the one-letter code counted for each.
aa_residue_columns = {
    'aa_prop_Glutamic_acid': 'E',
    'aa_prop_Aspartic_acid': 'D',
    'aa_prop_Arginine': 'R',
    'aa_prop_Lysine': 'K',
    'aa_prop_Histidine': 'H',
    'aa_prop_Glycine': 'G',
    'aa_prop_Serine': 'S',
    'aa_prop_Tyrosine': 'Y',
    'aa_prop_Cysteine': 'C',
    'aa_prop_Glutamine': 'Q',
    'aa_prop_Asparagine': 'N',
    'aa_prop_Threonine': 'T',
    'aa_prop_Phenylalanine': 'F',
    'aa_prop_Leucine': 'L',
    'aa_prop_Tryptophan': 'W',
    'aa_prop_Proline': 'P',
    'aa_prop_Isoleucine': 'I',
    'aa_prop_Methionine': 'M',
    'aa_prop_Valine': 'V',
    'aa_prop_Alanine': 'A',
}

# Amino acid type summaries: each value lists the one-letter codes whose
# proportions are added together (in the listed order) for that column.
# Note that the acidic summary counts E and D only.
aa_class_columns = {
    'aa_prop_Acidic': 'ED',
    'aa_prop_Basic': 'RKH',
    'aa_prop_Polar': 'DEHKNQRSTZ',
    'aa_prop_Nonpolar': 'ACFGILMPVWY',
    'aa_prop_charged': 'BDEHKRZ',
}

# Every one-letter code that needs counting (the 20 residues plus B and Z).
aa_codes = sorted(set(''.join(aa_residue_columns.values()) + ''.join(aa_class_columns.values())))


def aa_proportions(seq):
    """Return {column name: proportion} for one protein sequence.

    Each single-residue proportion is count(code) / len(seq); each type
    summary is the sum of its members' proportions. An empty sequence gives
    NaN for every column instead of raising ZeroDivisionError, so one empty
    FASTA record cannot abort the whole run.
    """
    seq_length = len(seq)
    if seq_length == 0:
        return dict.fromkeys(list(aa_residue_columns) + list(aa_class_columns), float('nan'))
    # Count each code once and reuse the value for the type summaries.
    fraction = {code: seq.count(code)/seq_length for code in aa_codes}
    proportions = {column: fraction[code] for column, code in aa_residue_columns.items()}
    for column, members in aa_class_columns.items():
        total = 0.0
        for code in members:
            total += fraction[code]
        proportions[column] = total
    return proportions


# Gene IDs are '<contigID>_<gene number>'; raw strings keep '\d' a regex token
# rather than an (invalid) string escape.
gene_number_suffix = re.compile(r'_\d+$')
gene_number_capture = re.compile(r'.*_(\d+)$')

# Columns of df, in the order the table has always been built in: the written
# summary table takes the order of its 'aa_prop' columns from df.columns.
imgvr_columns = [
    'sampleID', 'contigID', 'geneID',
    'aa_prop_Lysine', 'aa_prop_Histidine', 'aa_prop_Glycine', 'aa_prop_Serine',
    'aa_prop_Tyrosine', 'aa_prop_Cysteine', 'aa_prop_Glutamine', 'aa_prop_Asparagine',
    'aa_prop_Threonine', 'aa_prop_Phenylalanine', 'aa_prop_Leucine', 'aa_prop_Tryptophan',
    'aa_prop_Proline', 'aa_prop_Isoleucine', 'aa_prop_Methionine', 'aa_prop_Valine',
    'aa_prop_Alanine', 'aa_prop_Acidic',
    'gene_count', 'nt_start', 'nt_end', 'nt_seq_length', 'gc_content',
    'aa_seq', 'aa_seq_length',
    'aa_prop_Glutamic_acid', 'aa_prop_Aspartic_acid', 'aa_prop_Arginine',
    'aa_prop_Basic', 'aa_prop_Polar', 'aa_prop_Nonpolar', 'aa_prop_charged',
]

# One list per output column, filled in step with each other (one entry per gene).
table = {column: [] for column in imgvr_columns}

# prodigal.faa file to extract values for each variable and append to lists
# Also calculate proportions for amino acids
# NOTE(review): aa_seq_length is len(seq) exactly as stored in the FASTA; if the
# sequences carry a terminal '*' it is included in the length - confirm.
with open('img_vr_All_HighQuality/prodigal_gv/prodigal_gv.faa', 'r') as read_fasta:
    for name, seq in SimpleFastaParser(read_fasta):
        # Only records whose header mentions IMGVR are summarised.
        if 'IMGVR' not in name:
            continue
        # Header fields are '#'-delimited: gene ID, start, end, ..., attributes
        # (the last field ends with 'gc_cont=<value>').
        headers = name.split('#')
        gene_id = headers[0].strip()
        nt_start = float(headers[1].strip())
        nt_end = float(headers[2].strip())
        record = {
            'sampleID': re.sub('_UViG.*', '', gene_id),
            'contigID': gene_number_suffix.sub('', gene_id),
            'geneID': gene_id,
            'gene_count': gene_number_capture.sub(r'\1', gene_id),
            'nt_start': nt_start,
            'nt_end': nt_end,
            # Prodigal coordinates are 1-based and inclusive, so the length
            # is end - start + 1.
            'nt_seq_length': nt_end - nt_start + 1,
            'gc_content': float(headers[-1].split('gc_cont=')[-1].strip()),
            'aa_seq': seq,
            'aa_seq_length': len(seq),
        }
        record.update(aa_proportions(seq))
        for column, value in record.items():
            table[column].append(value)

# Column order follows the insertion order of `table` (i.e. imgvr_columns).
df = pd.DataFrame(table)

# Write out the summary table: identifiers, gene GC content, and every
# amino acid proportion column (in the order they appear in df)
aa_prop_columns = [label for label in df.columns if 'aa_prop' in label]
df_filt = df[['sampleID','contigID','geneID','gc_content'] + aa_prop_columns]
df_filt.to_csv('img_vr_All_HighQuality.AA_and_GC.summary_table.tsv', sep='\t', index=False)

quit()



### Protein isoelectric point (pI)

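- Note: pepstats truncates sequence IDs at the '|' character, so '|' is replaced with '__' in a copy of the FASTA file before running pepstats, and restored to '|' when the results are summarised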

Run pepstats

In [None]:
# Work on a copy of the predicted proteins, replacing '|' characters with '__'
# (the original prodigal_gv/prodigal_gv.faa is left unchanged)
cp img_vr_All_HighQuality/prodigal_gv/prodigal_gv.faa img_vr_All_HighQuality/
sed -i -e 's/|/__/g' img_vr_All_HighQuality/prodigal_gv.faa

# Load EMBOSS (provides pepstats)
module load EMBOSS/6.6.0-gimkl-2020a

# Run pepstats on the edited copy
pepstats "img_vr_All_HighQuality/prodigal_gv.faa" -outfile "img_vr_All_HighQuality/img_vr_HighQuality.pepstats"


Summarise pepstats pI results

In [None]:
# Load the Python module and start an interactive Python session
# (the Python code that follows is entered into this session, which is closed by quit())
module purge
module load Python/3.8.2-gimkl-2020a
python3

### Import required libraries
import pandas as pd
import numpy as np
from Bio.SeqIO.FastaIO import SimpleFastaParser
import re
import os
from Bio import SeqIO
from itertools import groupby, chain

def get_gene_pepstats(file):
    """Yield the lines of each per-sequence report in a pepstats output file.

    A report starts at a line whose first non-blank text is "PEPSTATS of" and
    runs up to (not including) the next such line or the end of the file.
    Lines before the first report header are ignored.

    Each report is read completely before it is yielded, so reports stay
    intact even if the caller collects several before reading them. A header
    with no following lines (including one at the very end of the file) is
    yielded as a one-line report.

    Parameters
    ----------
    file : path of the pepstats output file to read.

    Yields
    ------
    A one-shot iterator over the report's lines (header line first), with
    line endings kept.
    """
    report = None
    with open(file) as f:
        for line in f:
            if line.lstrip().startswith("PEPSTATS of"):
                # A new header closes the report collected so far.
                if report is not None:
                    yield iter(report)
                report = [line]
            elif report is not None:
                report.append(line)
    if report is not None:
        yield iter(report)

# Collect the isoelectric point reported by pepstats for each IMG/VR protein.
# Gene IDs get their '|' characters back ('__' -> '|'), reversing the edit
# made to the FASTA headers before pepstats was run.
iep_dict = {}
for report in get_gene_pepstats("img_vr_All_HighQuality/img_vr_HighQuality.pepstats"):
    report_lines = list(report)
    header_line = [ln for ln in report_lines if 'PEPSTATS of' in ln][0]
    pi_line = [ln for ln in report_lines if 'Isoelectric Point' in ln][0]
    # Header: third space-delimited field is the gene ID;
    # pI line: fourth space-delimited field is the value.
    gene_name = header_line.split(' ')[2].replace('__', '|')
    iep_dict[gene_name] = pi_line.split(' ')[3].rstrip()

# One row per gene, all tagged with the same dataset label
iep_df = pd.DataFrame(iep_dict.items(), columns=['geneID', 'isoelectric_point'])
iep_df['sampleID'] = 'imgvr'

iep_df.to_csv('img_vr_All_HighQuality/img_vr_HighQuality.pepstats.summaryTable.tsv', sep='\t', index=False)

quit()


***

***