In [1]:
import os

# Base directory; later cells combine it with a file name via os.path.join(DIR, ...)
# to locate the files they read and the file they write.
DIR = r'c://downloads'

In [2]:
import re

# Read the whole sequence file into a single string.
with open(os.path.join(DIR, 'sequence.txt'), 'r') as f:
    raw_seq = f.read()

# Remove every run of digits and non-word characters (whitespace, punctuation, ...),
# keeping only word characters that are not digits.
# The pattern is a raw string so that \d and \W reach the regex engine untouched
# instead of being flagged by Python as invalid string escape sequences.
seq = re.sub(r'[\d\W]+', '', raw_seq)
print(seq)

MHVHMRTHTGEKPFPCDICGKAFSQLTNMHAHMRTHTREKPFPCDICGTAFTQSSALYVHMRIHTGKKSFHCDSCGKNFSRRYNLRVHIMRAHKGKKPLHGDSAGCKKDSSAQKHAASKEPS


In [3]:
# Profile expressed as a regex: each bracketed class lists the characters accepted
# at that position, while '.' / '.{5}' accept any character at those positions.
zf_C2H2_profile_regex = '[YFHL].C..C...[FYLT].{5}[LFYVM]..H...H'

# Collect all non-overlapping occurrences of the profile in the cleaned sequence.
zf_C2H2_matches = re.compile(zf_C2H2_profile_regex).findall(seq)
print(zf_C2H2_matches)

['FPCDICGKAFSQLTNMHAHMRTH', 'FPCDICGTAFTQSSALYVHMRIH']


In [4]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Alternation of three motifs: "RR"; "K" followed by "K" or "R"; or "R", any two
# characters, then "K" or "R". The adjacent string literals are joined by Python
# into the single pattern 'RR|K[KR]|R..[KR]'.
cleavage_pattern = re.compile(
    'RR'
    '|K[KR]'
    '|R..[KR]'
)

# Filled by the parsing loop with one record per cleavage fragment.
cleavage_products = list()

def create_cleavage_record(seq, record_id, index_within_record):
    """Wrap one cleavage fragment in a SeqRecord with a descriptive id.

    Args:
        seq: Sequence of the fragment; handed to SeqRecord unchanged.
        record_id: Id of the record the fragment was cut from.
        index_within_record: 0-based position of the fragment within its source
            record; it is shown 1-based in the resulting id.

    Returns:
        A SeqRecord whose id is 'Cleavage product <n> of <record_id>' and whose
        description is the empty string.
    """
    # The id used to be misspelled "Cleavapge product ...", which leaked into
    # every header of the written FASTA file.
    product_id = 'Cleavage product %d of %s' % (index_within_record + 1, record_id)
    return SeqRecord(seq, id = product_id, description = '')

# Split every input record into fragments at the cleavage motifs and collect them.
for record in SeqIO.parse(os.path.join(DIR, 'neuropeptides.fasta'), 'fasta'):

    start_index = 0
    # Number of fragments emitted so far for THIS record. Kept as an explicit
    # counter instead of reusing the enumerate() index after the loop: that index
    # is unbound (or left over from the previous record) when a record contains
    # no cleavage motif at all.
    n_products = 0

    for cleavage_match in cleavage_pattern.finditer(str(record.seq)):
        # A fragment runs from the end of the previous match up to and including
        # the current match, so the matched motif sits at the fragment's tail.
        end_index = cleavage_match.end()
        cleavage_products.append(create_cleavage_record(record.seq[start_index:end_index], record.id, n_products))
        n_products += 1
        start_index = end_index

    # Whatever follows the last match (or the whole sequence, if nothing matched)
    # forms the final fragment.
    if start_index < len(record.seq):
        cleavage_products.append(create_cleavage_record(record.seq[start_index:], record.id, n_products))

# Last expression of the cell: the notebook displays SeqIO.write's return value.
SeqIO.write(cleavage_products, os.path.join(DIR, 'neuropeptides-cleavage-products.fasta'), 'fasta')

1526

In [5]:
import csv

import numpy as np

# Columns whose cells are converted with int(); every other column is kept as str.
NUMERIC_COLUMNS = {'Subunits', 'Inner Radius', 'Outer Radius', 'Average Radius', 'Net Surface Charge', 'Outside SASA'}

# Maps each CSV header to a NumPy array holding that column's values, in row order.
data = {}

# newline='' is what the csv module requires of the file object it is given, so
# that the reader itself handles line endings (including newlines inside quoted
# fields) rather than the text layer translating them first.
with open(os.path.join(DIR, 'viperdb.csv'), 'r', newline = '') as f:

    csv_reader = csv.reader(f)
    headers = next(csv_reader)   # first row holds the column names
    lines = list(csv_reader)     # remaining rows, fully loaded so they can be read column by column

    for i, header in enumerate(headers):
        column_type = int if header in NUMERIC_COLUMNS else str
        raw_values = [line[i] for line in lines]
        # 'N/A' cells are stored as NaN instead of being converted.
        values = [np.nan if raw_value == 'N/A' else column_type(raw_value) for raw_value in raw_values]
        data[header] = np.array(values)

In [6]:
# Mean and standard deviation of each radius column.
# The NaN-aware reducers are used because the loading cell stores 'N/A' entries
# as NaN; with np.average / np.std a single NaN would turn both statistics into
# NaN. When a column has no NaN the printed numbers are unchanged.
for column_name in ['Inner Radius', 'Outer Radius', 'Average Radius']:
    values = data[column_name]
    print('%s: avg = %.2f, std = %.2f' % (column_name, np.nanmean(values), np.nanstd(values)))

Inner Radius: avg = 126.56, std = 64.69
Outer Radius: avg = 196.02, std = 85.65
Average Radius: avg = 188.29, std = 76.48


In [7]:
n_records = len(data['Entry ID'])
# One flag per record; set as soon as any radius column marks the record as an outlier.
is_outlier = np.zeros(n_records, dtype = bool)

for column_name in ['Inner Radius', 'Outer Radius', 'Average Radius']:
    values = data[column_name]
    # A record is an outlier in this column when it lies at least 2.5 standard
    # deviations from the column mean. Mean and std ignore NaN entries (the
    # loading cell stores 'N/A' as NaN) so that one missing value cannot disable
    # the test for the whole column; a NaN entry itself compares False and is
    # therefore never flagged.
    deviation = np.abs(values - np.nanmean(values))
    is_outlier |= (deviation >= 2.5 * np.nanstd(values))

print('There are %d outliers.' % np.sum(is_outlier))
print('Outlier families: ' + ', '.join(sorted(set((data['Family'][is_outlier])))))
# NaN-aware for the same reason: a missing subunit count must not make the average NaN.
print('Avg. subunits among non-outliers: %.2f' % np.nanmean(data['Subunits'][~is_outlier]))

There are 15 outliers.
Outlier families: Adenoviridae, Mimiviridae, Phycodnaviridae, Reoviridae, Rudiviridae
Avg. subunits among non-outliers: 149.26
