In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import re

In [2]:
# Zero-based indices of the non-constant positions. They are used to index
# directly into each sequence string when the variable part is extracted.
# "strc" = secondary-structure library, "ctrl" = control library.
aurora_strc_positions_0 = [
    3, 5, 10, 11, 13, 14, 15, 16, 21, 22, 25, 26,
    27, 28, 29, 33, 36, 39, 41, 42, 43, 44, 45, 46,
]
aurora_ctrl_positions_0 = [
    1, 3, 8, 10, 11, 13, 18, 19, 20, 22, 26, 27,
    28, 29, 30, 32, 33, 35, 36, 37, 38, 40, 43, 46,
]

# Degenerate nucleotide symbol -> list of concrete bases it may stand for.
degenerate_symbols = {
    # Single-base symbols
    'A': ['A'],
    'C': ['C'],
    'G': ['G'],
    'T': ['T'],
    # 'U': ['U'],   (disabled)
    # Two-base symbols
    'W': ['A', 'T'],
    'S': ['C', 'G'],
    'M': ['A', 'C'],
    'K': ['G', 'T'],
    'R': ['A', 'G'],
    'Y': ['C', 'T'],
    # Three-base symbols
    'B': ['C', 'G', 'T'],
    'D': ['A', 'G', 'T'],
    'H': ['A', 'C', 'T'],
    'V': ['A', 'C', 'G'],
    # Any base
    'N': ['A', 'C', 'G', 'T'],
    # No base at all
    'Z': [],
}

# Design strings of the secondary-structure library; list order defines the
# subpool index (0-7). Each string is 47 symbols long.
lib_designs_strc = [
    'GGAGGRGATGRATRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYATY',
    'GGAAGRGATGRATRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYTTY',
    'GGAGGRGATGRCTRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYAGY',
    'GGAGGRGATGRATRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYATY',
    'GGAAGRGATGRCTRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYTGY',
    'GGAAGRGATGRATRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYTTY',
    'GGAGGRGATGRCTRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYAGY',
    'GGAAGRGATGRCTRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYTGY',
]

# Design strings of the control library; list order defines the subpool
# index (0-7). Each string is 47 symbols long.
lib_designs_ctrl = [
    'GGAMGGGAKGABTMTGTCCKKTDCCTKKWMRGYVTRYRKAGTGKTGY',
    'GCAMGGGAKGGBTMTGTCCKKTDCCTKKWMRGYVTRYRKAGTGKTGY',
    'GGAMGGGAKGABTMTGTCAKKTDCCTKKWMRGYVTRYRKATTGKTGY',
    'GGAMGGGAKGABTMTGTCCKKTDCCTKYWMRGYVTRYRKAGTGSTGY',
    'GCAMGGGAKGGBTMTGTCAKKTDCCTKKWMRGYVTRYRKATTGKTGY',
    'GCAMGGGAKGGBTMTGTCCKKTDCCTKYWMRGYVTRYRKAGTGSTGY',
    'GGAMGGGAKGABTMTGTCAKKTDCCTKYWMRGYVTRYRKATTGSTGY',
    'GCAMGGGAKGGBTMTGTCAKKTDCCTKYWMRGYVTRYRKATTGSTGY',
]

In [3]:
def makePattern(P, symbols=None):
    """Compile a degenerate-nucleotide design string into a regular expression.

    Every character of ``P`` is looked up in the symbol table. A symbol that
    maps to exactly one base is emitted as that literal base; a symbol that
    maps to several bases becomes a character class such as ``[AT]``; a symbol
    that maps to no base at all becomes a token that can never match.

    Parameters
    ----------
    P : str
        Design string whose characters are keys of the symbol table.
    symbols : dict, optional
        Mapping from symbol to the list of bases it may stand for. When
        omitted, the module-level ``degenerate_symbols`` table is used.

    Returns
    -------
    re.Pattern
        Compiled pattern with one token per character of ``P``. The pattern
        is not anchored, so ``search`` may find it anywhere in a string.

    Raises
    ------
    KeyError
        If ``P`` contains a character that is not in the symbol table.
    """
    if symbols is None:
        symbols = degenerate_symbols

    tokens = []
    for c in P:
        options = symbols[c]
        if len(options) == 1:
            tokens.append(re.escape(options[0]))
        elif options:
            # Inside [...] every character is already an alternative, so the
            # options are concatenated directly. Joining them with '|' would
            # add the '|' character itself to the set of accepted characters.
            tokens.append('[' + ''.join(re.escape(o) for o in options) + ']')
        else:
            # No base is allowed at this position. '[]' is not a valid regex,
            # so use an empty negative lookahead, which never matches.
            tokens.append('(?!)')
    return re.compile(''.join(tokens))

In [4]:
def matchSubpool(S, P):
    """Return the index of the first pattern in ``P`` that is found in ``S``.

    Parameters
    ----------
    S : str
        Sequence to test.
    P : sequence
        Patterns (compiled or plain regex strings), tried in order with
        ``re.search``, so a pattern may match anywhere inside ``S``.

    Returns
    -------
    int or None
        Position in ``P`` of the first matching pattern, or ``None`` when no
        pattern matches.
    """
    hits = (idx for idx, pattern in enumerate(P) if re.search(pattern, S))
    return next(hits, None)

In [5]:
def prepDataset(d, nonconstant_positions, patterns):
    """Load a count/sequence table and add subpool, variable-sequence and cpm columns.

    Parameters
    ----------
    d : str or file-like
        Tab-separated input without a header row; the two columns are read
        as ``count`` and ``seq``.
    nonconstant_positions : iterable of int
        Indices into each sequence whose characters are joined to form the
        ``varseq`` column.
    patterns : sequence
        Patterns handed to ``matchSubpool``; the index of the first match is
        stored in the ``subpool`` column.

    Returns
    -------
    pandas.DataFrame
        The rows whose sequence contains no ``'N'``, with the extra columns
        ``subpool``, ``varseq`` and ``cpm``. ``cpm`` is computed from the
        count total of the retained rows only.
    """
    # Read the raw two-column table
    df = pd.read_csv(d, sep='\t', names=['count', 'seq'])

    # Drop every sequence that contains an 'N'. The name `df` is rebound on
    # purpose so that the unfiltered frame is not kept alive.
    has_no_n = ['N' not in seq for seq in df['seq']]
    df = df.loc[has_no_n]

    # Subpool = index of the first design pattern found in the sequence
    subpools = []
    for seq in df['seq']:
        subpools.append(matchSubpool(seq, patterns))
    df['subpool'] = subpools

    # Characters at the non-constant positions, joined into one string
    varseqs = []
    for seq in df['seq']:
        varseqs.append(''.join(seq[pos] for pos in nonconstant_positions))
    df['varseq'] = varseqs

    # Counts per million of the retained reads
    total_count = df['count'].sum()
    df['cpm'] = df['count'] / total_count * 1000000

    return df

In [6]:
# Compile one regular expression per library design; the list order is kept,
# so a pattern's index equals the index of its design string.
patterns_ctrl = list(map(makePattern, lib_designs_ctrl))
patterns_strc = list(map(makePattern, lib_designs_strc))

In [7]:
# Secondary-structure library, "init" dataset: prepare it and save it as CSV
df_prepped = prepDataset(
    d='../datasets_original/MV_aurora_secstructlib_small_init.tsv',
    nonconstant_positions=aurora_strc_positions_0,
    patterns=patterns_strc,
)
df_prepped.to_csv('strc_init.csv')

In [8]:
# Secondary-structure library, "km" dataset: prepare it and save it as CSV
df_prepped = prepDataset(
    d='../datasets_original/MV_aurora_secstructlib_small_km.tsv',
    nonconstant_positions=aurora_strc_positions_0,
    patterns=patterns_strc,
)
df_prepped.to_csv('strc_km.csv')

In [9]:
# Secondary-structure library, "kcat" dataset: prepare it and save it as CSV
df_prepped = prepDataset(
    d='../datasets_original/MV_aurora_secstructlib_small_kcat.tsv',
    nonconstant_positions=aurora_strc_positions_0,
    patterns=patterns_strc,
)
df_prepped.to_csv('strc_kcat.csv')

In [10]:
# Control library, "init" dataset: prepare it and save it as CSV
df_prepped = prepDataset(
    d='../datasets_original/MV_aurora_secstructlib_control2_init.tsv',
    nonconstant_positions=aurora_ctrl_positions_0,
    patterns=patterns_ctrl,
)
df_prepped.to_csv('ctrl_init.csv')

In [11]:
# Control library, "km" dataset: prepare it and save it as CSV
df_prepped = prepDataset(
    d='../datasets_original/MV_aurora_secstructlib_control2_km.tsv',
    nonconstant_positions=aurora_ctrl_positions_0,
    patterns=patterns_ctrl,
)
df_prepped.to_csv('ctrl_km.csv')