In [14]:
import pandas as pd
from itertools import product

In [15]:
def undegenerate(S):
    """Expand every degenerate symbol in ``S`` into its concrete bases.

    Each element of ``S`` is looked up in the module-level
    ``degenerate_symbols`` table and the resulting base lists are
    concatenated in input order. Duplicates are kept (callers that need
    distinct bases wrap the result in ``set``). A symbol that is missing
    from the table raises ``KeyError``.
    """
    expanded = []
    for symbol in S:
        expanded.extend(degenerate_symbols[symbol])
    return expanded

In [16]:
# Lookup table: degenerate nucleotide symbol -> list of concrete bases it
# can stand for. The letter assignments follow the IUPAC ambiguity codes;
# 'Z' expands to nothing. 'U' is intentionally left out (DNA alphabet only).
degenerate_symbols = {
    symbol: list(bases)
    for symbol, bases in (
        ('A', 'A'),
        ('C', 'C'),
        ('G', 'G'),
        ('T', 'T'),
        # ('U', 'U'),
        ('W', 'AT'),
        ('S', 'CG'),
        ('M', 'AC'),
        ('K', 'GT'),
        ('R', 'AG'),
        ('Y', 'CT'),
        ('B', 'CGT'),
        ('D', 'AGT'),
        ('H', 'ACT'),
        ('V', 'ACG'),
        ('N', 'ACGT'),
        ('Z', ''),
    )
}

In [17]:
# Eight degenerate-sequence designs of the "strc" library, each 47 symbols
# long. (NOTE(review): "strc" presumably means the structured library - confirm.)
lib_design_strc = [
    "GGAGGRGATGRATRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYATY",
    "GGAAGRGATGRATRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYTTY",
    "GGAGGRGATGRCTRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYAGY",
    "GGAGGRGATGRATRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYATY",
    "GGAAGRGATGRCTRYRYCCGGKHCCRKWWYGGCDTGHGGRGYRYTGY",
    "GGAAGRGATGRATRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYTTY",
    "GGAGGRGATGRCTRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYAGY",
    "GGAAGRGATGRCTRYRYCCGGKHCCYKWWRGGCDTGHGGRGYRYTGY",
]

# Eight degenerate-sequence designs of the "ctrl" library, each 47 symbols
# long. (NOTE(review): "ctrl" presumably means the control library - confirm.)
lib_design_ctrl = [
    "GGAMGGGAKGABTMTGTCCKKTDCCTKKWMRGYVTRYRKAGTGKTGY",
    "GCAMGGGAKGGBTMTGTCCKKTDCCTKKWMRGYVTRYRKAGTGKTGY",
    "GGAMGGGAKGABTMTGTCAKKTDCCTKKWMRGYVTRYRKATTGKTGY",
    "GGAMGGGAKGABTMTGTCCKKTDCCTKYWMRGYVTRYRKAGTGSTGY",
    "GCAMGGGAKGGBTMTGTCAKKTDCCTKKWMRGYVTRYRKATTGKTGY",
    "GCAMGGGAKGGBTMTGTCCKKTDCCTKYWMRGYVTRYRKAGTGSTGY",
    "GGAMGGGAKGABTMTGTCAKKTDCCTKYWMRGYVTRYRKATTGSTGY",
    "GCAMGGGAKGGBTMTGTCAKKTDCCTKYWMRGYVTRYRKATTGSTGY",
]

In [18]:
# Split every design string into single characters so that each DataFrame
# has one row per design and one column per sequence position (columns get
# the default 0-based integer labels).
df_strc = pd.DataFrame(list(map(list, lib_design_strc)))
df_ctrl = pd.DataFrame(list(map(list, lib_design_ctrl)))

In [19]:
# Persist the per-position design tables (index and 0-based column labels
# are written as-is).
df_strc.to_csv(path_or_buf='library_design_table_strc.csv')
df_ctrl.to_csv(path_or_buf='library_design_table_ctrl.csv')

In [20]:
# Collect the 1-based positions at which both libraries are fixed to a
# single symbol and that symbol is the same in both.
overlapping_positions = []
for idx in range(len(df_strc.columns)):
    strc_col = df_strc[idx]
    ctrl_col = df_ctrl[idx]
    # Skip positions where either library uses more than one symbol.
    if strc_col.nunique() != 1 or ctrl_col.nunique() != 1:
        continue
    # Both unique() arrays hold exactly one element here, so the
    # element-wise comparison yields a one-element boolean array.
    if strc_col.unique() == ctrl_col.unique():
        overlapping_positions.append(idx + 1)

overlapping_positions

[1, 3, 5, 7, 8, 10, 13, 18, 24, 25, 27, 29, 32, 35, 47]

In [21]:
# Store the overlapping positions as a single comma-separated line.
with open('overlapping_positions.txt', mode='w') as out_file:
    positions_line = ','.join(map(str, overlapping_positions))
    out_file.write(positions_line)

In [28]:
# Per-position comparison of the two library designs: for every sequence
# position, expand the symbols each design uses into concrete bases and
# measure how much the two base sets overlap (intersection over union).
res = []
overlapping_variants = []  # not filled in the visible cells; kept so the name stays defined
for i in range(0, len(df_strc.columns)):
    col1, col2 = df_strc[i], df_ctrl[i]
    # Distinct symbols used at this position, in order of first appearance.
    symbols_col1 = col1.unique().tolist()
    symbols_col2 = col2.unique().tolist()
    variants_col1 = set(undegenerate(symbols_col1))
    variants_col2 = set(undegenerate(symbols_col2))
    intersection = variants_col1.intersection(variants_col2)
    union = variants_col1.union(variants_col2)
    union_size = len(union)
    intersection_size = len(intersection)
    res.append([
        ''.join(symbols_col1),
        ''.join(symbols_col2),
        # Iteration order of a set of str differs between interpreter runs
        # (hash randomization), so sort before joining to keep the exported
        # table reproducible.
        ''.join(sorted(variants_col1)),
        ''.join(sorted(variants_col2)),
        ''.join(sorted(intersection)),
        ''.join(sorted(union)),
        intersection_size,
        union_size,
        # An empty union is possible when every symbol expands to nothing
        # (e.g. 'Z'); report NaN instead of raising ZeroDivisionError.
        intersection_size / union_size if union_size else float('nan'),
    ])


# One row per position (0-based index at this point):
#   def_*        - distinct design symbols at the position
#   unique_*     - concrete bases those symbols expand to (sorted)
#   intersection / union and their sizes, plus the ratio of the two sizes
df_res = pd.DataFrame(res, columns=[
    'def_strc',
    'def_ctrl',
    'unique_strc',
    'unique_ctrl',
    'intersection',
    'union',
    'intersection_size',
    'union_size',
    'inter_over_union'
])

In [29]:
# Label rows with 1-based sequence positions. Assigning an absolute
# RangeIndex (instead of incrementing the current index) keeps this cell
# idempotent: re-running it does not shift the positions a second time.
df_res.index = pd.RangeIndex(start=1, stop=len(df_res) + 1)

In [30]:
# Persist the per-position overlap table, including its index column.
df_res.to_csv(path_or_buf='library_design_overlaps.csv')