In [1]:
import sys
import unittest

from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt

sys.path.append("../oligo_designer_toolsuite")

from oligo_designer_toolsuite.oligo_property_filter._property_filter import (
    PropertyFilter,

)

In [2]:
from oligo_designer_toolsuite.oligo_property_filter._filter_base import (
    GCContent,
    MaskedSequences,
    MeltingTemperature,
    ConsecutiveRepeats,
    GCClamp
)

class TestPreFilterBase(unittest.TestCase):
    """Test that the filtering classes return the corrected output when given in input a sequence"""

    @classmethod
    def setUpClass(self):
        """Define the filter classes and their parameters"""

        self.ConsecutiveRepeats = ConsecutiveRepeats(3)

        filters = [
            self.ConsecutiveRepeats
        ]
        self.pre_filter = PropertyFilter(filters=filters)

    def test_negative_consecutive_repeats(self):
        "Tests that the consecutive repeat filter returns false"
        fulfills, _ = self.ConsecutiveRepeats.apply(Seq("CTTGGGCCTTTCCAAGCCCCCATTTGAGCT"))
        self.assertEqual(
            fulfills,
            False,
            "A sequence not fulfilling the consecutive repeat has been accepted!",
        )

    def test_positive_consecutive_repeats(self):
        "Tests that the consecutive repeat filter returns true"
        fulfills, _ = self.ConsecutiveRepeats.apply(Seq("CTTGGGCCTTTCCAAGCCCATTTGAGCT"))
        self.assertEqual(
            fulfills,
            True,
            "A sequence fulfilling the consecutive repeat has been accepted!",
        )


In [3]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_negative_consecutive_repeats (__main__.TestPreFilterBase)
Tests that the consecutive repeat filter returns false ... ok
test_positive_consecutive_repeats (__main__.TestPreFilterBase)
Tests that the consecutive repeat filter returns true ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.010s

OK


<unittest.main.TestProgram at 0x7faf07fbbac0>

In [17]:
from scipy.spatial.distance import hamming
from numpy import random


def generate_codebook(num_seq: int, encoding_scheme: str):
    '''
    encoding_scheme: MHD2 or MHD4
    This function is using generator generate_binary_sequences() to return a list of encoding codes

    Params:
    n_bit: Number of bits in a code
    n_one: Number of ones in a code
    n_seq: Number of codes in a list
    '''


    # Initialize the list of sequences
    sequences = []

    if (encoding_scheme == "MHD4") & (num_seq <= 140):  # upper limit is 140
        num_ones = 4
        num_bits = 16
        hamming_distance = 4
        # Generate sequences until the desired number of sequences is reached
        while len(sequences) < num_seq:
            # Generate a new sequence with the desired number of 1s
            sequence = [1] * num_ones + [0] * (num_bits - num_ones)
            random.shuffle(sequence)

            # Check if the new sequence meets the minimum distance requirement
            if all(hamming_distance <= hamming(sequence, seq) * num_bits for seq in sequences):
                # Add the new sequence to the list of sequences
                sequences.append(sequence)

    elif (encoding_scheme == "MHD2") & (num_seq <= 1001): # upper limit is 1001
        num_ones = 4
        num_bits = 14
        min_hamming_distance = 2
        # Generate sequences until the desired number of sequences is reached
        while len(sequences) < num_seq:
            # Generate a new sequence with the desired number of 1s
            sequence = [1] * num_ones + [0] * (num_bits - num_ones)
            random.shuffle(sequence)

            # Check if the new sequence meets the minimum distance requirement
            if all(min_hamming_distance <= hamming(sequence, seq) * num_bits for seq in sequences):
                # Add the new sequence to the list of sequences
                sequences.append(sequence)
    # Convert the list of sequences to a list of strings and return it
    return ["".join(str(bit) for bit in sequence) for sequence in sequences]


In [7]:
generate_codebook(140,"MHD2")

['10000010000101',
 '11010100000000',
 '10001000010100',
 '11010001000000',
 '00010001001100',
 '00100000001101',
 '01000000001110',
 '10100100001000',
 '10001001000100',
 '00110010000001']

In [18]:
from scipy.spatial.distance import hamming

#define arrays
x = [0, 1, 1, 1, 0, 1]
y = [0, 0, 1, 1, 0, 0]

#calculate Hamming distance between the two arrays
hamming(x, y) * len(x)



seq1 = [0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0]
seq2 = [0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0]
seq3 = [0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0]
seq4 = [1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0]
print(hamming(seq1, seq2)*len(seq1))
print(hamming(seq1, seq3)*len(seq1))
print(hamming(seq1, seq4)*len(seq1))
print(hamming(seq2, seq3)*len(seq1))
print(hamming(seq2, seq4)*len(seq1))
print(hamming(seq3, seq4)*len(seq1))

4.0
8.0
8.0
4.0
6.0
4.0


In [67]:


def bits_to_list(bit_string):
    bit_list = [int(bit) for bit in bit_string]
    return bit_list
x = generate_codebook(140,"MHD2")
seq1 = bits_to_list(x[139])
seq2 = bits_to_list(x[1])
seq3 = bits_to_list(x[33])
seq4 = bits_to_list(x[4])
print(hamming(seq1, seq2)*len(seq1))
print(hamming(seq1, seq3)*len(seq1))
print(hamming(seq1, seq4)*len(seq1))
print(hamming(seq2, seq3)*len(seq1))
print(hamming(seq2, seq4)*len(seq1))
print(hamming(seq3, seq4)*len(seq1))

4.0
4.0
8.0
8.0
8.0
8.0


In [66]:
generate_codes(1)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [70]:
# latest_tutorial.py

import time
import pandas as pd
import random

"""Print the latest tutorial from Real Python"""
runtime = []
for i in range(500):
    tic = time.perf_counter()
    tutorial = generate_codebook(i,"MHD2")
    toc = time.perf_counter()
    runtime.append(toc - tic)
    #print(i)

df = pd.DataFrame(runtime)
df.plot()



KeyboardInterrupt: 

In [72]:
tic = time.perf_counter()
tutorial = generate_codebook(1,"MHD4")
toc = time.perf_counter()
print(toc-tic)
tutorial


9.782300003280398e-05


['1000010100000100']

In [79]:
def generate_hamming_codes(seq_num):
    codes = []
    # Generate all 4-bit codes
    for i in range(2**4):
        code = bin(i)[2:].zfill(4)
        # Calculate parity bit
        parity = str((int(code[0]) ^ int(code[1]) ^ int(code[2]) ^ int(code[3])))
        # Append parity bit and convert to integer
        code_int = int(code + parity, 2)
        codes.append(code_int)
    # Remove codes with less than 4 ones
    codes = [code for code in codes if bin(code).count('1') == 4]
    # Calculate Hamming distance between all pairs of codes
    hamming_dist = [[bin(x ^ y).count('1') for y in codes] for x in codes]
    # Check that minimum Hamming distance is 4
    while True:
        if min(min(row) for row in hamming_dist) >= 4:
            break
        else:
            # If minimum Hamming distance is less than 4, remove a code and try again
            codes.remove(codes[hamming_dist.index(min(hamming_dist, key=min))])
            hamming_dist = [[bin(x ^ y).count('1') for y in codes] for x in codes]
    # Return the first seq_num codes
    return codes[:seq_num]

In [104]:
def hamming_code(seq_num:int,encoding_scheme:str):
    # initialize an empty list to store the codes
    codes = []
    pass_sequences = []
    if encoding_scheme == "MHD4":
        # loop through all possible combinations of 16 bits
        for i in range(2**16):
            # convert the integer i to a binary string of length 16
            bin_str = format(i, '016b')
            # count the number of ones in the binary string
            num_ones = bin_str.count('1')
            # if the number of ones is exactly 4
            if num_ones == 4:
                # append the binary string to the codes list
                codes.append(bin_str)
        # loop through all pairs of codes in the list

        for i in range(len(codes)):
            for j in range(i+1, len(codes)):

                if len(pass_sequences) == 140 or len(pass_sequences) == seq_num:
                    break

                # compute the Hamming distance between two codes
                ham_dist = sum(a != b for a, b in zip(codes[i], codes[j]))
                # if the Hamming distance is less than 4
                if ham_dist >= 4:
                    if len(pass_sequences) == 0:
                        pass_sequences.append(codes[i])
                    # remove one of the codes from the list
                    else:
                        ham_list = []
                        for pass_seq in pass_sequences:
                            ham_dist = sum(a != b for a, b in zip(pass_seq, codes[j]))
                            ham_list.append(ham_dist)

                        if all(4 <= ham for ham in ham_list):
                            pass_sequences.append(codes[j])
                            print(len(pass_sequences))
                            break
            if len(pass_sequences) == 140 or len(pass_sequences) == seq_num:
                break
                    # break out of the inner loop

    if encoding_scheme == "MHD2":
                # loop through all possible combinations of 14 bits
        for i in range(2**14):
            # convert the integer i to a binary string of length 16
            bin_str = format(i, '016b')
            # count the number of ones in the binary string
            num_ones = bin_str.count('1')
            # if the number of ones is exactly 4
            if num_ones == 4:
                # append the binary string to the codes list
                codes.append(bin_str)
        # loop through all pairs of codes in the list

        for i in range(len(codes)):
            for j in range(i+1, len(codes)):

                if len(pass_sequences) == 1001 or len(pass_sequences) == seq_num:
                    break
                # compute the Hamming distance between two codes
                ham_dist = sum(a != b for a, b in zip(codes[i], codes[j]))
                # if the Hamming distance is less than 2
                if ham_dist >= 2:
                    if len(pass_sequences) == 0:
                        pass_sequences.append(codes[i])
                    # remove one of the codes from the list
                    else:
                        ham_list = []
                        for pass_seq in pass_sequences:
                            ham_dist = sum(a != b for a, b in zip(pass_seq, codes[j]))
                            ham_list.append(ham_dist)
                        # check passed sequences
                        if all(2 <= ham for ham in ham_list):
                            pass_sequences.append(codes[j])
                            print(len(pass_sequences))
                            break
                    # break out of the inner loop
            if len(pass_sequences) == 1001 or len(pass_sequences) == seq_num:
                break
    return pass_sequences

In [118]:
#x = hamming_code(140,"MHD2")
x[1]

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140


'0000000000011011'

In [67]:
x[10]

'0000000010100110'

In [117]:
import random

from scipy.spatial.distance import hamming
#num = len(hamming_code())
#x = hamming_code()
def bits_to_list(bit_string):
    bit_list = [int(bit) for bit in bit_string]
    return bit_list

random_num = random.sample(range(140),4)
seq1 = bits_to_list(x[random_num[0]])
seq2 = bits_to_list(x[random_num[1]])
seq3 = bits_to_list(x[random_num[2]])
seq4 = bits_to_list(x[random_num[3]])
print(hamming(seq1, seq2)*len(seq1))
print(hamming(seq1, seq3)*len(seq1))
print(hamming(seq1, seq4)*len(seq1))
print(hamming(seq2, seq3)*len(seq1))
print(hamming(seq2, seq4)*len(seq1))
print(hamming(seq3, seq4)*len(seq1))

6.0
2.0
8.0
6.0
4.0
6.0
