# Set Membership

The cell below defines two **abstract classes**: the first represents a set and basic insert/search operations on it. You will need to implement this API four times, to implement (1) sequential search, (2) binary search tree, (3) balanced search tree, and (4) bloom filter. The second defines the synthetic data generator you will need to implement as part of your experimental framework. <br><br>**Do NOT modify the next cell** - use the dedicated cells further below for your implementation instead. <br>

In [1]:
# DO NOT MODIFY THIS CELL

from abc import ABC, abstractmethod  

# abstract class to represent a set and its insert/search operations
# Every method below is decorated with @abstractmethod, so a subclass has to
# override all three before it can be instantiated.
class AbstractSet(ABC):
    
    # constructor
    # (placeholder body; subclasses set up their own storage here)
    @abstractmethod
    def __init__(self):
        pass           
        
    # inserts "element" in the set
    # returns "True" after successful insertion, "False" if the element is already in the set
    # element : str
    # inserted : bool
    # (the placeholder body below always returns False)
    @abstractmethod
    def insertElement(self, element):     
        inserted = False
        return inserted   
    
    # checks whether "element" is in the set
    # returns "True" if it is, "False" otherwise
    # element : str
    # found : bool
    # (the placeholder body below always returns False)
    @abstractmethod
    def searchElement(self, element):
        found = False
        return found    
    
    
    
# abstract class to represent a synthetic data generator
# Both methods are abstract, so a concrete generator must override them.
class AbstractTestDataGenerator(ABC):
    
    # constructor
    # (placeholder body; nothing is initialised here)
    @abstractmethod
    def __init__(self):
        pass           
        
    # creates and returns a list of length "size" of strings
    # size : int
    # data : list<str>
    # (the placeholder body below returns "size" empty strings)
    @abstractmethod
    def generateData(self, size):     
        data = [""]*size
        return data   


Use the cell below to define any auxiliary data structure and python function you may need. Leave the implementation of the main API to the next code cells instead.

In [2]:
# ADD AUXILIARY DATA STRUCTURE DEFINITIONS AND HELPER CODE HERE
from bitarray import bitarray
import timeit

class BinaryTree:
    """Node of a plain (unbalanced) binary search tree."""

    def __init__(self, element):
        # Payload stored in this node.
        self.element = element
        # A new node starts as a leaf: neither child is attached yet.
        self.left, self.right = None, None

class Node():
    """Node of a left-leaning red-black tree, bundled with the tree operations.

    Colour convention: ``False`` is black, ``True`` is red.
    """

    def __init__(self, key, color):
        self.key = key
        self.left = None
        self.right = None
        # Black is False, red is True.
        self.color = color

    def searchElement(self, element):
        """Return True when ``element`` is stored in the subtree rooted here."""
        current = self
        while True:
            if current.key == element:
                return True
            if element < current.key and current.left:
                current = current.left
            elif element > current.key and current.right:
                current = current.right
            else:
                # Nowhere left to descend: the key is absent.
                return False

    def insertElement(self, element):
        """Insert below this node; return ``(inserted, new_subtree_root)``."""
        return self.put(self, element)

    def put(self, n, element):
        """Recursively insert ``element`` under ``n`` and rebalance on the way up.

        Returns a pair ``(inserted, root)`` where ``root`` is the node that now
        heads this subtree (rotations may have replaced ``n``).
        """
        inserted = False
        if element == n.key:
            # Duplicate key: nothing is added.
            pass
        elif element < n.key:
            if n.left is None:
                # New nodes are always attached red.
                n.left, inserted = Node(element, True), True
            else:
                inserted, n.left = self.put(n.left, element)
        elif element > n.key:
            if n.right is None:
                n.right, inserted = Node(element, True), True
            else:
                inserted, n.right = self.put(n.right, element)

        # Restore the left-leaning invariants, in this exact order.
        if self.isRed(n.right) and not self.isRed(n.left):
            n = self.rotateLeft(n)       # lone red link leaning right
        if self.isRed(n.left) and self.isRed(n.left.left):
            n = self.rotateRight(n)      # two red links in a row on the left
        if self.isRed(n.right) and self.isRed(n.left):
            n = self.flipColor(n)        # both children red: push red upward
        return inserted, n

    def rotateLeft(self, n):
        """Rotate the right child of ``n`` above it; return the new subtree root."""
        pivot = n.right
        n.right, pivot.left = pivot.left, n
        # The pivot inherits n's colour; n becomes the red child.
        pivot.color, n.color = n.color, True
        return pivot

    def rotateRight(self, n):
        """Rotate the left child of ``n`` above it; return the new subtree root."""
        pivot = n.left
        n.left, pivot.right = pivot.right, n
        pivot.color, n.color = n.color, True
        return pivot

    def flipColor(self, n):
        """Make ``n`` red and both of its children black; return ``n``."""
        n.color = True
        n.left.color = n.right.color = False
        return n

    def isRed(self, n):
        """Return the colour flag of ``n``; a missing node counts as black."""
        return False if n is None else n.color
    
def merge(a, b):
    """Merge two ascending sequences into one new ascending list.

    When the two heads compare equal, the element from ``b`` is taken first.
    """
    merged = []
    left_pos = right_pos = 0
    left_len, right_len = len(a), len(b)
    while left_pos < left_len and right_pos < right_len:
        if a[left_pos] < b[right_pos]:
            merged.append(a[left_pos])
            left_pos += 1
        else:
            merged.append(b[right_pos])
            right_pos += 1
    # At most one input still has elements; copy its tail over.
    if left_pos < left_len:
        leftover = a[left_pos:]
    else:
        leftover = b[right_pos:]
    merged.extend(leftover)
    return merged

def mergesort(list):
    """Return the items of ``list`` in ascending order using merge sort.

    Inputs with fewer than two items are returned as-is (the same object).
    The parameter name shadows the builtin; it is kept for compatibility.
    """
    half = len(list) // 2
    if not half:
        return list
    lower = mergesort(list[:half])
    upper = mergesort(list[half:])
    return merge(lower, upper)

def _isPrimeByTrialDivision(n):
    """Return True when the integer ``n`` is prime (trial division up to sqrt(n))."""
    if n < 2:
        return False
    if n % 2 == 0:
        return n == 2
    divisor = 3
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 2
    return True


def nearestLowerPrime(x):
    """Return the largest prime that is less than or equal to ``x``.

    Returns -1 when ``x`` is below 2, since no prime exists in that range.

    The search walks downward from ``x`` and tests each candidate by trial
    division. This avoids allocating an (x + 1)-entry sieve just to find one
    prime, which matters because this is called when a Bloom filter is sized.
    """
    if x < 2:
        return -1
    candidate = int(x)
    # 2 is prime, so the loop always terminates for x >= 2.
    while not _isPrimeByTrialDivision(candidate):
        candidate -= 1
    return candidate

def measureTime(func, setImplementation, data):
    """Run ``func(setImplementation, data)`` once and return the elapsed seconds.

    The duration is the difference of two ``timeit.default_timer()`` readings
    taken immediately before and after the call.
    """
    began = timeit.default_timer()
    func(setImplementation, data)
    finished = timeit.default_timer()
    elapsed = finished - began
    return elapsed

def insertion(set_implementation, insert_data):
    """Insert every item of ``insert_data`` into ``set_implementation``.

    A Bloom filter is first re-sized for ``len(insert_data)`` items via
    ``setArraySize``, which allocates a fresh, zeroed bit array.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses.
    if isinstance(set_implementation, BloomFilterSet):
        set_implementation.setArraySize(len(insert_data))
    for item in insert_data:
        set_implementation.insertElement(item)
        
def search(set_implementation, search_data):
    """Look up every item of ``search_data``, discarding the results.

    Used as the timed workload for search benchmarks.
    """
    for query in search_data:
        set_implementation.searchElement(query)
        

def searchChecker(set_implementation, search_data):
    """Return the ``searchElement`` result for each item of ``search_data``, in order."""
    return [set_implementation.searchElement(query) for query in search_data]


def extractRealData(fileName, searchFile = False):
    """Read a test file and return its contents as a list of strings.

    fileName   : path of the text file to read
    searchFile : when True, every line (stripped of surrounding whitespace)
                 becomes one entry; when False, every line is split into
                 whitespace-separated words and all words are collected.

    Words are split on any whitespace run, so line endings never stay attached
    to the last word of a line and repeated spaces produce no empty entries.
    """
    data = []
    # The context manager guarantees the file is closed after reading.
    with open(fileName, 'r') as handle:
        for line in handle:
            if searchFile:
                data.append(line.strip())
            else:
                data.extend(line.split())
    return data

def checkFpr(bloomFilter, insertData, searchData):
    """Print the false positive rate of ``bloomFilter`` on ``searchData``.

    A BalancedSearchTreeSet filled with ``insertData`` provides the reference
    membership answers that the Bloom filter's answers are compared against.
    """
    reference = BalancedSearchTreeSet()
    insertion(reference, insertData)
    bloom_answers = searchChecker(bloomFilter, searchData)
    reference_answers = searchChecker(reference, searchData)
    fpr = falsePositiveRate(bloom_answers, reference_answers)
    print("False positive rate is: ", fpr)

# Tests time taken on different numbers of strings.
def generate_1():
    """Return prefixes of one random data set, one prefix per test size."""
    sizes = [5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    # Generate the largest sample once; every smaller sample is a prefix of it.
    pool = TestDataGenerator().generateData(max(sizes), 5, 10)
    return [pool[:count] for count in sizes]

def synthetic_1(set_implementation):
    """Time insertion and search for increasing data sizes.

    A new, empty set of the same class as ``set_implementation`` is created
    for every size, so each measurement starts from an empty set instead of
    one still holding the data of the previous (smaller) size.
    Sequential search is skipped above 100000 items. For Bloom filters the
    false positive rate is printed as well.
    """
    set_class = type(set_implementation)
    for sample in generate_1():
        if isinstance(set_implementation, SequentialSearchSet) and len(sample) > 100000:
            continue
        fresh_set = set_class()
        print("Time taken for insertion:", measureTime(insertion, fresh_set, sample), "at size", len(sample))
        print("Time taken for search:", measureTime(search, fresh_set, sample), "at size", len(sample))
        if isinstance(fresh_set, BloomFilterSet):
            checkFpr(fresh_set, sample, sample)
            

# Tests time taken with different orders: ascending, descending, random.
def generate_2():
    """Return one random data set in ascending, descending and original order."""
    shuffled = TestDataGenerator().generateData(100000, 5, 10)
    rising = mergesort(shuffled)
    falling = rising[::-1]
    return [rising, falling, shuffled]

def synthetic_2(set_implementation):
    """Time insertion and search for ascending, descending and random input order.

    Each ordering is measured on a new, empty set of the same class as
    ``set_implementation``. The three orderings contain the same strings, so
    reusing one set would make the second and third runs insert only duplicates.
    """
    order = ["ascending order", "descending order", "random order"]
    set_class = type(set_implementation)
    for label, sample in zip(order, generate_2()):
        fresh_set = set_class()
        print("Time taken for insertion:", measureTime(insertion, fresh_set, sample), "at", label)
        print("Time taken for search:", measureTime(search, fresh_set, sample), "at", label)
        if isinstance(fresh_set, BloomFilterSet):
            checkFpr(fresh_set, sample, sample)


# Tests time taken with strings of different length.

def generate_3():
    """Return one 100000-string data set per (min length, max length) pair."""
    length_ranges = [(3,8), (5,10), (10,15), (15,20), (20,25), (30,35), (40,45)]
    return [
        TestDataGenerator().generateData(100000, shortest, longest)
        for shortest, longest in length_ranges
    ]


def synthetic_3(set_implementation):
    """Time insertion and search for data sets of different string lengths.

    Each length range is measured on a new, empty set of the same class as
    ``set_implementation``, so earlier data sets do not enlarge the set that
    later ones are timed against.
    """
    testing_size = [(3,8), (5,10), (10,15), (15,20), (20,25), (30,35), (40,45)]
    set_class = type(set_implementation)
    for (shortest, longest), sample in zip(testing_size, generate_3()):
        fresh_set = set_class()
        print("Time taken for insertion:", measureTime(insertion, fresh_set, sample),"at string length", shortest, "to", longest)
        print("Time taken for search:", measureTime(search, fresh_set, sample), "at string length", shortest, "to", longest)
        if isinstance(fresh_set, BloomFilterSet):
            checkFpr(fresh_set, sample, sample)

# Tests time taken with different proportions of duplicates.
def generate_4():
    """Return shuffled 100000-item data sets with a growing share of duplicates.

    Each data set combines a prefix of the unique pool with a prefix of the
    duplicate pool that is included twice.
    """
    duplicate_number = [0, 10000, 20000, 30000, 40000, 50000]
    non_duplicate_number = [100000, 80000, 60000, 40000, 20000, 0]
    unique_pool = TestDataGenerator().generateData(100000, 5, 10)
    repeated_pool = TestDataGenerator().generateData(50000, 5, 10)
    result = []
    for unique_count, repeat_count in zip(non_duplicate_number, duplicate_number):
        mixed = unique_pool[:unique_count] + repeated_pool[:repeat_count] * 2
        random.shuffle(mixed)
        result.append(mixed)
    return result

def synthetic_4(set_implementation):
    """Time insertion and search for different proportions of duplicates.

    Each proportion is measured on a new, empty set of the same class as
    ``set_implementation``. The data sets share prefixes of the same pools,
    so reusing one set would turn most later insertions into duplicates.
    """
    non_duplicate_number = [100000, 80000, 60000, 40000, 20000, 0]
    set_class = type(set_implementation)
    for unique_count, sample in zip(non_duplicate_number, generate_4()):
        proportion = (100000 - unique_count)/100000
        fresh_set = set_class()
        print("Time taken for insertion:", measureTime(insertion, fresh_set, sample),"at duplicate proportion", proportion)
        print("Time taken for search:", measureTime(search, fresh_set, sample), "at duplicate proportion", proportion)
        if isinstance(fresh_set, BloomFilterSet):
            checkFpr(fresh_set, sample, sample)
        

# Tests time taken with different percentages of None values.
def generate_5():
    """Return shuffled 100000-item data sets with a growing number of None entries."""
    empty_number = [0, 500, 1000, 5000, 10000, 15000, 20000, 30000, 40000]
    pool = TestDataGenerator().generateData(100000, 5, 10)
    result = []
    for none_count in empty_number:
        # Replace the tail of the pool with the requested number of None values.
        mixed = pool[:100000 - none_count] + [None] * none_count
        random.shuffle(mixed)
        result.append(mixed)
    return result

def synthetic_5(set_implementation):
    """Time insertion and search for data sets containing None values.

    Each data set is measured on a new, empty set of the same class as
    ``set_implementation``. The data sets share prefixes of the same pool, so
    reusing one set would turn most later insertions into duplicates.
    """
    empty_number = [0, 500, 1000, 5000, 10000, 15000, 20000, 30000, 40000]
    set_class = type(set_implementation)
    for none_count, sample in zip(empty_number, generate_5()):
        fresh_set = set_class()
        print("Time taken for insertion:", measureTime(insertion, fresh_set, sample),"at empty number", none_count)
        print("Time taken for search:", measureTime(search, fresh_set, sample), "at empty number", none_count)
        if isinstance(fresh_set, BloomFilterSet):
            checkFpr(fresh_set, sample, sample)
        
# Tests time taken in searching values that are not in the set.
def generate_6():
    """Return ``(insert_data, search_sets)`` for the absent-value experiment.

    Every search set has 100000 shuffled items: a prefix of the inserted data
    topped up with strings from a second, independently generated pool.
    """
    absent_number = [0,20000,40000,60000,80000,100000]
    insert_data = TestDataGenerator().generateData(100000, 5, 10)
    other_pool = TestDataGenerator().generateData(100000, 5, 10)
    result = []
    for absent_count in absent_number:
        mixed = insert_data[:100000 - absent_count] + other_pool[:absent_count]
        random.shuffle(mixed)
        result.append(mixed)
    return insert_data, result

def synthetic_6(set_implementation):
    """Time searches that contain a growing number of values not in the set.

    The set is filled once with the insert data, then each search set is
    timed against it. For Bloom filters the false positive rate is checked
    against a reference built from the inserted data, so search values that
    were never inserted count as true negatives.
    """
    absent_number = [0,20000,40000,60000,80000,100000]
    insert, searches = generate_6()
    insertion(set_implementation, insert)
    for absent_count, queries in zip(absent_number, searches):
        print("Time taken for search:", measureTime(search,set_implementation, queries), "at searching absent number", absent_count)
        if isinstance(set_implementation, BloomFilterSet):
            # The reference must hold what was inserted, not what is searched.
            checkFpr(set_implementation, insert, queries)

def blockSeperator():
    """Print a horizontal rule of 88 dashes to separate output sections."""
    rule = "".join("-" for _ in range(88))
    print(rule)

def ceil(x):
    """Return the smallest integer that is greater than or equal to ``x``.

    ``int(x)`` truncates toward zero, so it is already the ceiling for
    negative values and for exact integers; only a positive fractional part
    requires rounding up by one.
    """
    truncated = int(x)
    if x > truncated:
        return truncated + 1
    return truncated
    
def falsePositiveRate(bfResults, correctResults ):
    """Return the false positive rate of the Bloom filter answers.

    bfResults      : per-query answers of the Bloom filter (truthy = "present")
    correctResults : per-query ground truth (truthy = really present)

    The rate is (false positives) / (queries that are really absent), i.e.
    FP / (FP + TN). When no query is really absent there is nothing that could
    be a false positive, so 0.0 is returned instead of dividing by zero.
    """
    false_positives = 0
    actual_negatives = 0
    for reported, truth in zip(bfResults, correctResults):
        if not truth:
            actual_negatives += 1
            if reported:
                false_positives += 1
    if actual_negatives == 0:
        return 0.0
    return false_positives / actual_negatives

Use the cell below to implement the requested API by means of **sequential search**.

In [3]:
class SequentialSearchSet(AbstractSet):
    """Set backed by an unsorted Python list and searched linearly."""

    def __init__(self):
        # Stored elements, in insertion order, without duplicates.
        self.elements = []

    def insertElement(self, element):
        """Append ``element`` unless it is already stored.

        Returns True after a successful insertion, False for a duplicate.
        """
        if self.searchElement(element):
            return False
        self.elements.append(element)
        return True

    def searchElement(self, element):
        """Return True when ``element`` is stored, False otherwise.

        ``in`` on a list compares the items one by one from the front, which
        is the sequential search this class is meant to implement.
        """
        return element in self.elements

Use the cell below to implement the requested API by means of **binary search tree**.

In [4]:
class BinarySearchTreeSet(AbstractSet):
    """Set stored in an unbalanced binary search tree of BinaryTree nodes."""

    def __init__(self):
        # Root node of the tree; None while the set is empty.
        self.root = None

    def insertElement(self, element):
        """Insert ``element``; return True if it was added, False if already present."""
        fresh = BinaryTree(element)

        if self.root is None:
            self.root = fresh
            return True

        cursor = self.root
        while cursor:
            if element == cursor.element:
                return False
            if element < cursor.element:
                if not cursor.left:
                    # Free slot on the left: attach the new leaf here.
                    cursor.left = fresh
                    return True
                cursor = cursor.left
            else:
                if not cursor.right:
                    cursor.right = fresh
                    return True
                cursor = cursor.right
        return False

    def searchElement(self, element):
        """Return True when ``element`` is in the tree, False otherwise."""
        cursor = self.root
        while cursor:
            if element == cursor.element:
                return True
            cursor = cursor.left if element < cursor.element else cursor.right
        return False

Use the cell below to implement the requested API by means of **balanced search tree**.

In [5]:
class BalancedSearchTreeSet(AbstractSet):
    """Set stored in a left-leaning red-black tree built from Node objects."""

    def __init__(self):
        # Root of the tree; None while the set is empty.
        self.node = None

    def insertElement(self, element):
        """Insert ``element``; return True if it was added, False otherwise.

        None is rejected and reported as not inserted.
        """
        if element is None:
            return False
        if self.node is None:
            # First element: it becomes the (black) root and counts as inserted.
            self.node = Node(element, False)
            return True
        inserted, self.node = self.node.insertElement(element)
        # Keep the root black, as the red-black invariants require.
        self.node.color = False
        return inserted

    def searchElement(self, element):
        """Return True when ``element`` is in the set, False otherwise.

        None and lookups on an empty set both yield False.
        """
        if element is None or self.node is None:
            return False
        return self.node.searchElement(element)

Use the cell below to implement the requested API by means of **bloom filter**.

In [6]:


class BloomFilterSet(AbstractSet):
    """Approximate set backed by a Bloom filter over a bitarray.

    Lookups can report false positives but never false negatives. The filter
    is sized for an expected number of elements with a 1% target false
    positive rate.
    """

    # Bases of the polynomial hash functions; the last ``k`` entries are used.
    _HASH_BASES = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37]

    def __init__(self):
        self.setArraySize()

    def setArraySize(self, n = 500000):
        """(Re)allocate an empty filter sized for ``n`` expected elements.

        Uses the standard Bloom filter formulas with the logarithms written
        out as constants (the math log function could not be used):
            m = n * ln(1/p) / (ln 2)^2   with p = 0.01, ln(100) = 4.60517018599
            k = (m / n) * ln 2           with ln 2 = 0.69314718056
        ``m`` is then lowered to the nearest prime so it can serve as modulus.
        Calling this discards every element inserted so far.
        """
        # Guard against n <= 0, which would give an empty array and divide by zero.
        self.n = max(1, n)
        wanted_bits = ceil(self.n * 4.60517018599 / 0.4804530139)
        self.m = nearestLowerPrime(max(2, int(wanted_bits)))
        self.store = bitarray(self.m)
        self.store.setall(0)
        optimal_k = round((self.m / self.n) * 0.69314718056)
        # At least one hash function, and no more than there are bases.
        self.k = min(len(self._HASH_BASES), max(1, optimal_k))

    def polynomial_hashing(self, key):
        """Return the ``k`` bit positions of ``key``.

        Each position is sum(base**j * ord(key[j])) mod m for one base. The
        power is carried along modulo m instead of recomputing base**j, which
        yields the same residue without building huge integers.
        """
        indices = []
        for base in self._HASH_BASES[-self.k:]:
            hashVal = 0
            power = 1
            for character in key:
                hashVal = (power * ord(character) + hashVal) % self.m
                power = (power * base) % self.m
            indices.append(hashVal)
        return indices

    def insertElement(self, element):
        """Set the bits of ``element``.

        Returns True when at least one bit was newly set (the element was not
        yet present) and False when all bits were already set, i.e. the
        element is already in the set or collides with earlier insertions.
        None is rejected and reported as not inserted.
        """
        if element is None:
            return False
        inserted = False
        for position in self.polynomial_hashing(element):
            if not self.store[position]:
                self.store[position] = 1
                inserted = True
        return inserted

    def searchElement(self, element):
        """Return True when all bits of ``element`` are set, False otherwise."""
        if element is None:
            return False
        return all(self.store[position] for position in self.polynomial_hashing(element))
  

Use the cell below to implement the **synthetic data generator** as part of your experimental framework.

In [7]:
import string
import random

class TestDataGenerator(AbstractTestDataGenerator):
    """Generator of random strings made of ASCII letters."""

    def __init__(self):
        pass

    def generateData(self, size, m=5, n=10):
        """Return a list of ``size`` random strings.

        size : number of strings to generate
        m, n : inclusive bounds for the length of each string; the defaults
               keep the one-argument call declared by the abstract API working

        Every string consists of characters drawn with replacement from
        ``string.ascii_letters``.
        """
        return [
            ''.join(random.choices(string.ascii_letters, k=random.randint(m, n)))
            for _ in range(size)
        ]



Use the cells below for the python code needed to **fully evaluate your implementations**, first on real data and subsequently on synthetic data (i.e., read data from test files / generate synthetic one, instantiate each of the 4 set implementations in turn, then thoroughly experiment with insert/search operations and measure their performance).

In [8]:
import timeit

# ADD YOUR TEST CODE HERE TO WORK ON REAL DATA
# The four set classes under test (the classes themselves, not instances);
# the test loops call each entry to create a set.
implementations = [SequentialSearchSet, BinarySearchTreeSet, BalancedSearchTreeSet, BloomFilterSet]

def realDataTesting(setImplementation):
    """Benchmark one set class on the three real-text files.

    For every file a new set is created, filled with the words of the file
    and then queried with the entries of 'test-search.txt'; both durations
    are printed. For the Bloom filter the false positive rate is printed too.
    """
    searchData = extractRealData('test-search.txt', True)
    for fileName in ("test1-mobydick.txt", "test2-warpeace.txt", "test3-dickens.txt"):
        insertData = extractRealData(fileName)
        currentSet = setImplementation()
        insertionTime = measureTime(insertion, currentSet, insertData)
        searchTime = measureTime(search, currentSet, searchData)
        print("For ", fileName, " insertion takes ", insertionTime, " search takes ", searchTime)
        if setImplementation == BloomFilterSet:
            checkFpr(currentSet, insertData, searchData)
        
# Run the real-data benchmark for every set class, framing each report
# with a separator line above and below.
for implementation in implementations:
    blockSeperator()
    print('For ', implementation.__name__)
    realDataTesting(implementation)
    blockSeperator()
    


----------------------------------------------------------------------------------------
For  SequentialSearchSet


KeyboardInterrupt: 

In [None]:
import timeit

# ADD YOUR TEST CODE HERE TO WORK ON SYNTHETIC DATA
# Each entry pairs a benchmark function with the description printed for it.
# NOTE(review): synthetic_5 (data containing None values) is defined in the
# helper cell but is not listed here - confirm whether that is intentional.
tests = [
    (synthetic_1, 'different number of strings'),
    (synthetic_2, 'different orders: ascending, descending, random'),
    (synthetic_3, 'strings of different lengths'),
    (synthetic_4, 'different proportions of duplicates'),
    (synthetic_6, 'search values that are not in the set')
]

"""tests is a list of tuples with test[0] being the testing function and test[1] being a string explaining 
what is being tested"""

# Run every benchmark against a newly created instance of every set class.
for test in tests:
    for implementation in implementations:
        blockSeperator()
        print('Time taken with ', test[1], 'for ', implementation.__name__ , ':')
        test[0](implementation())
        blockSeperator()