# Set Membership

The cell below defines two **abstract classes**: the first represents a set and basic insert/search operations on it. You will need to impement this API four times, to implement (1) sequential search, (2) binary search tree, (3) balanced search tree, and (4) bloom filter. The second defines the synthetic data generator you will need to implement as part of your experimental framework. <br><br>**Do NOT modify the next cell** - use the dedicated cells further below for your implementation instead. <br>

In [1]:
# DO NOT MODIFY THIS CELL

from abc import ABC, abstractmethod  

# abstract class to represent a set and its insert/search operations
class AbstractSet(ABC):
    
    # constructor
    @abstractmethod
    def __init__(self):
        pass           
        
    # inserts "element" in the set
    # returns "True" after successful insertion, "False" if the element is already in the set
    # element : str
    # inserted : bool
    @abstractmethod
    def insertElement(self, element):     
        inserted = False
        return inserted   
    
    # checks whether "element" is in the set
    # returns "True" if it is, "False" otherwise
    # element : str
    # found : bool
    @abstractmethod
    def searchElement(self, element):
        found = False
        return found    
    
    
    
# abstract class to represent a synthetic data generator
class AbstractTestDataGenerator(ABC):
    
    # constructor
    @abstractmethod
    def __init__(self):
        pass           
        
    # creates and returns a list of length "size" of strings
    # size : int
    # data : list<str>
    @abstractmethod
    def generateData(self, size):     
        data = [""]*size
        return data   


Use the cell below to define any auxiliary data structure and python function you may need. Leave the implementation of the main API to the next code cells instead.

In [2]:
# ADD AUXILIARY DATA STRUCTURE DEFINITIONS AND HELPER CODE HERE

class Node():
    def __init__(self, key, color):
        self.key = key
        self.left = None 
        self.right = None 
        #黑色为false，红色为true
        self.color = color 
        
    def searchElement(self, element):     
        found = False
        if self.key == element:
            found = True
        elif element < self.key and self.left:
            return self.left.searchElement(element)
        elif element > self.key and self.right:
            return self.right.searchElement(element)
        return found    
    
    def insertElement(self, element):
        return self.put(self, element)
    
    def put(self, n, element):
        inserted = False
        if element == n.key:
            inserted = False
        elif element < n.key:
            if n.left is None:
                n.left = Node(element, True)
                inserted = True
            else:
                inserted, n.left = self.put(n.left, element)
        elif element > n.key:
            if n.right is None:
                n.right = Node(element, True)
                inserted = True
            else:
                inserted, n.right = self.put(n.right, element)

        if self.isRed(n.right) and self.isRed(n.left) == False:
            n = self.rotateLeft(n)
        if self.isRed(n.left) and self.isRed(n.left.left):
            n = self.rotateRight(n)
        if self.isRed(n.right) and self.isRed(n.left):
            n = self.flipColor(n)
        return inserted, n

    
    def rotateLeft(self, n):
        x = n.right
        n.right = x.left
        x.left = n
        x.color = n.color
        n.color = True
        return x

    def rotateRight(self, n):
        x = n.left
        n.left = x.right
        x.right = n
        x.color = n.color
        n.color = True
        return x

    def flipColor(self, n):
        n.color = True
        n.left.color = False
        n.right.color = False
        return n
        
    def isRed(self, n):
        if n is None:
            return False
        return n.color
    
def merge(a, b):
    i = 0 
    j = 0
    result = []
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            result.append(a[i])
            i += 1
        else:
            result.append(b[j])
            j += 1
    if i < len(a):
        for k in range(i, len(a)):
            result.append(a[k])
    else:
        for k in range(j, len(b)):
            result.append(b[k])
    return result

def mergesort(list):
    mid = len(list) // 2
    if mid == 0:
        return list
    return merge(mergesort(list[:mid]), mergesort(list[mid:]))

Use the cell below to implement the requested API by means of **balanced search tree**.

In [3]:
class BalancedSearchTreeSet(AbstractSet):
    def __init__(self):
        # ADD YOUR CODE HERE
        self.node = None
        
    def insertElement(self, element):
        inserted = False
        # ADD YOUR CODE HERE
        if element == None:
            return False
        if self.node == None:
            self.node = Node(element, True)
        inserted, self.node = self.node.insertElement(element) 
        return inserted

    def searchElement(self, element):     
        found = False
        # ADD YOUR CODE HERE
        if element == None:
            return False
        return self.node.searchElement(element) 

In [31]:
balanced = BalancedSearchTreeSet()
balanced.insertElement("R")
balanced.insertElement("L")
balanced.insertElement("W")
balanced.insertElement("F")
balanced.insertElement("P")
balanced.insertElement("T")
balanced.insertElement("Z")
balanced.insertElement("D")
balanced.insertElement("J")
balanced.insertElement("N")
balanced.insertElement("S")
balanced.insertElement("V")
balanced.insertElement("X")
balanced.insertElement("B")
balanced.insertElement("H")
balanced.insertElement("U")
balanced.insertElement("Y")
balanced.searchElement("J")

True

In [32]:
def printTree(tree):
    queue = []
    queue.append(tree.node)
    while len(queue) > 0:
        current = queue.pop(0)
        if current.color == True:
            color = "red"
        else:
            color = "black"
        print(current.key + color)
        if current.left != None:
            queue.append(current.left)
        if current.right != None:
            queue.append(current.right)

printTree(balanced)

Wred
Rred
Yblack
Lblack
Tblack
Xblack
Zblack
Fred
Pblack
Sblack
Vblack
Dblack
Jblack
Nred
Ured
Bred
Hred


Use the cell below to implement the **synthetic data generator** as part of your experimental framework.

In [14]:
import string
import random

class TestDataGenerator(AbstractTestDataGenerator):
    
    def __init__(self):
        # ADD YOUR CODE HERE
        
        
        pass           
        
    def generateData(self, size, m, n):     
        # ADD YOUR CODE HERE
        data = [""]*size
        for i in range(size):
            data[i] = ''.join(random.choices(string.ascii_letters, k = random.randint(m, n)))

        return data   

Use the cells below for the python code needed to **fully evaluate your implementations**, first on real data and subsequently on synthetic data (i.e., read data from test files / generate synthetic one, instantiate each of the 4 set implementations in turn, then thorouhgly experiment with insert/search operations and measure their performance).

In [10]:
import timeit
def real_testing(set_implementation):

    file_names = ["test1-mobydick.txt", "test2-warpeace.txt", "test3-dickens.txt"]

    for k in range(3):
        #insert:
        inserts = []
        with open(file_names[k] ,"r") as f:
            line = f.readline()
            inserts += line.split(" ")

        starting_time = timeit.default_timer()      
        current_time = timeit.default_timer()
        time_elapsed = current_time - starting_time
        print("Total insertion time for test " , (k + 1) , " is " + str(time_elapsed))
        #if(type(set_implementation) == BloomFilterSet):
            #print(set_implementation.unique)
        #search:
        searches = []
        for line in open("test-search.txt","r"):
            searches.append(line.strip())

        starting_time = timeit.default_timer()

        for i in range(len(searches)):
            set_implementation.searchElement(searches[i])

        current_time = timeit.default_timer()
        time_elapsed = current_time - starting_time
        print("Total search time for test " ,(k + 1), " is " + str(time_elapsed))

real_testing(BalancedSearchTreeSet())

Total insertion time for test  1  is 4.999999987376214e-07


AttributeError: 'NoneType' object has no attribute 'searchElement'

In [4]:
import timeit

# ADD YOUR TEST CODE HERE TO WORK ON REAL DATA
results1 = []
results2 = []
results3 = []


#for test1:
#insert:
inserts = []

with open("test1-mobydick.txt","r") as f:
    line = f.readline()
    inserts = line.split(" ")

starting_time = timeit.default_timer()    

balanced = BalancedSearchTreeSet()
for i in range(len(inserts)):
    balanced.insertElement(inserts[i])
    
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Total insertion time for test 1 is " + str(time_elapsed))

#search:
searches = []
for line in open("test-search.txt","r"):   
    searches.append(line.strip())
    
starting_time = timeit.default_timer()    

for i in range(len(searches)):
    results1.append(balanced.searchElement(searches[i]))
    
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Total search time for test 1 is " + str(time_elapsed))


#for test2:
#insert:
inserts = []

with open("test2-warpeace.txt","r") as f:
    line = f.readline()
    inserts = line.split(" ")

starting_time = timeit.default_timer()    

balanced = BalancedSearchTreeSet()
for i in range(len(inserts)):
    balanced.insertElement(inserts[i])
    
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Total insertion time for test 2 is " + str(time_elapsed))

#search:    
starting_time = timeit.default_timer()    

for i in range(len(searches)):
    results2.append(balanced.searchElement(searches[i]))
    
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Total search time for test 2 is " + str(time_elapsed))


#for test3:
#insert:
inserts = []

with open("test3-dickens.txt","r") as f:
    line = f.readline()
    inserts = line.split(" ")

starting_time = timeit.default_timer()    

balanced = BalancedSearchTreeSet()
for i in range(len(inserts)):
    balanced.insertElement(inserts[i])
    
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Total insertion time for test 3 is " + str(time_elapsed))

#search:
starting_time = timeit.default_timer()    

for i in range(len(searches)):
    results3.append(balanced.searchElement(searches[i]))
    
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Total search time for test 3 is " + str(time_elapsed))

Total insertion time for test 1 is 3.3369836999999998
Total search time for test 1 is 0.00582690000000019
Total insertion time for test 2 is 8.9287507
Total search time for test 2 is 0.003632399999998981
Total insertion time for test 3 is 92.9043198
Total search time for test 3 is 0.00449109999999564


In [28]:
import timeit

# ADD YOUR TEST CODE HERE TO WORK ON SYNTHETIC DATA

def measureTime(func, setImplementation, data):
    starting_time = timeit.default_timer()
    func(setImplementation, data)
    end_time = timeit.default_timer()
    return end_time - starting_time

def insertion(set_implementation, insert_data):
    for i in insert_data:
        set_implementation.insertElement(i)
        
def search(set_implementation, search_data):
    for i in search_data:
        set_implementation.searchElement(i)

def searchChecker(set_implementation, search_data):
    inSet = []
    for i in search_data:
        insSet.append(set_implementation.searchElement(i))
    return inSet

def extractRealData(fileName, searchFile = False):
    data = []
    for line in open(fileName, 'r'):
        if searchFile:
            data.append(line.strip())
        else:
            data += line.split(" ")
    return data

def realDataTesting(setImplementation):
    fileNames = ["test1-mobydick.txt", "test2-warpeace.txt", "test3-dickens.txt"]
    searchData = extractRealData('test-search.txt', True)
    for fileName in fileNames: 
        curSetImplementation = setImplementation()
        insertData = extractRealData(fileName)
        insertionTime = measureTime(insertion, curSetImplementation, insertData)
        searchTime = measureTime(search, curSetImplementation, searchData)
        print("For ", fileName, " insertion takes ", insertionTime, " search takes ", searchTime)

In [29]:
def generate_1():
    testing_size = [5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    result = []
    data = TestDataGenerator().generateData(max(testing_size), 5, 10)
    for i in testing_size:
        result.append(data[:i])
    return result

def synthetic_1(set_implementation):
    for i in generate_1():
        print("Time taken for insertion:", measureTime(insertion,set_implementation, i), "at size", len(i))
        print("Time taken for search:", measureTime(search,set_implementation, i), "at size", len(i))

synthetic_1(BalancedSearchTreeSet())

Time taken for insertion: 3.659999993033125e-05 at size 5
Time taken for search: 1.5699999948992627e-05 at size 5
Time taken for insertion: 5.6599999879836105e-05 at size 10
Time taken for search: 6.499999926745659e-06 at size 10
Time taken for insertion: 0.0001843999998527579 at size 50
Time taken for search: 0.000115299999833951 at size 50
Time taken for insertion: 0.0009592999999767926 at size 100
Time taken for search: 0.00024389999998675194 at size 100
Time taken for insertion: 0.028002900000046793 at size 500
Time taken for search: 0.0017995999996855971 at size 500
Time taken for insertion: 0.014938400000119145 at size 1000
Time taken for search: 0.0036774999998669955 at size 1000
Time taken for insertion: 0.11428329999989728 at size 5000
Time taken for search: 0.02628679999997985 at size 5000
Time taken for insertion: 0.20024069999999483 at size 10000
Time taken for search: 0.05633669999997437 at size 10000
Time taken for insertion: 1.2036536000000524 at size 50000
Time taken fo

In [32]:
def generate_2():
    test_data_random = TestDataGenerator().generateData(100000, 5, 10)
    test_data_ascending = mergesort(test_data_random)
    test_data_descending = test_data_ascending[::-1]
    return [test_data_ascending, test_data_descending, test_data_random]

def synthetic_2(set_implementation):
    order = ["ascending order", "descending order", "random order"]
    j = 0
    for i in generate_2():
        print("Time taken for insertion:", measureTime(insertion,set_implementation, i), "at", order[j])
        print("Time taken for search:", measureTime(search,set_implementation, i), "at", order[j])
        j += 1

synthetic_2(BalancedSearchTreeSet())

Time taken for insertion: 2.3560067999997045 at ascending order
Time taken for search: 0.6324189999995724 at ascending order
Time taken for insertion: 1.9392656999998508 at descending order
Time taken for search: 0.6435301000001346 at descending order
Time taken for insertion: 2.2285413999998127 at random order
Time taken for search: 0.8486554000000979 at random order


In [35]:
def generate_3():
    testing_size = [(3,8), (5,10), (10,15), (15,20), (20,25), (30,35), (40,45)]
    result = []
    for i in testing_size:
        result.append(TestDataGenerator().generateData(100000, i[0], i[1]))
    return result


def synthetic_3(set_implementation):
    testing_size = [(3,8), (5,10), (10,15), (15,20), (20,25), (30,35), (40,45)]
    j = 0
    for i in generate_3():
        print("Time taken for insertion:", measureTime(insertion,set_implementation, i),"at string length", testing_size[j][0], "to", testing_size[j][1])
        print("Time taken for search:", measureTime(search,set_implementation, i), "at string length", testing_size[j][0], "to", testing_size[j][1])
        j += 1

synthetic_3(BalancedSearchTreeSet())

Time taken for insertion: 3.5078051999998934 at string length 3 to 8
Time taken for search: 0.8047687999996924 at string length 3 to 8
Time taken for insertion: 4.1586630000001605 at string length 5 to 10
Time taken for search: 0.9446138000002975 at string length 5 to 10
Time taken for insertion: 3.576387500000237 at string length 10 to 15
Time taken for search: 1.0447449000002962 at string length 10 to 15
Time taken for insertion: 4.746501499999795 at string length 15 to 20
Time taken for search: 1.0926849999996193 at string length 15 to 20
Time taken for insertion: 5.155908900000213 at string length 20 to 25
Time taken for search: 1.1525492999999187 at string length 20 to 25
Time taken for insertion: 3.909030800000437 at string length 30 to 35
Time taken for search: 1.1776056999997309 at string length 30 to 35
Time taken for insertion: 5.798247100000481 at string length 40 to 45
Time taken for search: 1.2310057000004235 at string length 40 to 45


In [44]:
import timeit

# ADD YOUR TEST CODE HERE TO WORK ON SYNTHETIC DATA

#number of strings:
testing_size = [5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]

for i in testing_size:
    test_data = TestDataGenerator().generateData(i, 5, 10)

    starting_time = timeit.default_timer()

    balanced = BalancedSearchTreeSet()
    for j in range(len(test_data)):
        balanced.insertElement(test_data[j])

    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for insertion:", time_elapsed, "at size", i)
    
    starting_time = timeit.default_timer()
    for j in range(len(test_data)):
        balanced.searchElement(test_data[j])
    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for search:", time_elapsed, "at size", i)

Time taken for insertion: 0.36107500000071013 at size 5
Time taken for search: 9.699999282020144e-06 at size 5
Time taken for insertion: 4.489999992074445e-05 at size 10
Time taken for search: 8.400000297115184e-06 at size 10
Time taken for insertion: 0.00015810000149940606 at size 50
Time taken for search: 3.509999987727497e-05 at size 50
Time taken for insertion: 0.00031800000033399556 at size 100
Time taken for search: 7.509999886678997e-05 at size 100
Time taken for insertion: 0.0022647999994660495 at size 500
Time taken for search: 0.0005652999989251839 at size 500
Time taken for insertion: 0.005951499999355292 at size 1000
Time taken for search: 0.001432199998816941 at size 1000
Time taken for insertion: 0.04583949999869219 at size 5000
Time taken for search: 0.013842599999406957 at size 5000
Time taken for insertion: 0.08288830000128655 at size 10000
Time taken for search: 0.02064570000038657 at size 10000
Time taken for insertion: 0.5465089999997872 at size 50000
Time taken for

In [56]:
#order: ascending, descending, random

test_data_random = TestDataGenerator().generateData(100000, 5, 10)
test_data_ascending = mergesort(test_data_random)
test_data_descending = test_data_ascending[::-1]

#ascending:
starting_time = timeit.default_timer()

balanced = BalancedSearchTreeSet()
for j in range(len(test_data_ascending)):
    balanced.insertElement(test_data_ascending[j])

current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Time taken for insertion:", time_elapsed, "at ascending order")

starting_time = timeit.default_timer()
for j in range(len(test_data_ascending)):
    balanced.searchElement(test_data_ascending[j])
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Time taken for search:", time_elapsed, "at ascending order")

#descending:
starting_time = timeit.default_timer()

balanced = BalancedSearchTreeSet()
for j in range(len(test_data_descending)):
    balanced.insertElement(test_data_descending[j])

current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Time taken for insertion:", time_elapsed, "at descending order")

starting_time = timeit.default_timer()
for j in range(len(test_data_descending)):
    balanced.searchElement(test_data_descending[j])
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Time taken for search:", time_elapsed, "at descending order")

#random:
starting_time = timeit.default_timer()

balanced = BalancedSearchTreeSet()
for j in range(len(test_data_random)):
    balanced.insertElement(test_data_random[j])

current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Time taken for insertion:", time_elapsed, "at random order")

starting_time = timeit.default_timer()
for j in range(len(test_data_random)):
    balanced.searchElement(test_data_random[j])
current_time = timeit.default_timer()
time_elapsed = current_time - starting_time
print("Time taken for search:", time_elapsed, "at random order")

Time taken for insertion: 1.133578099999795 at ascending order
Time taken for search: 0.2619758000000729 at ascending order
Time taken for insertion: 1.5656036999989738 at descending order
Time taken for search: 0.23833829999966838 at descending order
Time taken for insertion: 1.2860968000004505 at random order
Time taken for search: 0.31537269999898854 at random order


In [60]:
#length of string
testing_size = [(3,8), (5,10), (10,15), (15,20), (20,25), (30,35), (40,45)]
for i in testing_size:
    test_data = TestDataGenerator().generateData(100000, i[0], i[1])

    starting_time = timeit.default_timer()

    balanced = BalancedSearchTreeSet()
    for j in range(len(test_data)):
        balanced.insertElement(test_data[j])

    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for insertion:", time_elapsed, "at string length", i[0], "to", i[1])
    
    starting_time = timeit.default_timer()
    for j in range(len(test_data)):
        balanced.searchElement(test_data[j])
    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for search:", time_elapsed,"at string length", i[0], "to", i[1])

Time taken for insertion: 1.4243232999997417 at string length 3 to 8
Time taken for search: 0.3729217999989487 at string length 3 to 8
Time taken for insertion: 1.391525000000911 at string length 5 to 10
Time taken for search: 0.42413289999967674 at string length 5 to 10
Time taken for insertion: 1.4904552000007243 at string length 10 to 15
Time taken for search: 0.36325530000067374 at string length 10 to 15
Time taken for insertion: 1.4334849000006216 at string length 15 to 20
Time taken for search: 0.3412225000010949 at string length 15 to 20
Time taken for insertion: 1.3633810999999696 at string length 20 to 25
Time taken for search: 0.3942956000009872 at string length 20 to 25
Time taken for insertion: 1.401602500000081 at string length 30 to 35
Time taken for search: 0.3967388999990362 at string length 30 to 35
Time taken for insertion: 1.4227866000001086 at string length 40 to 45
Time taken for search: 0.3346180999997159 at string length 40 to 45


In [51]:
def generate_4():
    duplicate_number = [0, 10000, 20000, 30000, 40000, 50000]
    non_duplicate_number = [100000, 80000, 60000, 40000, 20000, 0]
    data = TestDataGenerator().generateData(100000, 5, 10)
    dulplicate_data = TestDataGenerator().generateData(50000, 5, 10)
    result = []
    for i in range(len(duplicate_number)):
        combined = data[:non_duplicate_number[i]] + dulplicate_data[:duplicate_number[i]]* 2
        random.shuffle(combined)
        result.append(combined)
    return result

def synthetic_4(set_implementation):
    non_duplicate_number = [100000, 80000, 60000, 40000, 20000, 0]
    j = 0
    for i in generate_4():
        print("Time taken for insertion:", measureTime(insertion,set_implementation, i),"at duplicate proportion", (100000 - non_duplicate_number[j])/100000)
        print("Time taken for search:", measureTime(search,set_implementation, i), "at duplicate proportion", (100000 - non_duplicate_number[j])/100000)
        j += 1
        
synthetic_4(BalancedSearchTreeSet())

Time taken for insertion: 1.6690393999888329 at duplicate proportion 0.0
Time taken for search: 0.40329910001310054 at duplicate proportion 0.0
Time taken for insertion: 1.3959518000046955 at duplicate proportion 0.2
Time taken for search: 0.41740459999709856 at duplicate proportion 0.2
Time taken for insertion: 1.1383894000027794 at duplicate proportion 0.4
Time taken for search: 0.39732329999969807 at duplicate proportion 0.4
Time taken for insertion: 1.2495500999939395 at duplicate proportion 0.6
Time taken for search: 0.4239730000117561 at duplicate proportion 0.6
Time taken for insertion: 1.3118593000108376 at duplicate proportion 0.8
Time taken for search: 0.4367964999983087 at duplicate proportion 0.8
Time taken for insertion: 1.305228299999726 at duplicate proportion 1.0
Time taken for search: 0.42186049999145325 at duplicate proportion 1.0


In [49]:
#duplicates
duplicate_number = [0, 10000, 20000, 30000, 40000, 50000]
non_duplicate_number = [100000, 80000, 60000, 40000, 20000, 0]
for i in range(len(duplicate_number)):
    test_non_duplicate = TestDataGenerator().generateData(non_duplicate_number[i], 5, 10)
    test_duplicate = TestDataGenerator().generateData(duplicate_number[i], 5, 10)
    test_data = test_non_duplicate + test_duplicate * 2
    
    starting_time = timeit.default_timer()

    balanced = BalancedSearchTreeSet()
    for j in range(len(test_data)):
        balanced.insertElement(test_data[j])

    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for insertion:", time_elapsed, "at duplicate proportion", (100000 - non_duplicate_number[i])/100000)
    
    starting_time = timeit.default_timer()
    for j in range(len(test_data)):
        balanced.searchElement(test_data[j])
    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for search:", time_elapsed, "at duplicate proportion", (100000 - non_duplicate_number[i])/100000)

Time taken for insertion: 1.4031410000025062 at duplicate proportion 0.0
Time taken for search: 0.4504865000053542 at duplicate proportion 0.0
Time taken for insertion: 1.3964582000044174 at duplicate proportion 0.2
Time taken for search: 0.43734910000057425 at duplicate proportion 0.2
Time taken for insertion: 1.3679262999939965 at duplicate proportion 0.4
Time taken for search: 0.5039976999978535 at duplicate proportion 0.4
Time taken for insertion: 1.2813783999881707 at duplicate proportion 0.6
Time taken for search: 0.3498793999897316 at duplicate proportion 0.6
Time taken for insertion: 1.107194600001094 at duplicate proportion 0.8
Time taken for search: 0.31726500000513624 at duplicate proportion 0.8
Time taken for insertion: 1.0589361999882385 at duplicate proportion 1.0
Time taken for search: 0.2871327999891946 at duplicate proportion 1.0


In [58]:
def generate_5():
    empty_number = [0, 500, 1000, 5000, 10000, 15000, 20000, 30000, 40000]
    data = TestDataGenerator().generateData(100000, 5, 10)
    result = []
    for i in range(len(empty_number)):
        combined = data[:100000-empty_number[i]] + [None] * empty_number[i]
        random.shuffle(combined)
        result.append(combined)
    return result

def synthetic_5(set_implementation):
    empty_number = [0, 500, 1000, 5000, 10000, 15000, 20000, 30000, 40000]
    j = 0
    for i in generate_5():
        print("Time taken for insertion:", measureTime(insertion,set_implementation, i),"at empty number", empty_number[j])
        print("Time taken for search:", measureTime(search,set_implementation, i), "at empty number", empty_number[j])
        j += 1
        
synthetic_5(BalancedSearchTreeSet())

Time taken for insertion: 1.7265172000043094 at empty number 0
Time taken for search: 0.3771192999993218 at empty number 0
Time taken for insertion: 1.1021353999967687 at empty number 500
Time taken for search: 0.3409304999950109 at empty number 500
Time taken for insertion: 1.1821151000040118 at empty number 1000
Time taken for search: 0.36767130000225734 at empty number 1000
Time taken for insertion: 1.1545545999979367 at empty number 5000
Time taken for search: 0.357310299994424 at empty number 5000
Time taken for insertion: 1.0581014000053983 at empty number 10000
Time taken for search: 0.33213909999176394 at empty number 10000
Time taken for insertion: 0.9598365000128979 at empty number 15000
Time taken for search: 0.3368396000005305 at empty number 15000
Time taken for insertion: 0.9509091999934753 at empty number 20000
Time taken for search: 0.2763939999858849 at empty number 20000
Time taken for insertion: 0.8259773000027053 at empty number 30000
Time taken for search: 0.279198

In [83]:
#None
empty_number = [0, 500, 1000, 5000, 10000, 15000, 20000, 30000, 40000]

for i in empty_number:
    test_valid = TestDataGenerator().generateData(100000 - i, 5, 10)
    test_none = [None] * i
    test_data = test_valid + test_none
    
    starting_time = timeit.default_timer()

    balanced = BalancedSearchTreeSet()
    for j in range(len(test_data)):
        balanced.insertElement(test_data[j])

    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for insertion:", time_elapsed, "at empty number", i)
    
    starting_time = timeit.default_timer()
    for j in range(len(test_data)):
        balanced.searchElement(test_data[j])
    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for search:", time_elapsed, "at empty number", i)

Time taken for insertion: 1.3876940999998624 at empty number 0
Time taken for search: 0.36697800000001735 at empty number 0
Time taken for insertion: 1.3590800999991188 at empty number 500
Time taken for search: 0.3874346000011428 at empty number 500
Time taken for insertion: 1.3350069999996776 at empty number 1000
Time taken for search: 0.36376449999988836 at empty number 1000
Time taken for insertion: 1.421497299999828 at empty number 5000
Time taken for search: 0.36156839999966905 at empty number 5000
Time taken for insertion: 1.3374588999995467 at empty number 10000
Time taken for search: 0.29519579999941925 at empty number 10000
Time taken for insertion: 1.2921132999999827 at empty number 15000
Time taken for search: 0.3726721000002726 at empty number 15000
Time taken for insertion: 1.1006239999987883 at empty number 20000
Time taken for search: 0.3415906000009272 at empty number 20000
Time taken for insertion: 1.0257333000008657 at empty number 30000
Time taken for search: 0.2501

In [66]:
def generate_6():
    absent_number = [0,20000,40000,60000,80000,100000]
    insert_data = TestDataGenerator().generateData(100000, 5, 10)
    search_data = TestDataGenerator().generateData(100000, 5, 10)
    result = []
    for i in range(len(absent_number)):
        combined = insert_data[:100000-absent_number[i]] + search_data[:absent_number[i]]
        random.shuffle(combined)
        result.append(combined)
    return insert_data, result

def synthetic_6(set_implementation):
    absent_number = [0,20000,40000,60000,80000,100000]
    j = 0
    insert, searches = generate_6()
    insertion(set_implementation, insert)
    for i in searches:
        print("Time taken for search:", measureTime(search,set_implementation, i), "at searching absent number", absent_number[j])
        j += 1
        
synthetic_6(BalancedSearchTreeSet())

Time taken for search: 0.3985714000009466 at searching absent number 0
Time taken for search: 0.35557550001249183 at searching absent number 20000
Time taken for search: 0.42080089999944903 at searching absent number 40000
Time taken for search: 0.40885409999464173 at searching absent number 60000
Time taken for search: 0.4885089000017615 at searching absent number 80000
Time taken for search: 0.47077129999524914 at searching absent number 100000


In [87]:
#seaching values not in the set
absent_number = [0,20000,40000,60000,80000,100000]
for i in absent_number:
    test_data = TestDataGenerator().generateData(100000, 5, 10)
    
    starting_time = timeit.default_timer()

    balanced = BalancedSearchTreeSet()
    for j in range(len(test_data)):
        balanced.insertElement(test_data[j])

    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for insertion:", time_elapsed, "at searching absent number", i)
    
    search_present = test_data[:100000 - i]
    search_absent = TestDataGenerator().generateData(i, 5, 10)
    search_data = search_present + search_absent
    
    starting_time = timeit.default_timer()
    for j in range(len(search_data)):
        balanced.searchElement(search_data[j])
    current_time = timeit.default_timer()
    time_elapsed = current_time - starting_time
    print("Time taken for search:", time_elapsed, "at searching absent number", i)

Time taken for insertion: 1.4431941999973787 at searching absent number 0
Time taken for search: 0.3878220999977202 at searching absent number 0
Time taken for insertion: 1.4022021999990102 at searching absent number 20000
Time taken for search: 0.39667449999979 at searching absent number 20000
Time taken for insertion: 1.5166523000007146 at searching absent number 40000
Time taken for search: 0.4367552000003343 at searching absent number 40000
Time taken for insertion: 1.5747274000023026 at searching absent number 60000
Time taken for search: 0.4541940999988583 at searching absent number 60000
Time taken for insertion: 1.4379979000004823 at searching absent number 80000
Time taken for search: 0.4271982999998727 at searching absent number 80000
Time taken for insertion: 1.4182080999999016 at searching absent number 100000
Time taken for search: 0.47585520000211545 at searching absent number 100000
