# Project: Build an Encrypted Database

In this project, we will build an encrypted database.

In [1]:
import string
import torch as th

### String approach

In [2]:
char2index = {}
index2char = {}

In [3]:
# supported characters
' ' + string.ascii_lowercase + '0123456789' + string.punctuation

' abcdefghijklmnopqrstuvwxyz0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
# set characters
for n, char in enumerate(' ' + string.ascii_lowercase + '0123456789' + string.punctuation):
    char2index[char] = n
    index2char[n] = char

In [5]:
len(index2char)

69

In [6]:
str_input = 'Hello'
max_len = 8
# convert to lower case and set to max len
str_input = str_input[:max_len].lower()
# if str is too short, lets padding
if len(str_input) < max_len:
    str_input = str_input + '.' * (max_len - len(str_input))
# create the tensors now
values = list()
for char in str_input:
    values.append(char2index[char])
# convert values to long tensor
th.tensor(values).long()

tensor([ 8,  5, 12, 12, 15, 50, 50, 50])

In [7]:
values

[8, 5, 12, 12, 15, 50, 50, 50]

In [8]:
str_input

'hello...'

In [9]:
# Putting into a function
def string2values(str_input, max_len = 8):
    # convert to lower case and set to max len
    str_input = str_input[:max_len].lower()
    # if str is too short, lets padding
    if len(str_input) < max_len:
        str_input = str_input + '.' * (max_len - len(str_input))
    # create the tensors now
    values = list()
    for char in str_input:
        values.append(char2index[char])
    # convert values to long tensor
    return th.tensor(values).long()

In [10]:
string2values('Hello, there')

tensor([ 8,  5, 12, 12, 15, 48,  0, 20])

In [11]:
string2values('Hello, there, this is too large value')

tensor([ 8,  5, 12, 12, 15, 48,  0, 20])

### One Hot Representation

In [12]:
def one_hot(index, lenght):
    # create a zero vector of lenght <lenght>
    one_vector = th.zeros(lenght).long()
    # set the index value to 1
    one_vector[index] = 1
    return one_vector

In [13]:
one_hot(3, 5)

tensor([0, 0, 0, 1, 0])

In [14]:
# returns a one hot vector for the letter a
one_hot(char2index['a'], len(index2char))

tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
# Putting into a function
def string2one_hot(str_input, max_len = 8):
    # convert to lower case and set to max len
    str_input = str_input[:max_len].lower()
    # if str is too short, lets padding
    if len(str_input) < max_len:
        str_input = str_input + '.' * (max_len - len(str_input))
    # create the tensors now
    values = list()
    for char in str_input:
        one_vector = one_hot(char2index[char], len(index2char)).unsqueeze(0) # add extra dimmension for easy concatenation later
        values.append(one_vector)
    # concatenate the vectors along the first dimmension
    return th.cat(values, dim = 0)

In [16]:
string2one_hot('Hello')

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 

In [17]:
# 8 rows, representing 8 characters
# 69 colums, representing the 69 values that each character could take on
string2one_hot('Hello').shape

torch.Size([8, 69])

In [18]:
# check if the values are the same
str_a = string2one_hot('Hello')
str_b = string2one_hot('Hello')

In [19]:
(str_a * str_b).sum() # get the total len

tensor(8)

In [20]:
# if they are different, the result will be different too
str_a = string2one_hot('Helloooo')
str_b = string2one_hot('Hellooo1')

In [21]:
(str_a * str_b).sum() # get the total len, seven letter in common

tensor(7)

In [22]:
# but if they are different
str_a = string2one_hot(" .......") # dots are common values
str_b = string2one_hot("Hello...")
(str_a * str_b).sum() # get the total len, three representations in common?

tensor(3)

In [23]:
for k in range(7):
    if (str_a[k] * str_b[k]).sum() > 0:
        print('a equal to b in {}'.format(k))

a equal to b in 5
a equal to b in 6


This is equal due to the added dots from
```{python}
str_input = str_input + '.' * (max_len - len(str_input))
```
Which add common characters (.). Therefore, the result is 3

In [24]:
# different values
str_a = string2one_hot('Helloooo')
str_b = string2one_hot('Helloox1')
(str_a * str_b).sum()

tensor(6)

In [25]:
# different values
str_a = string2one_hot('Helloooo')
str_b = string2one_hot('Hezzzzz1')
(str_a * str_b).sum() # only H and e are equal, they do not have a one in the same place.

tensor(2)

In [26]:
# other way, sum along the zero dimmension
# different values
str_a = string2one_hot('Helloooo')
str_b = string2one_hot('Hezzzzz1')
(str_a * str_b).sum() # only H and e are equal, they do not have a one in the same place.

tensor(2)

In [27]:
# which character overlap among the 8 dimmension
(str_a * str_b).sum(dim = 1)

tensor([1, 1, 0, 0, 0, 0, 0, 0])

In [28]:
vect = (str_a * str_b).sum(dim = 1)
vect

tensor([1, 1, 0, 0, 0, 0, 0, 0])

In [29]:
# check if the others also matched (completly)
# so the first one matchs
x = vect[0]
x

tensor(1)

In [30]:
for k in range(vect.shape[0] - 1): # skip the last element, since we are using x as the first
    x *= vect[k + 1] # select the next element, since we have x as the first

In [31]:
x

tensor(0)

In [32]:
key_match = x
key_match

tensor(0)

In [33]:
# putting the concepts togheter
str_a = string2one_hot('Hello')
str_b = string2one_hot('Hello')

vect = (str_a * str_b).sum(dim = 1)

x = vect[0]
for k in range(vect.shape[0] - 1): # skip the last element, since we are using x as the first
    x *= vect[k + 1] # select the next element, since we have x as the first
# boolean value, one bit: True, all the elments are equal | False: only some of none of the elements are equal  
key_match = x
key_match

tensor(1)

In [34]:
# Now, how to save elements in the DB
# We need keys and values, stored as numeric representations
keys = list()
values = list()
# use the hot encode for represent keys
keys.append(string2one_hot('key'))
# use the string approach to represent values
values.append(string2values('value'))

In [35]:
keys

[tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
values

[tensor([22,  1, 12, 21,  5, 50, 50, 50])]

In [37]:
# use the hot encode for represent keys
keys.append(string2one_hot('key1'))
keys.append(string2one_hot('key2'))
# use the string approach to represent values
values.append(string2values('value1'))
values.append(string2values('value2'))

In [38]:
values

[tensor([22,  1, 12, 21,  5, 50, 50, 50]),
 tensor([22,  1, 12, 21,  5, 28, 50, 50]),
 tensor([22,  1, 12, 21,  5, 29, 50, 50])]

In [39]:
# Now, we can add string comparison from before
def string_comparison(str_a, str_b):
    vect = (str_a * str_b).sum(dim = 1)

    x = vect[0]
    for k in range(vect.shape[0] - 1): # skip the last element, since we are using x as the first
        x *= vect[k + 1] # select the next element, since we have x as the first
    # boolean value, one bit: True, all the elments are equal | False: only some of none of the elements are equal  
    key_match = x
    return key_match

In [40]:
# Now, to query, we need the key values
query_1 = "key1"

In [41]:
# Now, convert key1 into a one hot representation
query_one_hot = string2one_hot(query_1)

In [42]:
query_one_hot

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [43]:
# Now, to perform the query, we need to know where the key matches with other keys
# check for all the keys
key_matches = list()
for key in keys:
    key_match = string_comparison(query_one_hot, key)
    key_matches.append(key_match)

In [44]:
# This matched the second key in the keys, this is correct
key_matches

[tensor(0), tensor(1), tensor(0)]

In [45]:
# now for the key0
query_1 = "key"
query_one_hot = string2one_hot(query_1)

key_matches = list()
for key in keys:
    key_match = string_comparison(query_one_hot, key)
    key_matches.append(key_match)

In [46]:
key_matches # ok

[tensor(1), tensor(0), tensor(0)]

In [47]:
# Then, we can use the results from key_matches to return the values we are looking for from the values dict
values

[tensor([22,  1, 12, 21,  5, 50, 50, 50]),
 tensor([22,  1, 12, 21,  5, 28, 50, 50]),
 tensor([22,  1, 12, 21,  5, 29, 50, 50])]

In [48]:
results = key_matches[0] * values[0]

for k in range(len(values) - 1):
    results += values[k + 1] * key_matches[k + 1]
results

tensor([22,  1, 12, 21,  5, 50, 50, 50])

In [49]:
values

[tensor([22,  1, 12, 21,  5, 50, 50, 50]),
 tensor([22,  1, 12, 21,  5, 28, 50, 50]),
 tensor([22,  1, 12, 21,  5, 29, 50, 50])]

In [50]:
# for other values
query_1 = "key2"
query_one_hot = string2one_hot(query_1)

key_matches = list()
for key in keys:
    key_match = string_comparison(query_one_hot, key)
    key_matches.append(key_match)
    
results = key_matches[0] * values[0]

for k in range(len(values) - 1):
    results += values[k + 1] * key_matches[k + 1]
results

tensor([22,  1, 12, 21,  5, 29, 50, 50])

In [51]:
values[2]

tensor([22,  1, 12, 21,  5, 29, 50, 50])

In [52]:
key_matches[0] * values[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0])

In [53]:
key_matches[1] * values[1]

tensor([0, 0, 0, 0, 0, 0, 0, 0])

In [54]:
key_matches[2] * values[2]

tensor([22,  1, 12, 21,  5, 29, 50, 50])

In [55]:
# The values are been adding to the result variable, when they do not match, the result is zero. However
# when they match, we, get [1] * [vector values], which return the value for the key

In [56]:
# Now, what about using a funcion to do the inverse of string2values
def values2string(input_values):
    s = ''
    for value in input_values:
        s += index2char[int(value)]
    return s

In [57]:
results

tensor([22,  1, 12, 21,  5, 29, 50, 50])

In [58]:
values2string(results)

'value2..'

In [59]:
values2string(results).replace('.', '')

'value2'

In [60]:
# putting the concepts togheter
query_key = 'key1'
def query(query_key):
    # for other values
    query_one_hot = string2one_hot(query_key)

    key_matches = list()
    for key in keys:
        key_match = string_comparison(query_one_hot, key)
        key_matches.append(key_match)

    results = key_matches[0] * values[0]

    for k in range(len(values) - 1):
        results += values[k + 1] * key_matches[k + 1]
    # recover the string, remove the dots if padding
    results = values2string(results).replace('.', '')
    return results

In [61]:
query_key = 'key1'
query(query_key)

'value1'

In [62]:
query_key = 'key0' # this key does not exist
query(query_key)

'        '

In [63]:
query_key = 'key'
query(query_key)

'value'

In [64]:
query_key = 'key2'
query(query_key)

'value2'

This is the first EncryptedDB prototype.

In [65]:
# Now, lets make a class to order the functions
class EncryptedDB():
    def __init__(self, max_key_len = 8, max_value_len = 8):
        self.max_key = max_key_len
        self.max_value = max_value_len
        
        self.keys = list()
        self.values = list()
        
        self.keys.append(string2one_hot('key0'))
        self.values.append(string2values('value0'))
        
        self.keys.append(string2one_hot('key1'))
        self.values.append(string2values('value1'))
        
        self.keys.append(string2one_hot('key2'))
        self.values.append(string2values('value2'))
        
    def query(self, query_key):
        # for other values
        query_one_hot = string2one_hot(query_key)

        key_matches = list()
        for key in self.keys:
            key_match = string_comparison(query_one_hot, key)
            key_matches.append(key_match)

        results = key_matches[0] * self.values[0]

        for k in range(len(self.values) - 1):
            results += self.values[k + 1] * key_matches[k + 1]
        # recover the string, remove the dots if padding
        results = values2string(results).replace('.', '')
        return results

In [66]:
db = EncryptedDB()

In [67]:
db.query('key0')

'value0'

In [68]:
db.query('key1')

'value1'

In [69]:
db.query('key2')

'value2'

In [70]:
# Now, lets add one more functionality to our class
class EncryptedDB():
    def __init__(self, max_key_len = 8, max_value_len = 8):
        self.max_key = max_key_len
        self.max_value = max_value_len
        
        self.keys = list()
        self.values = list()
        
    def add(self, key, value):
        # check for duplicate keys
        for k in self.keys:
            assert (string2one_hot(key) * k).sum() < self.max_key, 'Key: {} is already in the BD'.format(key)
        self.keys.append(string2one_hot(key))
        self.values.append(string2values(value))
        
    def query(self, query_key):
        # for other values
        query_one_hot = string2one_hot(query_key)

        key_matches = list()
        for key in self.keys:
            key_match = string_comparison(query_one_hot, key)
            key_matches.append(key_match)

        results = key_matches[0] * self.values[0]

        for k in range(len(self.values) - 1):
            results += self.values[k + 1] * key_matches[k + 1]
        # recover the string, remove the dots if padding
        results = values2string(results).replace('.', '')
        return results

In [71]:
# A boolean DB
db = EncryptedDB()

In [72]:
db.add('key0', 'value0')
db.add('key1', 'value1')
db.add('key2', 'value2')
db.add('key3', 'value3')
db.add('key4', 'value4')

In [73]:
db.add('key5', 'value5')

In [74]:
# query
db.query('key0')

'value0'

In [75]:
db.query('key1')

'value1'

In [76]:
db.query('key2')

'value2'

In [77]:
db.query('key3')

'value3'

In [78]:
db.query('key4')

'value4'

In [79]:
db.query('key5')

'value5'

In [80]:
# Now, we can add string comparison from before
def string_comparison(str_a, str_b):
    vect = (str_a * str_b).sum(dim = 1)

    x = vect[0]
    for k in range(vect.shape[0] - 1): # skip the last element, since we are using x as the first
        # For the Encrypted DB, this line must be changed from x *= vect[k + 1] to x = x * vect[k + 1]
        # Otherwhise we will end up with a set of eight big tensors, which are not in the dictionary.
        x = x * vect[k + 1] # select the next element, since we have x as the first
    # boolean value, one bit: True, all the elments are equal | False: only some of none of the elements are equal  
    key_match = x
    return key_match

In [81]:
# However, this is not an encrypted DB. So lets make one
class EncryptedDB():
    # adding owners, which are workers
    def __init__(self, *owners, max_key_len = 8, max_value_len = 8):
        self.max_key = max_key_len
        self.max_value = max_value_len
        self.owners = owners
        self.keys = list()
        self.values = list()
        
    def add(self, key, value):
        key = string2one_hot(key)
        key = key.share(*self.owners)
        self.keys.append(key)
        
        value = string2values(value)
        value = value.share(*self.owners)
        self.values.append(value)
        
    def query(self, query_key):
        # for other values
        query_one_hot = string2one_hot(query_key)
        # share query
        query_one_hot = query_one_hot.share(*self.owners)
        key_matches = list()
        for key in self.keys:
            key_match = string_comparison(query_one_hot, key)
            key_matches.append(key_match)
        
        results = key_matches[0] * self.values[0]

        for k in range(len(self.values) - 1):
            results += self.values[k + 1] * key_matches[k + 1]
        # decrypt result
        results = results.get()
        
        print(results)
        # recover the string, remove the dots if padding
        results = values2string(results).replace('.', '')
        return results

In [86]:
import syft as sy
hook = sy.TorchHook(th)

In [None]:
# create workers
worker_1 = sy.VirtualWorker(hook = hook, id = 'worker_1').add_worker(sy.local_worker)
worker_2 = sy.VirtualWorker(hook = hook, id = 'worker_2').add_worker(sy.local_worker)
secure_worker = sy.VirtualWorker(hook = hook, id = 'secure_worker').add_worker(sy.local_worker)

In [88]:
db = EncryptedDB(worker_1, worker_2, secure_worker)

In [89]:
print(db)

<__main__.EncryptedDB object at 0x7f659813a0b8>


In [90]:
# adding entries
db.add('key0', 'value0')
db.add('key1', 'value1')
db.add('key2', 'value2')
db.add('key3', 'value3')
db.add('key4', 'value4')

In [91]:
# query
db.query('key1')

tensor([22,  1, 12, 21,  5, 28, 50, 50])


'value1'

In [92]:
db.query('key2')

tensor([22,  1, 12, 21,  5, 29, 50, 50])


'value2'

# Conclusion

In this project we implemented an encrypted database using an encryption mechanism. This mechanism implement an encoded and decoded process. This allowed us to represent the data (encoded) and to respond with the real value (decoded) when a query was perform. To encode the data, we apply two encoded process, one for the keys and the other for the values. In the keys, we apply one-hot representation. This will turn on a single value in the matrix, which represent the keys. Also, the values are encoded using a tensor representation, where each value correspond to a data dictionary representation. Finally, we used syft to add an encryption layer to our encoder-decoder representation. This allowed us to send the data to diverse workers, where each worker have their data encrypted. Then, we can query over the real values of the data base without revealing the values in the process.