## Hashing

In [5]:
## Define some function useful for testing
import random

def get_random_array(n, b=50):
    """Return a list of ``n`` random integers, each drawn from ``0..b`` inclusive."""
    values = []
    for _ in range(n):
        values.append(random.randint(0, b))
    return values

---

### Open Addressing with linear probing

[Open addressing](https://en.wikipedia.org/wiki/Open_addressing) is a collision resolution technique used for handling collisions in hashing. 

All the items are stored in a table of size $\alpha n$, where $n$ is the number of keys and $\alpha > 1$ is a constant, so that the load factor $n / (\alpha n) = 1/\alpha$ stays below 1.

Initially, the table contains only a special value ```None``` which says that the entry is empty. Another 
special value, say character ```'D'```, is used to mark an entry that contained a key that has been deleted.

A hash function $h()$ is used to specify the order of entries to probe for a key to be inserted/searched/deleted. 
We start by probing $h(k)$ and, with linear probing, the sequence of probes $S(k)$ is $h(k), h(k)+1, h(k)+2, \ldots$ , modulo $\alpha n$.

- **Insert** adds the key in the first free slot (```None``` or ```'D'```) found while probing the positions in $S(k)$.
- **Lookup** is performed by checking positions in $S(k)$ until we find either the key or ```None```.
- **Delete** is performed by first searching for the key and then replacing it with ```'D'```. Why don't we use ```None``` instead? 


![alt text](LinearProbing.jpg "Example")

### Exercise: Open Addressing with linear probing
Complete the implementation below by implementing ```Lookup``` and ```Delete```.


**Optional:** Try to implement quadratic probing. This is the technique employed by Python's set and dictionary.  

In [6]:
## Your implementation goes here

#OPEN ADDRESSING with linear probing uses the following hash function:
#h(k,i) = ((h'(k) + i) % m)

class linear_probing_set:
    """Set of integer keys stored in an open-addressing table with linear probing.

    Every slot of ``self.T`` holds one of:
      * ``None`` - the slot has never been used;
      * ``'D'``  - the slot held a key that was deleted (a tombstone);
      * a key.

    The probe sequence of a key ``k`` is ``h(k), h(k)+1, h(k)+2, ...`` taken
    modulo the table size, where ``h`` is the method ``hash``.
    """

    def __init__(self, size):
        """Create an empty table with ``size`` slots and a random hash function."""
        self.T = [None]*size
        self.prime = 993319
        # a, b and prime are drawn once and must never change afterwards:
        # changing them would move the probe sequence of every stored key.
        self.a = random.randint(2, self.prime-1)
        self.b = random.randint(2, self.prime-1)
        self.n_keys = 0  # number of keys currently stored

    def _find(self, key):
        """Return the index of the slot holding ``key``, or -1 if it is absent.

        The scan walks the probe sequence, steps over tombstones and stops at
        the first ``None``. It is capped at one full sweep so that it also
        terminates on a table that has no ``None`` slot left.
        """
        size = len(self.T)
        h = self.hash(key)
        for _ in range(size):
            slot = self.T[h]
            if slot is None:
                return -1
            if slot == key:
                return h
            h += 1
            if h == size:  # wrap around to the first slot
                h = 0
        return -1

    def insert(self, key):
        """Add ``key`` to the set; do nothing if it is already present.

        Raises:
            RuntimeError: if ``key`` is new and every slot already holds a key.
        """
        if self.lookup(key):
            return
        # Tombstones are reusable, so the table is full only when the number
        # of live keys reaches the number of slots.
        if self.n_keys >= len(self.T):
            raise RuntimeError('Full Table')
        # lookup() had to reach the first None; insertion can stop earlier,
        # at the first slot that is either None or a tombstone.
        h = self.hash(key)
        while self.T[h] is not None and self.T[h] != 'D':
            h += 1
            if h == len(self.T):
                h = 0
        self.T[h] = key
        self.n_keys += 1

    def lookup(self, key):
        """Return True (not the key itself) if ``key`` is in the set, else False."""
        return self._find(key) != -1

    def delete(self, key):
        """Remove ``key`` from the set.

        The slot is overwritten with the tombstone ``'D'`` rather than ``None``
        so that lookups of keys stored further along the same probe sequence
        do not stop early.

        Returns:
            True if the key was present and has been removed, False otherwise.
        """
        pos = self._find(key)
        if pos == -1:
            return False
        self.T[pos] = 'D'
        self.n_keys -= 1
        return True

    def hash(self, key):
        """Map an integer ``key`` to its first probe position in the table."""
        return ((self.a*key + self.b) % self.prime) % len(self.T)

    def len(self):
        """Return the number of keys currently stored."""
        return self.n_keys

In [7]:
## Test your implementation
# Cross-check linear_probing_set against Python's built-in set on random data.

n = 10000

a = get_random_array(n, n)

queries = get_random_array(n, n)

lp_set = linear_probing_set(2*n)
std_set = set()

for key in a:
    lp_set.insert(key)
    std_set.add(key)

assert len(std_set) == lp_set.len(), "Fail len!"

for key in a:
    assert lp_set.lookup(key) == True, "Lookup fail a"

for key in queries:
    assert lp_set.lookup(key) == (key in std_set), "Lookup fail queries"

for key in a[:300]:
    lp_set.delete(key)
    # `a` may contain duplicates, so the key may already have been removed;
    # discard() ignores that case without hiding unrelated errors.
    std_set.discard(key)

    assert lp_set.lookup(key) == (key in std_set), "Lookup fail delete"

# Deleting must keep the key counter in sync as well.
assert len(std_set) == lp_set.len(), "Fail len after delete!"

If we run out of space in our table, we double its size (create a new table with twice the number of entries) and move the elements of our set into the new table. This is done by scanning the old table and inserting each key into the new one.
By doubling the space, we ensure that we don't spend too much time rebuilding the hash table, because a rebuild happens only each time the number of elements in the table doubles.
This approach is used in any implementation of dynamic arrays in which we append at the end.

In [14]:
%timeit for key in queries: lp_set.lookup(key)
    
%timeit for key in queries: key in std_set

31.5 ms ± 3.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
3.5 ms ± 586 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


----
### Hashing with Chains
Instead of just storing the elements in the slots in the table $T$, let every slot be a list which contains all the elements which are in the table and map to that slot. Our operations now become:

- `Insert` $(k)$: hash $k$ to an index $i$ in the table and append $k$ to the list stored in slot $i$. You may want to check if $k$ is already in the set first.
- `Lookup` $(k)$: search for $k$ by iterating through the whole list stored in its slot.
- `Delete` $(k)$: search for $k$ and then remove it from the list.

Lookup and Delete take $O(s)$ time where $s$ is the size of the list. We define $\alpha = \frac{n}{m}$ as the **load factor**. If we assume simple uniform hashing, then each element has equal probability to go into any slot. So after $n$ independent elements have been inserted we have an expected length of $\frac{n}{m} = \alpha$ for each chain by linearity of expectation. So the running time of all the above operations is the time to hash plus the time to scan the chain, which is $O(1 + \alpha)$.

![alt text](Chaining.gif "Example")

### Exercise: Hashing with Chains
Complete the implementation below by implementing ```Lookup``` and ```Delete```.

In [17]:
# Scratch check: list.pop(0) removes the first element of the list.
l = [1,2,3,4]
l.pop(0)
l

[2, 3, 4]

In [1]:
## Your implementation goes here

class chaining_set:
    """Set of integer keys stored in a hash table that resolves collisions by chaining.

    ``self.T`` has one Python list (the chain) per slot; a key lives in the
    chain of the slot returned by ``hash``.
    """

    def __init__(self, size):
        """Create ``size`` empty chains and draw a random hash function."""
        # Every slot needs its own list. ``[[]] * size`` would not do: it
        # repeats a reference to one single list, so all slots would share it.
        self.T = [[] for _ in range(size)]

        self.prime = 993319
        self.a = random.randint(2, self.prime-1)
        self.b = random.randint(2, self.prime-1)
        self.n_keys = 0

    def insert(self, key):
        """Append ``key`` to the chain of its slot unless it is already stored."""
        if not self.lookup(key):
            self.T[self.hash(key)].append(key)
            self.n_keys += 1

    def lookup(self, key):
        """Return True if ``key`` is in the set, False otherwise.

        The chain of the key's slot is scanned front to back.
        """
        chain = self.T[self.hash(key)]
        return any(stored == key for stored in chain)

    def delete(self, key):
        """Remove ``key`` if present; do nothing otherwise.

        The last element of the chain is moved into the freed position, so the
        order of the remaining elements in the chain may change.
        """
        chain = self.T[self.hash(key)]
        for pos, stored in enumerate(chain):
            if stored == key:
                chain[pos] = chain[-1]
                chain.pop()
                self.n_keys -= 1
                break

    def hash(self, key):
        """Map an integer ``key`` to a slot index of the table."""
        mixed = (self.a*key + self.b) % self.prime
        return mixed % len(self.T)

    def len(self):
        """Return the number of keys currently stored."""
        return self.n_keys

In [9]:
# Small manual check: build a chaining_set with 1000 slots and insert four keys.
a = [2,3,5,9]
ch = chaining_set(1000)
for key in a:
    ch.insert(key)
    

In [10]:
# Delete key 2 from the chaining_set `ch`.
ch.delete(2)

In [14]:
# Scratch check: list.pop() with no argument removes the last element.
L = [1,2,3,4]
L.pop()
L

[1, 2, 3]

In [26]:
## Test your implementation
# Cross-check chaining_set against Python's built-in set on random data.

n = 10000

a = get_random_array(n, n)

queries = get_random_array(n, n)

c_set = chaining_set(2*n)
std_set = set()

for key in a:
    c_set.insert(key)
    std_set.add(key)

assert len(std_set) == c_set.len(), "Fail len!"

for key in a:
    assert c_set.lookup(key) == True, "Lookup fail a"

for key in queries:
    assert c_set.lookup(key) == (key in std_set), "Lookup fail queries"

for key in a[:300]:
    c_set.delete(key)
    # `a` may contain duplicates, so the key may already have been removed;
    # discard() ignores that case without hiding unrelated errors.
    std_set.discard(key)

    assert c_set.lookup(key) == (key in std_set), "Lookup fail delete"

# Deleting must keep the key counter in sync as well.
assert len(std_set) == c_set.len(), "Fail len after delete!"

In [24]:
%timeit for key in queries: c_set.lookup(key)
    
%timeit for key in queries: key in std_set

25.6 ms ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.83 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


----

### Exercise: Dictionary
Modify the previous code to implement a dictionary, i.e., store a value together with each key. 
You need to implement methods:
- ```Insert(key, value)```: insert the key with its value. If the key was already present, change its value;
- ```Delete(key)```: remove the key;
- ```Lookup(key)```: return True if the key is present, False otherwise;
- ```Value(key)```: return the value associated with the key. It returns None, if the key is not present.

I suggest to store pairs (key, value) within the lists.


**Optional**. 
Implement ```keys()```, ```values()```, and ```items()```, which allow you to iterate over keys, values, and pairs (key, value) respectively. You have to use ```yield``` to implement each generator.  

#### Using hashing with chains

In [20]:
#todo
class dictionary:
    """Dictionary with integer keys built on a hash table with chaining.

    ``self.T`` has one list per slot; each list stores ``(key, value)`` tuples
    for the keys that ``hash`` maps to that slot.
    """

    def __init__(self, size):
        """Create ``size`` empty chains and draw a random hash function."""
        self.T = [[] for _ in range(size)]

        self.prime = 993319
        self.a = random.randint(2, self.prime-1)
        self.b = random.randint(2, self.prime-1)
        self.n_keys = 0

    def Insert(self, key, value):
        """Store ``value`` under ``key``, replacing the old value if the key exists."""
        chain = self.T[self.hash(key)]
        for pos, pair in enumerate(chain):
            if pair[0] == key:
                # Key already stored: overwrite its pair in place.
                chain[pos] = (key, value)
                return
        chain.append((key, value))
        self.n_keys += 1

    def Lookup(self, key):
        """Return True if ``key`` is present, False otherwise."""
        chain = self.T[self.hash(key)]
        return any(key == stored for stored, _ in chain)

    def Delete(self, key):
        """Remove ``key`` and its value if present; do nothing otherwise.

        The last pair of the chain is moved into the freed position, so the
        order of the remaining pairs in the chain may change.
        """
        chain = self.T[self.hash(key)]
        for pos, pair in enumerate(chain):
            if pair[0] == key:
                chain[pos] = chain[-1]
                chain.pop()
                self.n_keys -= 1
                return

    def Value(self, key):
        """Return the value stored under ``key``, or None if the key is absent."""
        chain = self.T[self.hash(key)]
        return next((value for stored, value in chain if stored == key), None)

    def hash(self, key):
        """Map an integer ``key`` to a slot index of the table."""
        mixed = (self.a*key + self.b) % self.prime
        return mixed % len(self.T)

    def len(self):
        """Return the number of keys currently stored."""
        return self.n_keys

In [32]:
# Build a small dictionary from 10 random pairs: keys in 0..10, values in 0..100.
# C may contain repeated keys; a later Insert overwrites the earlier value.
C = get_random_array(10,10)
V = get_random_array(10,100)
b = list(zip(C,V))
D = dictionary(200)
for key, value in b:
    D.Insert(key, value)

In [33]:
# Every key that was inserted must be reported as present.
for key in C:
    assert D.Lookup(key) == True, "Lookup fail key"

In [25]:
# Membership query for key 1.
D.Lookup(1)

False

In [3]:
# Scratch check: zip pairs the i-th key with the i-th value.
a = list(zip((1,5,2),(6,3,7)))  # keys, values
a

[(1, 6), (5, 3), (2, 7)]

In [26]:
## Write here some tests to test your implementation
## Test your implementation
# Cross-check `dictionary` against Python's built-in dict on random data.

n = 10000

# Pairs to insert.
chiavi = get_random_array(n, n)
valori = get_random_array(n, n)
a = list(zip(chiavi, valori))

# Pairs used for queries.
kq = get_random_array(n,n)
vq = get_random_array(n,n)
query = list(zip(kq,vq))

dic = dictionary(2*n)
std_dic = {}

# Fill both containers in lockstep; repeated keys keep their last value.
for key, value in a:
    dic.Insert(key, value)
    std_dic[key] = value

assert len(std_dic) == dic.len(), "Fail len!"

for key in chiavi:
    assert dic.Lookup(key) == True, "Lookup fail key"

for key in kq:
    assert dic.Lookup(key) == (key in std_dic), "Lookup fail queries"

for key in chiavi[:300]:
    dic.Delete(key)
    # The key may already be gone if it appears more than once in `chiavi`.
    if key in std_dic:
        del std_dic[key]
    assert dic.Lookup(key) == (key in std_dic), 'Lookup fail delete'

for key in kq:
    assert dic.Value(key) == std_dic.get(key), 'Value fail queries'