# Хеш-таблица (метод цепочек) с применением универсальных хеш-функций 

In [27]:
import random
import sys
from collections import namedtuple
from copy import deepcopy

from algorithms_py.trees import TreeBinary, TreeSplay

sys.setrecursionlimit(30000)

## Универсальные хеш-функции

In [2]:
def get_first_prime_after_number(number):
    """Return the smallest prime strictly greater than ``number``.

    The search uses a sieve of Eratosthenes over ``[0, 2 * number)``.
    By Bertrand's postulate that interval always contains a prime above
    ``number`` when ``number >= 2``; smaller inputs are answered directly.

    Args:
        number: Integer lower bound (exclusive).

    Returns:
        The first prime ``p`` with ``p > number``.
    """
    # No sieve is needed (or possible) below 2: the first prime is 2.
    if number < 2:
        return 2

    limit = 2 * number
    is_prime = [True] * limit
    # Crossing out multiples of i is only useful while i * i < limit.
    for i in range(2, int(limit ** 0.5) + 1):
        if is_prime[i]:
            for multiple in range(i * i, limit, i):
                is_prime[multiple] = False

    for candidate in range(number + 1, limit):
        if is_prime[candidate]:
            return candidate
        

class HashUniversal:
    """Randomly parameterised hash function ``((a * key + b) mod p) mod m``.

    ``a`` is drawn from ``[1, p - 1]`` and ``b`` from ``[0, p - 1]`` when the
    object is created, so every instance is a different hash function.
    """

    def __init__(self, p, m):
        """Store the modulus ``p`` and table size ``m`` and draw ``a``, ``b``."""
        self.p, self.m = p, m
        upper = p - 1
        # ``a`` is drawn before ``b``; ``a`` may not be zero.
        self.a = random.randint(1, upper)
        self.b = random.randint(0, upper)

    def get_hash_value(self, key):
        """Map ``key`` to a bucket index in ``range(m)``."""
        linear = self.a * key + self.b
        return (linear % self.p) % self.m

## Реализация хеш-таблицы

In [21]:
# One stored key/value pair; list-based chains hold these records.
Element = namedtuple('Element', ['key', 'value'])


class HashTable:
    """Hash table with separate chaining.

    Every bucket is ``None`` (empty), a ``list`` of ``Element`` records, or a
    ``TreeBinary`` once a list chain grows past ``_MAX_LIST_CHAIN`` entries.
    Inserting an existing key replaces its value, as in a ``dict``.
    """

    # A list chain longer than this is converted into a TreeBinary.
    _MAX_LIST_CHAIN = 32
    # The table doubles once the share of occupied buckets exceeds this.
    _MAX_LOAD_FACTOR = 0.7

    def __init__(self, max_key=10000):
        """Create an empty table with 100 buckets.

        Args:
            max_key: Value passed to ``get_first_prime_after_number`` to pick
                the modulus ``p`` of the hash function.
        """
        self.m = 100
        self.p = get_first_prime_after_number(max_key)
        self.hash_func = HashUniversal(self.p, self.m)
        self.create_new_buckets()

    def create_new_buckets(self):
        """Replace ``self.buckets`` with ``self.m`` empty buckets."""
        self.buckets = [None] * self.m

    def insert_element(self, key, value):
        """Store ``value`` under ``key`` without checking the load factor."""
        index = self.hash_func.get_hash_value(key)
        chain = self.buckets[index]

        # Empty bucket: start a new list chain.
        if chain is None:
            self.buckets[index] = [Element(key, value)]
            return

        # Tree bucket: delegate to the tree.
        # NOTE(review): how TreeBinary.insert treats an already-present key is
        # not visible here -- confirm it replaces the value.
        if not isinstance(chain, list):
            chain.insert(key, value)
            return

        # List bucket: overwrite an existing key instead of adding a duplicate.
        for position, element in enumerate(chain):
            if element.key == key:
                chain[position] = Element(key, value)
                return
        chain.append(Element(key, value))

        # Convert only after the new element is in the chain, so it is not lost.
        if len(chain) > self._MAX_LIST_CHAIN:
            tree = TreeBinary()
            for element in chain:
                tree.insert(element.key, element.value)
            self.buckets[index] = tree

    def insert(self, key, value):
        """Store ``value`` under ``key`` and grow the table when it is too full."""
        self.insert_element(key, value)
        if self.load_factor > self._MAX_LOAD_FACTOR:
            self.resize()

    def get_value(self, key):
        """Return the value stored under ``key``.

        Raises:
            KeyError: If the bucket is empty or its list chain lacks ``key``.
                For tree buckets the outcome is whatever
                ``TreeBinary.get_value`` does for a missing key.
        """
        chain = self.buckets[self.hash_func.get_hash_value(key)]
        if chain is None:
            raise KeyError(key)
        if isinstance(chain, list):
            for element in chain:
                if element.key == key:
                    return element.value
            raise KeyError(key)
        return chain.get_value(key)

    def remove(self, key):
        """Delete ``key`` from the table; an emptied bucket is reset to ``None``.

        Raises:
            KeyError: If the bucket is empty or its list chain lacks ``key``.
                For tree buckets the outcome is whatever
                ``TreeBinary.remove`` does for a missing key.
        """
        index = self.hash_func.get_hash_value(key)
        chain = self.buckets[index]
        if chain is None:
            raise KeyError(key)

        if isinstance(chain, list):
            for position, element in enumerate(chain):
                if element.key == key:
                    # Stop right after deleting: the list must not be mutated
                    # while the loop keeps iterating over it.
                    del chain[position]
                    break
            else:
                raise KeyError(key)
            if not chain:
                self.buckets[index] = None
            return

        chain.remove(key)
        if chain.is_empty():
            self.buckets[index] = None

    @property
    def load_factor(self):
        """Share of non-empty buckets (not the element count divided by ``m``)."""
        occupied = sum(bucket is not None for bucket in self.buckets)
        return occupied / self.m

    def resize(self):
        """Double the bucket count and re-insert everything with a new hash."""
        # insert_element never mutates the old chains, so keeping a reference
        # to the old bucket list is enough; no copy of the table is required.
        old_buckets = self.buckets
        self.m *= 2
        self.create_new_buckets()
        self.hash_func = HashUniversal(self.p, self.m)
        for bucket in old_buckets:
            if bucket is not None:
                # NOTE(review): assumes iterating a TreeBinary yields objects
                # with ``key`` and ``value`` attributes -- confirm.
                for element in bucket:
                    self.insert_element(element.key, element.value)

    def __iter__(self):
        """Yield the contents of every non-empty bucket, bucket by bucket.

        List buckets yield ``Element`` records; tree buckets yield whatever
        iterating a ``TreeBinary`` produces.
        """
        for bucket in self.buckets:
            if bucket is not None:
                for element in bucket:
                    yield element

## Тест работы основных операций

- 10000 элементов
- Добавление, поиск, удаление
- Проверяем при помощи питоновской реализации словаря

In [12]:
X = 999    # largest random value that can be stored
N = 10000  # number of distinct keys

# Keys 0..N-1, each paired with a random value from [0, X], in shuffled order.
elements = [(key, random.randint(0, X)) for key in range(N)]
random.shuffle(elements)

hash_table = HashTable(max_key=N)
py_dict = {}

# Fill the hash table and a reference dict with identical content.
for key, value in elements:
    hash_table.insert(key, value)
    py_dict[key] = value

In [13]:
%%timeit -r 1 -n 1 -o -q

# Every pair held by the reference dict must come back from the hash table;
# a mismatch now fails loudly instead of being silently skipped.
for k, v in py_dict.items():
    assert hash_table.get_value(k) == v, "value mismatch for key %r" % (k,)

<TimeitResult : 9 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [14]:
%%timeit -r 1 -n 1 -o -q

# Delete every key of the reference dict from the hash table;
# the stored values are not needed for removal.
for k in py_dict:
    hash_table.remove(k)

<TimeitResult : 13.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

## Тест операции поиска в сравнении со splay-деревом

In [75]:
def generate_structures(N):
    """Build a hash table and a splay tree holding the same ``N`` pairs.

    Keys are ``0..N-1`` inserted in shuffled order; each value is a random
    integer from ``[0, 999]``.

    Returns:
        Tuple ``(hash_table, tree_splay)``.
    """
    pairs = [(key, random.randint(0, 999)) for key in range(N)]
    random.shuffle(pairs)
    tree_splay, hash_table = TreeSplay(), HashTable(max_key=N)
    for key, value in pairs:
        tree_splay.insert(key, value)
        hash_table.insert(key, value)
    return hash_table, tree_splay

- N - количество элементов хеш-таблицы/дерева
- M - количество случайных ключей в заданном диапазоне

### Тест 1

- N = 10**4
- M = 10**4
- Диапазон ключей: [0, N-1]

In [76]:
N = 10000  # elements stored in each structure
M = 10000  # number of lookups per benchmark
hash_table, tree_splay = generate_structures(N)
# M keys drawn from the whole key range [0, N-1] (both ends inclusive).
keys = [random.randint(0, N - 1) for _ in range(M)]

In [77]:
%%timeit -r 1 -n 1 -o -q

# Look up every sampled key in the hash table; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    hash_table.get_value(key)

<TimeitResult : 9.95 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [78]:
%%timeit -r 1 -n 1 -o -q

# Look up the same sampled keys in the splay tree; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    tree_splay.get_value(key)

<TimeitResult : 210 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

### Тест 2

- N = 10**4
- M = 10**4
- Диапазон ключей: [0, N/10]

In [81]:
# Fresh structures for this test.
hash_table, tree_splay = generate_structures(N)
# M keys drawn from [0, N/10] (both ends inclusive), so lookups repeat often.
keys = [random.randint(0, N // 10) for _ in range(M)]

In [82]:
%%timeit -r 1 -n 1 -o -q

# Look up every sampled key in the hash table; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    hash_table.get_value(key)

<TimeitResult : 7.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [83]:
%%timeit -r 1 -n 1 -o -q

# Look up the same sampled keys in the splay tree; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    tree_splay.get_value(key)

<TimeitResult : 151 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

### Тест 3

- N = 10**4
- M = 10**4
- Диапазон ключей: [0, N/100]

In [84]:
# Fresh structures for this test.
hash_table, tree_splay = generate_structures(N)
# M keys drawn from [0, N/100] (both ends inclusive), so lookups repeat often.
keys = [random.randint(0, N // 100) for _ in range(M)]

In [85]:
%%timeit -r 1 -n 1 -o -q

# Look up every sampled key in the hash table; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    hash_table.get_value(key)

<TimeitResult : 6.92 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [86]:
%%timeit -r 1 -n 1 -o -q

# Look up the same sampled keys in the splay tree; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    tree_splay.get_value(key)

<TimeitResult : 89.9 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

### Тест 4

- N = 10**4
- M = 10**4
- Диапазон ключей: [0, N/1000]

In [87]:
# Fresh structures for this test.
hash_table, tree_splay = generate_structures(N)
# M keys drawn from [0, N/1000] (both ends inclusive), so lookups repeat often.
keys = [random.randint(0, N // 1000) for _ in range(M)]

In [88]:
%%timeit -r 1 -n 1 -o -q

# Look up every sampled key in the hash table; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    hash_table.get_value(key)

<TimeitResult : 6.76 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [89]:
%%timeit -r 1 -n 1 -o -q

# Look up the same sampled keys in the splay tree; the results are discarded
# because only the elapsed time of the cell is of interest.
for key in keys:
    tree_splay.get_value(key)

<TimeitResult : 39.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

По времени выполнения операции поиска хеш-таблица превосходит splay-дерево: во всех тестах поиск в ней занимает менее 10 ms. При этом splay-дерево заметно ускоряется, когда запрашиваемые ключи часто повторяются (с 210 ms до 39 ms).