# Использование radix sort для сортировки внешнего файла

In [123]:
import random
import struct
import os
import shutil

In [4]:
# Path of the binary file that holds the numbers to be sorted.
FILENAME = 'numbers'
# Scratch file: sort_radix_hdd_helper writes each pass into it and then
# copies it back over the input file.
FILENAME_TMP = 'numbers_tmp'

## Вспомогательные функции

In [95]:
def generate_binary_file(filename, first, last, n):
    """Create (or overwrite) ``filename`` with ``n`` random integers.

    Each value is drawn with ``random.randint(first, last)`` (both bounds
    inclusive) and stored as one struct ``'H'`` record (unsigned short,
    native byte order), one record after another.
    """
    pack_value = struct.Struct('H').pack
    with open(filename, 'wb') as out:
        # Values are produced lazily, so the records are written in the
        # same order in which they are drawn.
        out.writelines(
            pack_value(random.randint(first, last)) for _ in range(n)
        )
            
            
def generate_ints_from_file(filename):
    """Yield the integers stored in ``filename`` from first to last.

    The file is read two bytes at a time and every chunk is decoded as one
    struct ``'H'`` record. Iteration stops when a read returns no data.
    """
    unpack_value = struct.Struct('H').unpack
    with open(filename, 'rb') as source:
        # iter() with a sentinel keeps calling read(2) until it returns b''.
        for raw in iter(lambda: source.read(2), b''):
            (value,) = unpack_value(raw)
            yield value


def generate_ints_from_file_reversed(filename):
    """Yield the integers stored in ``filename`` from last to first.

    Starting at the end of the file, the read position is moved back two
    bytes at a time and the two bytes found there are decoded as one struct
    ``'H'`` record.
    """
    unpack_value = struct.Struct('H').unpack
    with open(filename, 'rb') as source:
        # seek() returns the new absolute position, i.e. the file size here.
        offset = source.seek(0, os.SEEK_END)
        while offset > 0:
            offset -= 2
            source.seek(offset)
            (value,) = unpack_value(source.read(2))
            yield value
        

def convert_int(number):
    """Return ``number`` encoded as one struct ``'H'`` record (bytes)."""
    record = struct.Struct('H')
    return record.pack(number)


def get_ints(filename, ind_l, ind_r):
    """Return the integers at record indices ``[ind_l, ind_r)`` as a list.

    Indices count two-byte records from the start of the file; ``ind_l`` is
    inclusive and ``ind_r`` is exclusive.
    """
    with open(filename, 'rb') as source:
        # Every record occupies two bytes, hence the factor of 2.
        source.seek(2 * ind_l)
        count = ind_r - ind_l
        raw = source.read(2 * count)
    values = struct.unpack('H' * count, raw)
    return list(values)

## Реализация radix sort

In [14]:
def sort_radix(array, base, max_exp):
    """Sort ``array`` in place with a least-significant-digit radix sort.

    One counting pass is run for every digit position from 0 up to and
    including ``max_exp``, with digits taken in the given ``base``.
    """
    for digit_position in range(max_exp + 1):
        sort_radix_helper(array, base, digit_position)

def sort_radix_helper(array, base, exp):
    """Reorder ``array`` in place by the digit at position ``exp``.

    This is one counting-sort pass: elements with equal digits keep their
    relative order, which is what lets successive passes build up a full
    radix sort.
    """
    # Count how many times each digit value (0 .. base-1) occurs.
    positions = [0] * base
    for value in array:
        positions[get_digit(value, base, exp)] += 1
    # Prefix sums: positions[d] is now one past the last slot for digit d.
    for digit in range(1, base):
        positions[digit] += positions[digit - 1]
    size = len(array)
    ordered = [0] * size
    # Walk the input backwards, filling each digit's slots from the end, so
    # that equal digits come out in their original order.
    for idx in reversed(range(size)):
        digit = get_digit(array[idx], base, exp)
        positions[digit] -= 1
        ordered[positions[digit]] = array[idx]
    # Copy the result back element by element into the caller's sequence.
    for idx, value in enumerate(ordered):
        array[idx] = value
        
def get_digit(number, base, exp):
    """Return the digit of ``number`` at position ``exp`` in ``base``.

    Position 0 is the least significant digit.
    """
    weight = base ** exp
    shifted = number // weight
    return shifted % base

## Реализация radix sort для сортировки файла

In [82]:
def sort_radix_hdd(filename, base, max_exp):
    """Sort the integers stored in ``filename`` with an on-disk radix sort.

    One file-based counting pass is run for every digit position from 0 up
    to and including ``max_exp``, with digits taken in the given ``base``.
    """
    for digit_position in range(max_exp + 1):
        sort_radix_hdd_helper(filename, base, digit_position)

def sort_radix_hdd_helper(filename, base, exp):
    """Run one counting-sort pass over ``filename``, keyed on digit ``exp``.

    The file is read twice: once forwards to count how often each digit
    occurs, and once backwards to place every value at its final position
    in the scratch file ``FILENAME_TMP``. The scratch file is then copied
    over ``filename`` and removed.

    Args:
        filename: path of the binary file of two-byte records to reorder.
        base: radix in which the digits are taken.
        exp: digit position to sort by (0 is the least significant digit).
    """
    # Count how many times each digit value (0 .. base-1) occurs.
    counter = [0] * base
    for number in generate_ints_from_file(filename):
        counter[get_digit(number, base, exp)] += 1
    # Prefix sums: counter[d] is now one past the last slot for digit d.
    for i in range(1, base):
        counter[i] += counter[i - 1]
    # TODO: extract the redistribution step into its own function.
    try:
        with open(FILENAME_TMP, 'wb') as fw:
            # Walk the input backwards, filling each digit's slots from the
            # end, so that equal digits keep their original relative order.
            for number in generate_ints_from_file_reversed(filename):
                digit = get_digit(number, base, exp)
                counter[digit] -= 1
                # Records are two bytes wide, so slot k starts at byte 2*k.
                fw.seek(counter[digit] * 2)
                fw.write(convert_int(number))
        shutil.copyfile(FILENAME_TMP, filename)
    finally:
        # Do not leave the scratch file behind, even if the pass failed.
        if os.path.exists(FILENAME_TMP):
            os.remove(FILENAME_TMP)

## Оценка времени работы radix sort для сортировки файла

In [118]:
# Inclusive bounds of the random values written to the test file.
FIRST = 0
# Largest value that fits in the two-byte unsigned records used by the helpers.
LAST = 2**16 - 1
# How many values the test file contains.
NUMBER = 100000

In [119]:
# Create the random input file and print its first ten values.
generate_binary_file(FILENAME, FIRST, LAST, NUMBER)
print(get_ints(FILENAME, 0, 10))

[12426, 49609, 61071, 26123, 39596, 1084, 6041, 22627, 25043, 32902]


In [120]:
%%time
# Sort the file in base 10; max_exp=5 makes sort_radix_hdd run passes for
# digit positions 0..5.
# NOTE(review): values up to 65535 have only five decimal digits, so the pass
# for position 5 looks redundant - confirm whether max_exp=4 was intended.
sort_radix_hdd(FILENAME, 10, 5)

CPU times: user 2.36 s, sys: 817 ms, total: 3.18 s
Wall time: 3.18 s


In [121]:
# Print 14 values around the middle of the file to eyeball the result.
print(get_ints(FILENAME, NUMBER//2-7, NUMBER//2+7))

[32752, 32752, 32753, 32754, 32755, 32755, 32755, 32755, 32756, 32759, 32759, 32759, 32760, 32760]


In [122]:
# Delete the generated data file.
os.remove(FILENAME)

## Выводы

При сортировке 100 000 целых чисел из диапазона от 0 до 65535, записанных в бинарный файл, поразрядная сортировка (radix sort) оказалась быстрее комбинированной сортировки слиянием (merge sort + quick sort): 3.18 s против 7.39 s.