# Использование merge sort для сортировки внешнего файла

In [2]:
import random
import struct
import os
import sys

# Raise the interpreter's recursion limit; the quick sort and merge sort
# helpers defined below are recursive.
sys.setrecursionlimit(5000)

In [3]:
# Path of the binary file that holds the numbers to sort.
FILENAME = 'numbers'
# Scratch file that merge_hdd writes a merged run to before copying it back.
FILENAME_TMP = 'numbers_tmp'

## Вспомогательные функции

In [4]:
def generate_binary_file(filename, first, last, n):
    """Create (or overwrite) a binary file with ``n`` random numbers.

    Each number is drawn with ``random.randint(first, last)`` (both bounds
    inclusive) and stored with the struct format ``'H'``.
    """
    with open(filename, 'wb') as out:
        # The generator is consumed lazily, so values are drawn and written
        # one after another without building the whole payload in memory.
        out.writelines(
            struct.pack('H', random.randint(first, last)) for _ in range(n)
        )

            
def get_binary_code(filename, ind_l=None, ind_r=None):
    """Return the raw bytes of the numbers in ``[ind_l, ind_r)``.

    Indices count 2-byte numbers, not bytes. A bound left as ``None`` means
    "from the start" / "to the end" of the file. The whole file is read and
    then sliced.
    """
    with open(filename, 'rb') as source:
        # A None slice bound behaves exactly like an omitted one.
        start = None if ind_l is None else ind_l * 2
        stop = None if ind_r is None else ind_r * 2
        return source.read()[start:stop]

            
def get_number_of_words(filename):
    """Return how many whole 2-byte numbers the file contains."""
    size_in_bytes = os.stat(filename).st_size
    return size_in_bytes // 2


def get_int(filename, index):
    """Read and return the number stored at position ``index`` of the file.

    ``index`` counts 2-byte numbers from the start of the file.
    """
    with open(filename, 'rb') as source:
        source.seek(index * 2)
        raw = source.read(2)
        (value,) = struct.unpack('H', raw)
        return value
    

def write_merged_data(filename_in, filename_out, ind_l):
    """Copy the whole content of ``filename_out`` into ``filename_in``.

    The bytes are written over ``filename_in`` starting at number position
    ``ind_l`` (byte offset ``ind_l * 2``). ``filename_in`` must already
    exist, because it is opened for in-place update.
    """
    with open(filename_in, 'rb+') as target, open(filename_out, 'rb') as merged:
        payload = merged.read()
        target.seek(ind_l * 2)
        target.write(payload)

        
def put_int_into_binary_file(filename, number):
    """Append ``number`` to the end of the file in its 2-byte binary form."""
    with open(filename, 'ab') as out:
        encoded = convert_int(number)
        out.write(encoded)

        
def convert_int(number):
    """Return ``number`` packed as bytes with the struct format ``'H'``."""
    packed = struct.pack('H', number)
    return packed


def write_merged_data_array(filename, array, ind_l):
    """Write the numbers of ``array`` into the file starting at ``ind_l``.

    Existing content is overwritten in place from byte offset ``ind_l * 2``.
    The file must already exist, because it is opened for in-place update.
    """
    with open(filename, 'rb+') as target:
        layout = 'H' * len(array)
        payload = struct.pack(layout, *array)
        target.seek(ind_l * 2)
        target.write(payload)
        

def get_ints(filename, ind_l, ind_r):
    """Return the numbers at positions ``[ind_l, ind_r)`` as a list.

    ``ind_l`` is inclusive and ``ind_r`` is exclusive; both count 2-byte
    numbers from the start of the file.
    """
    count = ind_r - ind_l
    with open(filename, 'rb') as source:
        source.seek(ind_l * 2)
        raw = source.read(count * 2)
        values = struct.unpack('H' * count, raw)
        return list(values)

## Реализация quick sort

In [5]:
def sort_quick(array, randomized=False):
    """Sort ``array`` in place with quick sort.

    With ``randomized=True`` the pivot of every partition step is chosen at
    random; otherwise the last element of the range is used.
    """
    last_index = len(array) - 1
    sort_quick_helper(array, 0, last_index, randomized)
    

def sort_quick_helper(array, ind_l, ind_r, randomized):
    """Sort ``array[ind_l..ind_r]`` (both bounds inclusive) in place.

    Only the smaller side of each partition is handled recursively; the
    larger side is processed by the loop. This keeps the recursion depth
    logarithmic in the range size, even for already sorted input or input
    with many equal values, where plain two-way recursion would go as deep
    as the range is long.

    ``randomized`` is passed through to ``partition``.
    """
    while ind_l < ind_r:
        ind_c = partition(array, ind_l, ind_r, randomized)
        # array[ind_c] is now in its final position; sort both sides.
        if ind_c - ind_l < ind_r - ind_c:
            sort_quick_helper(array, ind_l, ind_c - 1, randomized)
            ind_l = ind_c + 1
        else:
            sort_quick_helper(array, ind_c + 1, ind_r, randomized)
            ind_r = ind_c - 1


def partition(array, ind_l, ind_r, randomized):
    """Partition ``array[ind_l..ind_r]`` (inclusive) around a pivot, in place.

    The pivot is the last element of the range; with ``randomized`` set, a
    randomly chosen element is first swapped into that position. Afterwards
    every element left of the returned index is ``<=`` the pivot and every
    element right of it is greater.

    Returns the final index of the pivot.
    """
    if randomized:
        chosen = random.randint(ind_l, ind_r)
        array[ind_r], array[chosen] = array[chosen], array[ind_r]
    pivot = array[ind_r]
    # ``boundary`` is the first slot not yet known to hold a value <= pivot.
    boundary = ind_l
    # The scan includes ind_r, so the pivot itself is moved into place by
    # the last iteration.
    for scan in range(ind_l, ind_r + 1):
        if array[scan] <= pivot:
            array[boundary], array[scan] = array[scan], array[boundary]
            boundary += 1
    return boundary - 1

## Реализация merge sort

In [14]:
def merge_hdd(filename, ind_l, ind_c, ind_r):
    """Merge two adjacent sorted runs of ``filename`` on disk.

    The runs are the number positions ``[ind_l, ind_c]`` and
    ``[ind_c + 1, ind_r]`` (all bounds inclusive). The merged sequence is
    first written to the scratch file ``FILENAME_TMP`` and then copied back
    over positions ``ind_l..ind_r`` of ``filename``. The scratch file is
    removed afterwards, also when an error interrupts the merge.
    """
    try:
        with open(FILENAME_TMP, 'wb') as fw:
            pos_l = ind_l
            pos_r = ind_c + 1
            int_l = int_r = None
            # A set flag means the value at that position has not been read
            # yet. Reading lazily avoids touching an empty run and re-reads
            # only the side that was just consumed.
            flag_l = True
            flag_r = True
            while pos_l <= ind_c and pos_r <= ind_r:
                if flag_l:
                    int_l = get_int(filename, pos_l)
                    flag_l = False
                if flag_r:
                    int_r = get_int(filename, pos_r)
                    flag_r = False
                if int_l < int_r:
                    fw.write(convert_int(int_l))
                    pos_l += 1
                    flag_l = True
                else:
                    fw.write(convert_int(int_r))
                    pos_r += 1
                    flag_r = True
            # At most one of the two runs still has numbers left; copy them.
            while pos_l <= ind_c:
                fw.write(convert_int(get_int(filename, pos_l)))
                pos_l += 1
            while pos_r <= ind_r:
                fw.write(convert_int(get_int(filename, pos_r)))
                pos_r += 1
        # The scratch file is closed (and flushed) here, so it can be copied.
        write_merged_data(filename, FILENAME_TMP, ind_l)
    finally:
        if os.path.exists(FILENAME_TMP):
            os.remove(FILENAME_TMP)

    
def sort_merge_hdd_helper(filename, ind_l, ind_r, chunk_size):
    """Sort the number positions ``ind_l..ind_r`` (inclusive) of the file.

    The range is split in half, each half is sorted recursively and the two
    halves are merged on disk with ``merge_hdd``. When ``chunk_size`` is
    non-zero, a range of at most ``chunk_size`` numbers is instead loaded
    into memory, sorted with quick sort and written back in one go.
    """
    if ind_l >= ind_r:
        return
    if chunk_size and (ind_r - ind_l) < chunk_size:
        # ind_r is inclusive here, while get_ints expects an exclusive end;
        # without the + 1 the last number of the chunk would stay unsorted.
        array = get_ints(filename, ind_l, ind_r + 1)
        sort_quick(array, randomized=True)
        write_merged_data_array(filename, array, ind_l)
        return
    ind_c = ind_l + (ind_r - ind_l) // 2
    sort_merge_hdd_helper(filename, ind_l, ind_c, chunk_size)
    sort_merge_hdd_helper(filename, ind_c+1, ind_r, chunk_size)
    merge_hdd(filename, ind_l, ind_c, ind_r)

    
def sort_merge_hdd(filename, chunk_size=0):
    """Sort all numbers stored in ``filename`` in place with merge sort.

    ``chunk_size`` is forwarded to ``sort_merge_hdd_helper``; the default of
    0 disables the in-memory sorting of small ranges.
    """
    last_index = get_number_of_words(filename) - 1
    sort_merge_hdd_helper(filename, 0, last_index, chunk_size)

## Оценка времени работы merge sort

In [7]:
# Inclusive bounds of the generated values (passed to random.randint).
FIRST = 0
LAST = 2**16 - 1
# How many numbers the test file contains.
NUMBER = 100000

In [8]:
# Create the test file with NUMBER random values and show the first 10.
generate_binary_file(FILENAME, FIRST, LAST, NUMBER)
print(get_ints(FILENAME, 0, 10))

[16310, 44498, 17540, 49949, 60091, 39118, 28154, 20691, 30767, 53211]


In [9]:
%%time
# Pure on-disk merge sort: chunk_size keeps its default of 0.
sort_merge_hdd(FILENAME)

CPU times: user 11.6 s, sys: 7.78 s, total: 19.4 s
Wall time: 19.6 s


In [10]:
# Spot check: print the 14 numbers around the middle of the file.
print(get_ints(FILENAME, NUMBER//2-7, NUMBER//2+7))

[32890, 32891, 32891, 32891, 32892, 32892, 32894, 32895, 32895, 32895, 32895, 32896, 32897, 32897]


In [11]:
# Delete the data file; the next experiment generates a fresh one.
os.remove(FILENAME)

## Оценка времени работы merge sort с использованием quick sort на малых массивах

In [18]:
# Create a new test file with NUMBER random values and show the first 10.
generate_binary_file(FILENAME, FIRST, LAST, NUMBER)
print(get_ints(FILENAME, 0, 10))

[37402, 50596, 58564, 7728, 47053, 4370, 1204, 33211, 7664, 4610]


In [19]:
%%time
# Hybrid run: ranges of at most 128 numbers are sorted in memory with quick sort.
sort_merge_hdd(FILENAME, chunk_size=128)

CPU times: user 5.36 s, sys: 1.98 s, total: 7.34 s
Wall time: 7.39 s


In [20]:
# Spot check: print the 14 numbers around the middle of the file.
print(get_ints(FILENAME, NUMBER//2-7, NUMBER//2+7))

[33080, 33082, 33082, 33083, 33085, 33087, 33088, 33088, 33088, 33089, 33089, 33090, 33090, 33092]


In [21]:
# Delete the generated data file.
os.remove(FILENAME)

## Выводы

При сортировке 100000 целых чисел от 0 до 65535, записанных в бинарный файл, комбинированный метод сортировки (merge + quick sort) показывает лучший результат по сравнению с чистым merge sort (7.39 s против 19.6 s). На малых массивах (до 128 чисел) сортировка происходит в памяти, что позволяет избавиться от большого числа операций чтения/записи на жесткий диск.