In [1]:
import datetime

In [2]:
import numpy as np
import pandas as pd

start_time = datetime.datetime.now()

size_million = 200 # Million Numbers

# Generate 2 million random 64-bit integers
random_numbers = np.random.randint(low=np.iinfo(np.int64).min,
                                   high=np.iinfo(np.int64).max,
                                   size= size_million * 1_000_000,
                                   dtype=np.int64)

# Convert the array to a DataFrame
df = pd.DataFrame(random_numbers, columns=['RandomNumber'])

# Save the DataFrame to a CSV file
df.to_csv('random_numbers.txt', index=False, header=False)

print(f"{size_million} million random 64-bit numbers have been generated and saved to 'random_numbers.txt'.")

end_time = datetime.datetime.now()

print(f"Total runtime of the program is {end_time - start_time}")


200 million random 64-bit numbers have been generated and saved to 'random_numbers.txt'.
Total runtime of the program is 0:02:31.721260


In [3]:
import heapq
import os

import datetime

start_time = datetime.datetime.now()

def external_merge_sort(input_file, output_file, run_size):
    # Read input_file in chunks (runs) and sort each run
    with open(input_file, 'r') as infile:
        runs = []
        while True:
            chunk = infile.read(run_size)
            if not chunk:
                break
            run = sorted(map(int, chunk.split()))  # Assuming integers, adapt as needed
            runs.append(run)

    # Merge the sorted runs
    with open(output_file, 'w') as outfile:
        heap = [(run[0], i, 0) for i, run in enumerate(runs)]
        heapq.heapify(heap)

        while heap:
            val, run_idx, idx = heapq.heappop(heap)
            outfile.write(f"{val}\n")

            if idx + 1 < len(runs[run_idx]):
                next_val = runs[run_idx][idx + 1]
                heapq.heappush(heap, (next_val, run_idx, idx + 1))

    print(f"Sorted data written to {output_file}")

# Example usage
input_file = 'random_numbers.txt'  # Replace with your input file
output_file = 'sorted_data.txt'  # Replace with your output file
run_size = 200 * 1024 * 1024  # 200 MB in bytes

external_merge_sort(input_file, output_file, run_size)

end_time = datetime.datetime.now()

print(f"Total runtime of the program is {end_time - start_time}")



Sorted data written to sorted_data.txt
Total runtime of the program is 0:06:29.808588
