In [1]:
import kagglehub
from time import time
import numpy as np
    
# Fetch the GloVe 6B/300d dataset through kagglehub; the returned value is
# used below as the directory prefix of the embeddings text file
# (presumably the local download folder -- confirm against kagglehub docs).
path = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
glove_path = f"{path}/glove.6B.300d.txt"

In [None]:
import os
import tempfile
import random
import heapq
import shutil

def tag_lines_with_random_keys(input_path, temp_dir, lines_per_chunk=100000):
    """Split a text file into sorted chunk files of randomly keyed lines.

    Every input line is paired with a key drawn from ``random.random()``.
    Lines are buffered in groups of at most ``lines_per_chunk``; each group is
    sorted by key and written to its own file inside ``temp_dir``, one record
    per line in the form: key (17 decimals), a tab, then the original line.

    Args:
        input_path: Path of the text file to read.
        temp_dir: Existing directory that receives the chunk files.
        lines_per_chunk: Maximum number of lines buffered in memory and
            written to a single chunk file. Must be at least 1.

    Returns:
        List of chunk file paths in creation order (empty if the input file
        has no lines).

    Raises:
        ValueError: If ``lines_per_chunk`` is less than 1.
    """
    if lines_per_chunk < 1:
        # A non-positive size would otherwise read nothing and silently
        # produce zero chunks, i.e. an empty shuffled output.
        raise ValueError("lines_per_chunk must be at least 1")

    chunk_files = []

    def _write_chunk(keyed_lines):
        # Sorting is mandatory, not an optimisation: the chunks are later
        # combined with heapq.merge, which assumes each input is already sorted.
        keyed_lines.sort()
        # Public tempfile API; delete=False keeps the file for the merge step.
        with tempfile.NamedTemporaryFile(
            'w', suffix=".txt", dir=temp_dir, delete=False
        ) as chunk:
            chunk.writelines(f"{key:.17f}\t{line}" for key, line in keyed_lines)
        chunk_files.append(chunk.name)

    buffered = []
    with open(input_path, 'r') as infile:
        for line in infile:
            # The final line of a file may lack a newline. After sorting it can
            # land in the middle of a chunk, where it would fuse with the next
            # record, so every record is terminated explicitly.
            if not line.endswith('\n'):
                line += '\n'
            buffered.append((random.random(), line))
            if len(buffered) >= lines_per_chunk:
                _write_chunk(buffered)
                buffered = []

    if buffered:
        _write_chunk(buffered)
    return chunk_files

def merge_sorted_chunks(chunk_files, output_path):
    """Merge key-sorted chunk files into one output file, dropping the keys.

    Each line of a chunk file is expected to be a float key, a tab, and the
    payload. The chunks are streamed through ``heapq.merge`` so records are
    emitted in ascending key order; only the payload is written out.

    Args:
        chunk_files: Iterable of paths to chunk files, each sorted by key.
        output_path: Path of the file to (over)write with the merged payloads.

    Raises:
        ValueError: If a chunk line has no tab or its key is not a float.
    """
    def keyed_records(chunk_path):
        # Lazily yield (key, payload) pairs from a single chunk file.
        with open(chunk_path, 'r') as chunk:
            for record in chunk:
                raw_key, payload = record.split('\t', 1)
                yield float(raw_key), payload

    with open(output_path, 'w') as sink:
        streams = [keyed_records(chunk_path) for chunk_path in chunk_files]
        # k-way merge: only one pending record per chunk is held at a time.
        merged = heapq.merge(*streams)
        sink.writelines(payload for _key, payload in merged)

def shuffle_large_file_external(input_path, output_path, lines_per_chunk=100000,
                                temp_root=None):
    """Shuffle the lines of a text file using temporary on-disk chunks.

    The input is split into key-sorted chunk files by
    ``tag_lines_with_random_keys`` and recombined by ``merge_sorted_chunks``;
    the chunk files live in a temporary directory that is removed on exit.

    Args:
        input_path: Path of the text file to shuffle.
        output_path: Path of the shuffled file to write.
        lines_per_chunk: Forwarded to ``tag_lines_with_random_keys``; maximum
            number of lines per chunk file. The default matches the helper's
            default, so existing two-argument calls behave as before.
        temp_root: Directory in which the temporary chunk directory is
            created. ``None`` (default) uses the platform's default temp
            location; pass a path on a larger disk when the chunks would not
            fit there.
    """
    with tempfile.TemporaryDirectory(dir=temp_root) as temp_dir:
        chunk_files = tag_lines_with_random_keys(
            input_path, temp_dir, lines_per_chunk=lines_per_chunk
        )
        merge_sorted_chunks(chunk_files, output_path)
        # Leaving the with-block deletes temp_dir together with every chunk file.

# Example usage (executed in the next cell):
#     shuffle_large_file_external(glove_path, 'shuffled_output.txt')

In [4]:
%%time
# Shuffle the GloVe file whose path was built in the first cell; the relative
# output path places 'shuffled_output.txt' in the current working directory.
shuffle_large_file_external(glove_path, 'shuffled_output.txt')

CPU times: user 1.75 s, sys: 1.78 s, total: 3.54 s
Wall time: 3.79 s
