In [1]:
import pandas as pd
import os
import heapq



#### Split the input file into chunks and sort each chunk

In [2]:
def split_file(input_file, chunk_size):
    """Split a CSV into individually sorted chunk files.

    Each chunk of at most ``chunk_size`` rows is read with pandas, its
    'trade date' column is parsed from ``YYYYMMDD``, the rows are sorted by
    ('ts code', 'trade date'), and the result is written to
    ``temp_chunk_<i>.csv`` in the current working directory.

    Args:
        input_file: Path of the CSV to split. It must contain the columns
            'ts code' and 'trade date'.
        chunk_size: Maximum number of data rows per chunk file.

    Returns:
        List of chunk file names, in the order they were written.
    """
    chunks = []
    # Read 'ts code' as text. merge_files orders rows by comparing this field
    # as a string, so the per-chunk sort has to use the same ordering; it also
    # keeps leading zeros that integer inference would silently drop.
    reader = pd.read_csv(input_file, chunksize=chunk_size, dtype={'ts code': str})
    for i, chunk in enumerate(reader):
        chunk_file = f'temp_chunk_{i}.csv'
        chunk['trade date'] = pd.to_datetime(chunk['trade date'], format='%Y%m%d')
        ordered = chunk.sort_values(by=['ts code', 'trade date'])
        ordered.to_csv(chunk_file, index=False)
        chunks.append(chunk_file)
    return chunks



#### Merge the sorted chunks into one file using a k-way (multiway) merge

In [3]:
def _push_next_row(heap, files, i):
    """Push the next non-blank row of ``files[i]`` onto ``heap``; no-op at EOF."""
    while True:
        raw = files[i].readline()
        if not raw:
            # readline() returns '' only at end of file.
            return
        line = raw.strip()
        if line:
            ts_code, trade_date = line.split(',')[:2]
            # The chunk index breaks ties between equal keys, so rows from
            # earlier chunks are emitted first.
            heapq.heappush(heap, (ts_code, trade_date, i, line))
            return
        # Blank line: skip it instead of mistaking it for end of file.


def merge_files(chunks, output_file):
    """Merge sorted chunk CSVs into ``output_file`` with a k-way heap merge.

    The header is taken from the first chunk and the header line of every
    other chunk is discarded. Rows are ordered by their first two
    comma-separated fields, compared as strings.

    Args:
        chunks: Paths of the chunk files, each already sorted by its first
            two fields.
        output_file: Path of the merged file to write. If ``chunks`` is
            empty, an empty file is created.
    """
    files = []
    try:
        with open(output_file, 'w', encoding='utf-8') as fout:
            for chunk in chunks:
                files.append(open(chunk, 'r', encoding='utf-8'))
            if not files:
                return

            headers = files[0].readline().strip()
            fout.write(headers + '\n')
            for f in files[1:]:
                f.readline()

            heap = []
            for i in range(len(files)):
                _push_next_row(heap, files, i)

            while heap:
                _, _, i, line = heapq.heappop(heap)
                fout.write(line + '\n')
                # Refill from the chunk that just supplied the smallest row.
                _push_next_row(heap, files, i)
    finally:
        # Close every chunk that was opened, even if the merge failed midway.
        for f in files:
            f.close()


#### Create index

In [4]:

def create_index(output_file, index_file):
    """Write a ``code,month,offset`` index for a sorted CSV.

    One index line is emitted for the first row of every run of rows that
    share the same code (first field) and month. The month is the first six
    digits of the second field after removing dashes, so both ``YYYYMMDD``
    and ``YYYY-MM-DD`` dates produce ``YYYYMM``.

    Args:
        output_file: Path of the sorted CSV; its first line is treated as a
            header and is not indexed.
        index_file: Path of the index file to write.

    The offset is the byte position of the row in ``output_file``, suitable
    for ``seek`` on a handle opened in binary mode.
    """
    # Binary mode keeps the offsets equal to real byte positions; text mode
    # would translate '\r\n' to '\n' and undercount each such line by a byte.
    with open(output_file, 'rb') as f_out, open(index_file, 'w', encoding='utf-8') as f_index:
        offset = len(f_out.readline())
        last_key = None
        for raw in f_out:
            fields = raw.decode('utf-8').rstrip('\r\n').split(',', 2)
            # Lines without a date field (e.g. blank lines) are not indexed
            # but still advance the offset.
            if len(fields) >= 2:
                current_code = fields[0]
                month = fields[1].replace('-', '')[:6]
                key = (current_code, month)
                if key != last_key:
                    f_index.write(f'{current_code},{month},{offset}\n')
                    last_key = key
            offset += len(raw)


#### Integrate the functions into a single pipeline

In [5]:

def external_sort(input_file, output_file, index_file, chunk_size):
    """Sort a CSV with an external merge sort and build an index for it.

    Runs the three stages in order: split the input into sorted chunk files,
    merge the chunks into ``output_file``, then write ``index_file``. The
    temporary chunk files are deleted afterwards.

    Args:
        input_file: Path of the CSV to sort.
        output_file: Path of the sorted file to write.
        index_file: Path of the index file to write.
        chunk_size: Maximum number of rows per temporary chunk.
    """
    chunks = split_file(input_file, chunk_size)
    try:
        merge_files(chunks, output_file)
        create_index(output_file, index_file)
    finally:
        # Remove the temporary chunks even when merging or indexing fails,
        # so a failed run does not leave them behind.
        for chunk in chunks:
            os.remove(chunk)



In [6]:

# Paths used by the run: source CSV, sorted result, and the index built from it.
input_file, output_file, index_file = 'input.csv', 'output.txt', 'index.txt'
# Number of rows handed to each temporary chunk.
chunk_size = 1000

external_sort(
    input_file=input_file,
    output_file=output_file,
    index_file=index_file,
    chunk_size=chunk_size,
)

print("The Ex-Sort Process Has Done")

The Ex-Sort Process Has Done
