如果数据集过大并且不能一次性加载到内存中，那么在做采样的时候可以考虑使用更有效的方法。一种有效的方式是先进行文件级的采样，然后再进行行级的采样。
按照这种思路，每次只需读取一个文件并从中随机抽取行，因此可以更有效地处理大数据集。需要注意的是，下面的示例代码为了简单起见，仍然会先读取所有文件并合并成一个大的 DataFrame，然后再统一抽样，所以它只适用于合并后的数据能够放入内存的情况。如果每个文件仍然过大并且不能完全加载到内存中，那么你可能需要使用更复杂的抽样方法，例如在每个文件中使用跳跃读取（分块读取）的方式。

In [13]:
import pandas as pd
import os
import random

def sample_rows_from_files(folder_path, output_file, total_rows):
    """
    Sample rows at random from all ``merged_<ID>.txt`` files in a folder.

    Every file in ``folder_path`` whose name starts with ``merged_`` and ends
    with ``.txt`` is read as a tab-separated table. Each row is tagged with an
    ``ID`` taken from the file name (the text between the first underscore and
    the next underscore or dot). The rows of all files are pooled,
    ``total_rows`` of them are drawn without replacement using a fixed seed,
    and the result is written to ``output_file`` as a tab-separated file
    without the index.

    Args:
        folder_path: Directory containing the ``merged_*.txt`` files.
        output_file: Path of the tab-separated file to write.
        total_rows: Number of rows to draw from the pooled rows of all files.

    Returns:
        A DataFrame with the columns ID, POS, REF, ALT, AF and
        nucleotide_martin. Its index holds the row positions inside the
        originating files, so index values may repeat.

    Raises:
        ValueError: If the folder contains no matching file, or if
            ``total_rows`` is larger than the number of pooled rows.
        KeyError: If one of the required columns is absent from every file.
    """
    wanted_columns = ['ID', 'POS', 'REF', 'ALT', 'AF', 'nucleotide_martin']

    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # pooled row order stable, so the fixed random_state below really does
    # select the same rows on every run and on every machine.
    filenames = sorted(
        name
        for name in os.listdir(folder_path)
        if name.startswith("merged_") and name.endswith(".txt")
    )
    if not filenames:
        raise ValueError(
            "No files matching 'merged_*.txt' found in {!r}".format(folder_path)
        )

    frames = []
    for filename in filenames:
        df = pd.read_csv(os.path.join(folder_path, filename), sep="\t")
        sample_id = filename.split("_")[1].split(".")[0]  # ID from the file name

        # Keep only the columns needed for the output so the pooled frame
        # does not hold unused data. Columns missing from this file are
        # simply skipped here; concat fills them with NaN.
        kept = [
            col for col in wanted_columns
            if col != 'ID' and col in df.columns
        ]
        frames.append(df[kept].assign(ID=sample_id))

    # Pool all files and draw the sample (without replacement, fixed seed)
    df_all = pd.concat(frames)
    df_sampled = df_all.sample(n=total_rows, random_state=1)

    # Put the columns in the documented order
    df_sampled = df_sampled[wanted_columns]

    # Save the results to a new tab-separated text file
    df_sampled.to_csv(output_file, sep="\t", index=False)

    return df_sampled


# Input folder to scan, destination file for the sample, and the sample size
folder_path = "/nfs/research/goldman/zihao/Datas/p1_errorsProject_P4/O_Folder_Temp/"
output_file = "sampled_data.txt"
total_rows = 50  # number of rows drawn across all files combined

# Draw the sample; the call also writes it to output_file
sampled_df = sample_rows_from_files(
    folder_path=folder_path,
    output_file=output_file,
    total_rows=total_rows,
)

# Bare expression: as the last line of a notebook cell it displays the DataFrame
sampled_df

Unnamed: 0,ID,POS,REF,ALT,AF,nucleotide_martin
11520,SRR20887848,27376,G,A,0.001755,G
66,SRR21042837,10198,C,T,0.999055,T
970,SRR19914213,18574,G,T,0.001382,G
107,SRR21802618,28271,A,T,0.916191,T
1521,SRR20860028,10752,T,A,0.016053,T
5066,SRR20887848,12189,T,A,0.001213,T
12592,SRR20887848,29597,T,A,0.004422,T
152,SRR20758642,2192,A,T,0.001093,A
9713,SRR20887848,22323,C,A,0.001785,C
4772,SRR20887848,11675,T,A,0.006673,T
