```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Annot/Treat_all_pos_errorChecking_error.txt \
'python3 /nfs/research/goldman/zihao/errorsProject_1/Annot/Annot_Treat_all_pos.py'
```

## Final Version

In [None]:
import csv
import os
import shutil
import glob
import random
import collections

def check_files_with_id(folder_path, checkid_file, output_folder):
    """
    Copy the files in ``folder_path`` whose names contain a wanted ID into ``output_folder``.

    The wanted IDs are read from ``checkid_file``: every line that starts with
    '>' contributes the rest of that line (whitespace-stripped) as one ID.
    All other lines are ignored. A file is copied when at least one ID occurs
    as a substring of its file name.

    ``output_folder`` is created if it does not exist yet. Only regular files
    are copied; sub-directories of ``folder_path`` are skipped.
    """
    id_set = set()

    with open(checkid_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                sample_id = line[1:].strip()
                # An empty ID is a substring of every file name and would
                # silently select the whole folder, so drop it.
                if sample_id:
                    id_set.add(sample_id)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        source_path = os.path.join(folder_path, filename)
        # shutil.copy cannot copy directories; only plain files are wanted.
        if not os.path.isfile(source_path):
            continue
        if any(id_str in filename for id_str in id_set):
            shutil.copy(source_path, os.path.join(output_folder, filename))
            

def process_files(input_folder, output_folder):
    """
    Merge every '.txt' file of ``input_folder`` into one table (in preparation for later sampling).

    The result is written to ``<output_folder>/Annot_RATIO.txt`` as a
    tab-separated file with the columns ID, Position, AF_Ratio, SB_Ratio.
    For each input file:

    * ID is the part of the file name before the first underscore,
    * the first line is treated as a header and skipped,
    * for every remaining non-blank line, tab-separated columns 0, 3 and 4
      are written as Position, AF_Ratio and SB_Ratio.

    Files are processed in sorted name order so the merged row order is
    reproducible. A non-blank line with fewer than five columns raises
    IndexError.
    """
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, "Annot_RATIO.txt")
    output_abspath = os.path.abspath(output_path)

    # os.listdir returns names in arbitrary order; sort for reproducibility.
    file_names = sorted(os.listdir(input_folder))

    # newline="" is what the csv module asks for on files it writes to, so
    # that its own line terminator is not translated a second time.
    with open(output_path, "wt", newline="") as output_file:
        writer = csv.writer(output_file, delimiter='\t')
        writer.writerow(["ID", "Position", "AF_Ratio", "SB_Ratio"])

        for file_name in file_names:
            if not file_name.endswith(".txt"):
                continue
            input_path = os.path.join(input_folder, file_name)
            # Never read the file that is currently being written, which can
            # happen when input and output folder are the same.
            if os.path.abspath(input_path) == output_abspath:
                continue

            # Extract the file ID from the file name
            file_id = file_name.split("_")[0]
            with open(input_path, "r") as f:
                next(f, None)  # skip the header; tolerate an empty file
                # Stream the file line by line instead of loading it whole.
                for line in f:
                    line = line.strip()
                    if not line:
                        continue  # blank line, nothing to extract
                    columns = line.split("\t")
                    writer.writerow([file_id, columns[0], columns[3], columns[4]])

In [None]:
# Folder that is scanned for annotation files
# (presumably the decompressed per-sample files, judging by the path — confirm).
folder_path = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Decompress/'
# File providing the IDs to keep: check_files_with_id() uses its '>'-prefixed lines.
checkid_file = "/nfs/research/goldman/zihao/errorsProject_1/MAPLE/TEST_50000/MAPLE0.3.2_rateVar_errors_realData_checkingErrors_50000_estimatedErrors.txt"
# Intermediate folder: receives the copies of the files whose names match an ID.
middle_output_folder = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/PLOT_FOR_Annot/'

# Folder in which process_files() writes the merged Annot_RATIO.txt table.
output_folder = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Step 1: copy the matching files into the intermediate folder.
# Step 2: merge the copied files into a single table in the output folder.
check_files_with_id(folder_path, checkid_file, middle_output_folder)
process_files(middle_output_folder, output_folder)

## Sampling
```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Annot/RS_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Annot/RS_For_Annot.py'
```

In [2]:
import random
import collections

def experiment_with_data(data_file, output_file, max_rows=299030, sample_size=5, seed=None):
    """
    Draw a uniform random sample of rows for every position of a merged table.

    ``data_file`` is a tab-separated file with a header line followed by rows
    of ``ID, Position, AF_Ratio, SB_Ratio``. For each distinct Position at
    most ``sample_size`` rows are kept, chosen by reservoir sampling so that
    every row of a position has the same chance of being selected, no matter
    where it appears in the file. The selection is written to ``output_file``
    in the same four-column layout.

    Parameters:
        data_file: path of the input table.
        output_file: path of the table to write.
        max_rows: number of data rows (header excluded) to read before
            stopping; ``None`` reads the whole file.
        sample_size: maximum number of rows kept per position.
        seed: optional seed for a private random generator, to make the
            selection reproducible. With ``None`` the module-level ``random``
            state is used.
    """
    rng = random.Random(seed) if seed is not None else random

    # position -> rows currently held in that position's reservoir
    selected_data = collections.defaultdict(list)
    # position -> number of rows seen so far for that position
    seen_per_position = collections.defaultdict(int)

    with open(data_file, 'r') as file:
        next(file, None)  # skip the header; tolerate an empty file

        for row_index, line in enumerate(file):
            if max_rows is not None and row_index >= max_rows:
                break  # row limit reached, stop reading

            line = line.strip()
            if not line:
                continue  # ignore blank lines

            id_, position, af_ratio, sb_ratio = line.split('\t')
            position = int(position)
            record = (id_, float(af_ratio), float(sb_ratio))

            seen_per_position[position] += 1
            reservoir = selected_data[position]

            if len(reservoir) < sample_size:
                reservoir.append(record)
            else:
                # Reservoir sampling: the k-th row of a position replaces a
                # stored row with probability sample_size / k. The slot must
                # be drawn from the number of rows *seen*, not from the
                # reservoir length; otherwise every new row always
                # overwrites an old one and the sample favours late rows.
                slot = rng.randrange(seen_per_position[position])
                if slot < sample_size:
                    reservoir[slot] = record

    with open(output_file, 'w') as file:
        file.write("ID\tPosition\tAF_Ratio\tSB_Ratio\n")
        for position, reservoir in selected_data.items():
            for id_, af_ratio, sb_ratio in reservoir:
                file.write(f"{id_}\t{position}\t{af_ratio}\t{sb_ratio}\n")

    print("选择的数据已保存到文件：", output_file)



In [5]:
# Usage
# Input: the merged Annot_RATIO.txt table (the file process_files() writes).
data_file = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Annot_RATIO.txt'
# Output: relative path, so the file is created in the current working directory.
output_file = 'selected_data_1.txt'
experiment_with_data(data_file, output_file)


选择的数据已保存到文件： selected_data_1.txt


***
***
```bash
bsub -M 200000 -e /nfs/research/goldman/zihao/errorsProject_1/Annot/Annot_sampling_all_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Annot/Annot_sampling_all_pos.py'
```

In [5]:
import pandas as pd
import numpy as np

def merge_files(data_file, input_file, output_file, chunk_size=10000, max_rows=299030):
    """
    Add the AF and SB values of ``input_file`` to the rows of ``data_file``.

    Both files are tab-separated with a header line. ``input_file`` supplies,
    per line, the columns ID, Position, AF and SB (columns 0-3). ``data_file``
    must contain the columns 'ID' and 'Position'. Every row of ``data_file``
    gets two extra columns, 'AF' and 'SB', holding the values of the
    ``input_file`` line with the same ID and Position, or the string 'None'
    when there is no such line. The result is written to ``output_file``.

    Parameters:
        data_file: table to extend; read in chunks to bound memory use.
        input_file: table providing the AF/SB values.
        output_file: path of the merged table (overwritten if it exists).
        chunk_size: number of ``data_file`` rows processed at a time.
        max_rows: number of ``input_file`` data rows (header excluded) to
            load; ``None`` loads the whole file.
    """
    # (ID, Position) as strings -> (AF, SB) as strings
    af_sb_dict = {}

    with open(input_file, 'r') as file:
        next(file, None)  # skip the title line; tolerate an empty file
        for i, line in enumerate(file):
            if max_rows is not None and i >= max_rows:
                break
            fields = line.strip().split('\t')
            if len(fields) < 4:
                continue  # blank or truncated line: no AF/SB pair to store
            af_sb_dict[(fields[0], fields[1])] = (fields[2], fields[3])

    unmatched = ('None', 'None')  # placeholder written for rows without a match

    def process_chunk(chunk):
        # Iterate the two key columns directly instead of using iterrows():
        # iterrows() converts each row to a single dtype, which turns integer
        # IDs/positions into floats ("7.0") when the other columns are
        # numeric, so their keys would never match. Building the new columns
        # as lists also avoids writing strings into a float (NaN) column.
        pairs = [
            af_sb_dict.get((str(row_id), str(position)), unmatched)
            for row_id, position in zip(chunk['ID'], chunk['Position'])
        ]
        chunk['AF'] = [af for af, _ in pairs]
        chunk['SB'] = [sb for _, sb in pairs]
        return chunk

    # The first chunk creates the file and writes the header, later chunks
    # are appended without one.
    reader = pd.read_csv(data_file, sep='\t', chunksize=chunk_size)
    for chunk_number, chunk in enumerate(reader):
        is_first = chunk_number == 0
        process_chunk(chunk).to_csv(
            output_file,
            sep='\t',
            index=False,
            mode='w' if is_first else 'a',
            header=is_first,
        )


In [6]:
# Define the paths of your files
data_file = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/selected_data.txt'
input_file = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Annot_RATIO.txt'
output_file = 'test.txt'  # Replace with the actual path of the output file

# Call the function
merge_files(data_file, input_file, output_file)


这段代码是一个用于处理文件的循环。它使用了Pandas库来逐块读取一个输入文件（data_file），并将每个处理过的块写入一个输出文件（output_file）。

代码中的循环使用了pd.read_csv()函数来逐块读取输入文件。参数sep='\t'指定了文件的分隔符为制表符。chunksize参数确定了每个块的大小。

在循环中，每个块被传递给process_chunk()函数进行处理，并返回一个经过处理的块（processed_chunk）。

代码中的逻辑判断用于确定是将处理过的块写入输出文件作为新文件（first_chunk = True），还是将其追加到已存在的文件中（first_chunk = False）。

如果是第一个块（first_chunk = True），则调用processed_chunk.to_csv()将处理过的块写入输出文件。index=False参数指定不将行索引写入文件。

如果不是第一个块（first_chunk = False），则调用processed_chunk.to_csv()将处理过的块追加到输出文件中。mode='a'参数指定以追加模式打开文件，header=False参数指定不将列名写入文件。

这段代码的目的是逐块处理输入文件的内容并将处理结果写入输出文件，以便有效地处理大型数据集而不会消耗太多内存。