## 1_Extract_Sequence

```bash
bsub -q long -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Extract_Sequence_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1_Extract_Sequence.py'
```

```bash
sh /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/bash_extract_sequence.sh
```

```python
program path: /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1_Extract_Sequence_new.py
```

### For test
```python
import os
import gzip

base_path = "/nfs/research/zi/mhunt/Viridian_wf_paper/Vdn_all_ena/Reads/"

folders = ['D', 'S', 'E']

output_files = ["output_D.txt", "output_S.txt", "output_E.txt"]

def process_folder(folder_path, output_file):
    """Append every ``consensus.fa.gz`` found below ``folder_path`` to ``output_file``.

    The directory tree is walked recursively. A directory named
    ``vdn.v1.0.0`` is never descended into; instead, if it contains
    ``consensus.fa.gz``, that file is decompressed and its text appended to
    ``output_file``.

    Args:
        folder_path: Directory to scan.
        output_file: Path of the text file that receives the appended content.
    """
    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        # Plain files are ignored; only directories matter here
        if not os.path.isdir(entry_path):
            continue

        if entry != "vdn.v1.0.0":
            # Any other directory: keep descending
            process_folder(entry_path, output_file)
            continue

        consensus_path = os.path.join(entry_path, "consensus.fa.gz")
        if not os.path.exists(consensus_path):
            continue

        # Decompress as text and append to the combined output
        with gzip.open(consensus_path, 'rt') as compressed:
            text = compressed.read()
            with open(output_file, "a") as sink:
                sink.write(text)

# Build one combined output file per top-level folder.
for folder, output_file in zip(folders, output_files):
    # Truncate any previous output and close the handle immediately:
    # process_folder() re-opens the file in append mode for every record,
    # so keeping an unused write handle open during the scan serves no purpose.
    with open(output_file, "w"):
        pass
    process_folder(os.path.join(base_path, folder), output_file)
```

##### old version
```python
import os
import gzip

base_path = "/nfs/research/zi/mhunt/Viridian_wf_paper/Vdn_all_ena/Reads/"

folders = ['D', 'S', 'E']

output_files = ["/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_D.txt", 
"/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S.txt", 
"/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E.txt"]

def process_folder(folder_path, output_file):
    """Recursively append gzipped consensus files below ``folder_path`` to ``output_file``.

    Directories named ``vdn.v1.0.0`` are not descended into; if such a
    directory contains ``consensus.fa.gz`` the decompressed text is appended
    to ``output_file``. Directories that cannot be listed because of missing
    permissions are reported on stdout and skipped, and any error while
    handling a single consensus file is printed and skipped as well.

    Args:
        folder_path: Directory to scan.
        output_file: Path of the text file that receives the appended content.
    """
    try:
        # Keep only the entries that are directories
        entries = os.listdir(folder_path)
        subdirs = [name for name in entries if os.path.isdir(os.path.join(folder_path, name))]
    except PermissionError:
        print(f"Permission denied when trying to access the directory: {folder_path}")
        return  # nothing below an inaccessible directory can be read

    for name in subdirs:
        current_path = os.path.join(folder_path, name)

        if name != "vdn.v1.0.0":
            # Not a result directory: keep descending
            process_folder(current_path, output_file)
            continue

        file_path = os.path.join(current_path, "consensus.fa.gz")
        try:
            if not os.path.exists(file_path):
                continue
            # Decompress as text and append to the combined output
            with gzip.open(file_path, 'rt') as f_in:
                content = f_in.read()
                with open(output_file, "a") as f_out:
                    f_out.write(content)
        except Exception as e:
            print(f"Error with file {file_path}: {e}")

# Build one combined output file per top-level folder.
for folder, output_file in zip(folders, output_files):
    # Truncate any previous output and close the handle immediately:
    # process_folder() re-opens the file in append mode for every record,
    # so keeping an unused write handle open during the scan serves no purpose.
    with open(output_file, "w"):
        pass
    process_folder(os.path.join(base_path, folder), output_file)
```

In [None]:
import os
import gzip
import argparse

base_path = "/nfs/research/zi/mhunt/Viridian_wf_paper/Vdn_all_ena/Reads/"

output_files = {
    'D': "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_D_new.txt",
    'S': "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S_new.txt",
    'E': "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E_new.txt"
}

def process_folder(folder_path, output_file):
    """Recursively append gzipped consensus files below ``folder_path`` to ``output_file``.

    Directories named ``vdn.v1.0.0`` are not descended into; if such a
    directory contains ``consensus.fa.gz`` the decompressed text is appended
    to ``output_file``. A missing trailing newline is added so that the next
    record's ``>`` header always starts on its own line in the combined file.

    Directories that cannot be listed because of missing permissions are
    reported on stdout and skipped; any error while handling a single
    consensus file is printed and that file is skipped.

    Args:
        folder_path: Directory to scan.
        output_file: Path of the text file that receives the appended content.
    """
    try:
        subfolders = [name for name in os.listdir(folder_path)
                      if os.path.isdir(os.path.join(folder_path, name))]
    except PermissionError:
        print(f"Permission denied when trying to access the directory: {folder_path}")
        return

    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)

        if subfolder != "vdn.v1.0.0":
            # Not a result directory: keep descending
            process_folder(subfolder_path, output_file)
            continue

        file_path = os.path.join(subfolder_path, "consensus.fa.gz")
        try:
            if not os.path.exists(file_path):
                continue
            with gzip.open(file_path, 'rt') as f_in:
                content = f_in.read()
            # Without this, a file lacking a final newline would be glued to
            # the header of the next appended record.
            if content and not content.endswith("\n"):
                content += "\n"
            with open(output_file, "a") as f_out:
                f_out.write(content)
        except Exception as e:
            # Best effort: report the problem and carry on with the other files
            print(f"Error with file {file_path}: {e}")

def main(folder):
    """Rebuild the combined output file for one top-level folder.

    Args:
        folder: Key into ``output_files`` and name of the sub-directory of
            ``base_path`` to scan.
    """
    output_file = output_files[folder]
    # Truncate any previous output and close the handle right away;
    # process_folder() appends through its own handles.
    with open(output_file, "w"):
        pass
    process_folder(os.path.join(base_path, folder), output_file)

if __name__ == "__main__":
    # Command-line entry point: the single positional argument selects the folder
    cli = argparse.ArgumentParser(description="Process some files.")
    cli.add_argument('folder', choices=['D', 'S', 'E'], help="The folder to process")
    main(cli.parse_args().folder)


在递归扫描文件夹时依次进入子文件夹，并在进入 vdn.v1.0.0 文件夹后读取其中的 consensus.fa.gz 文件并写入文本

在这个更新后的示例代码中，我们使用了递归函数 process_folder，对于每个文件夹，我们首先列出其中的子文件夹，并在循环中对子文件夹进行处理。

在处理每个子文件夹时，如果子文件夹的名称是 vdn.v1.0.0，我们构建 consensus.fa.gz 文件的路径并检查其是否存在。如果文件存在，我们使用 gzip.open 打开文件，并以文本模式 'rt' 进行读取。然后将文件内容写入输出文件中。

如果子文件夹的名称不是 vdn.v1.0.0，我们递归调用 process_folder 函数，传递子文件夹路径和输出文件作为参数，以处理子文件夹中的子文件夹。

======================================





在这个修改后的代码中，我添加了一个名为 output_files 的列表，其中包含三个输出文件的路径，分别对应于D、S和E文件夹的结果。

在主程序的主循环中，我们使用 enumerate 函数同时迭代 folders 列表中的文件夹名称，并获取对应的输出文件路径。然后，我们使用 with open 语句打开输出文件，并将文件路径作为参数传递给 process_folder 函数。

这样，process_folder 函数将会将相应文件夹的结果写入正确的输出文件中。

## 1.2_Filter (Screen_of_MAPLE-treated_samples)

In [11]:
file_path = "/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/MAPLE0.3.2_rateVar_errors_realData_checkingErrors_new_all_estimatedErrors.txt"

# Collect every line that starts with ">" (whitespace-stripped) into a set,
# so later cells can test membership quickly.
data_set = set()
with open(file_path, 'r') as file:
    data_set.update(line.strip() for line in file if line.startswith(">"))

In [12]:
# Number of distinct ">" lines collected, followed by the set itself (notebook display)
print(len(data_set))
data_set

171135


{'>ERR6464283',
 '>SRR19472683',
 '>SRR20947009',
 '>SRR22117371',
 '>ERR6498472',
 '>SRR19928928',
 '>ERR4905463',
 '>ERR7802531',
 '>SRR23069095',
 '>ERR10126344',
 '>SRR21139876',
 '>ERR6773250',
 '>ERR7846864',
 '>SRR21212594',
 '>ERR10025113',
 '>SRR20915318',
 '>ERR6186598',
 '>SRR20990400',
 '>SRR21296892',
 '>SRR22238873',
 '>ERR6316370',
 '>ERR7878148',
 '>ERR6766690',
 '>SRR20027508',
 '>ERR4461214',
 '>ERR6681556',
 '>ERR6484337',
 '>ERR10037170',
 '>SRR21737555',
 '>SRR21611488',
 '>ERR10024545',
 '>ERR7378491',
 '>ERR6529774',
 '>ERR6085993',
 '>ERR6770308',
 '>ERR6648137',
 '>ERR6466958',
 '>SRR22237311',
 '>ERR5020864',
 '>ERR6076023',
 '>ERR7698293',
 '>ERR4890765',
 '>SRR22565327',
 '>SRR22239934',
 '>ERR6485612',
 '>SRR21791760',
 '>SRR21703867',
 '>ERR7852651',
 '>SRR23593708',
 '>SRR21794198',
 '>SRR21153525',
 '>ERR8183225',
 '>ERR4893343',
 '>SRR20872827',
 '>SRR21907013',
 '>ERR6693986',
 '>SRR20049357',
 '>ERR6535175',
 '>SRR23057437',
 '>ERR7898357',
 '>SRR1948

In [6]:
def check_startswith_s(my_set):
    """Return True if at least one item of ``my_set`` starts with ">E", else False.

    NOTE(review): despite the ``_s`` in the function name, the prefix that is
    actually tested is ">E".
    """
    return any(item.startswith(">E") for item in my_set)

# Ask whether the collected header set contains any entry starting with ">E"
result = check_startswith_s(data_set)
print(result)

True


In [11]:
# Spot check: prints "111" when this header is in the set, "2222" otherwise
print("111" if ">SRR20593827" in data_set else "2222")

111


```bash
bsub -M 20000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1.2_Filter_For_D_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1.2_Filter_For_D.py'

bsub -M 20000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1.2_Filter_For_S_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1.2_Filter_For_S.py'

bsub -M 20000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1.2_Filter_For_E_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/1.2_Filter_For_E.py'
```

In [None]:
import os

def _write_record(name, content_lines, output_dir, wanted_names):
    """Write one FASTA record to ``<output_dir>/<key>.txt`` if its key is wanted."""
    if name is None:
        return
    # The lookup key is the header up to the first "." (it still carries the ">")
    key = name.split(".")[0]
    if key not in wanted_names:
        return
    # File name is the key without the leading ">"
    output_path = os.path.join(output_dir, key[1:] + ".txt")
    with open(output_path, "w") as out_file:
        out_file.write(name + "\n" + "".join(content_lines))


def process_file(input_filename, output_dir, wanted_names=None):
    """Split a multi-record FASTA file into one file per selected record.

    A record is selected when its header, cut at the first ".", is contained
    in ``wanted_names``. Each selected record is written to
    ``<output_dir>/<header without ">" and without the part after the first ".">.txt``
    with the original header line followed by the original sequence lines.

    Args:
        input_filename: Path of the FASTA file to split.
        output_dir: Existing directory that receives the per-record files.
        wanted_names: Collection of header keys (including the leading ">")
            to keep. Defaults to the module-level ``data_set`` so existing
            calls keep working.
    """
    if wanted_names is None:
        wanted_names = data_set

    with open(input_filename, "r") as file:
        current_name = None
        current_content = []
        for line in file:
            if line.startswith(">"):
                # A new header closes the previous record
                _write_record(current_name, current_content, output_dir, wanted_names)
                current_name = line.strip()
                current_content = []
            else:
                # Sequence line belonging to the current record
                current_content.append(line)

        # The last record has no following header to trigger the write
        _write_record(current_name, current_content, output_dir, wanted_names)

# Run the split on the input file and write the per-record files to the given output folder
input_file = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S.txt"
output_folder = "TEST/"
process_file(input_file, output_folder)

In [9]:
import os

# Source folder
folder_path = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E/'

# Destination file
output_file = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E/all_files_combined.fasta'

# Every .txt file in the source folder
txt_files = []
for name in os.listdir(folder_path):
    if name.endswith('.txt'):
        txt_files.append(name)

# Concatenate the files, terminating each one with an extra newline
with open(output_file, 'w') as output:
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, 'r') as f:
            output.write(f.read())
            output.write('\n')

print("所有txt文件已成功合并并写入到", output_file)

所有txt文件已成功合并并写入到 /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E/all_files_combined.fasta


In [10]:
import os

# Source folder
folder_path = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S/'

# Destination file
output_file = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S/all_files_combined.fasta'

# Every .txt file in the source folder
txt_files = list(filter(lambda name: name.endswith('.txt'), os.listdir(folder_path)))

# Concatenate the files, terminating each one with an extra newline
with open(output_file, 'w') as output:
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, 'r') as f:
            content = f.read()
        output.write(content)
        output.write('\n')

print("所有txt文件已成功合并并写入到", output_file)


所有txt文件已成功合并并写入到 /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S/all_files_combined.fasta


## 1.2_Sequence_alignment

#### 1. split

```bash
bsub sh /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Aligned.sh
```

#### 1. Origin
```bash
sh bash_MAPLE_part.sh
```
```python
save path: /nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May
```

#### 2. Martin

```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2_Martin_part_E_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.Martin_for_E.py'
```

```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2_Martin_part_S_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.Martin_for_S.py'
```

```python
save path: /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May
```

In [None]:
from Bio import SeqIO
import os

def process_sequences(input_file, output_folder):
    """Write every record of a FASTA file to its own file via ``save_sequence``.

    Every occurrence of ".masked" is removed from the record id before it is
    used as the sequence name.

    Args:
        input_file: Path of the FASTA file to read.
        output_folder: Directory passed on to ``save_sequence``.
    """
    for record in SeqIO.parse(input_file, "fasta"):
        cleaned_name = record.id.replace(".masked", "")
        save_sequence(cleaned_name, str(record.seq), output_folder)


def save_sequence(sequence_name, sequence_data, output_folder):
    """Write one sequence to ``<output_folder>/<sequence_name>.txt``.

    The file holds a ">" header line with the name followed by the sequence
    on a single line; no newline is written after the sequence.
    """
    target_path = os.path.join(output_folder, "{}.txt".format(sequence_name))

    with open(target_path, 'w') as handle:
        handle.writelines([">{}\n".format(sequence_name), sequence_data])


# Split output_E_aligned.fasta into one file per record inside TEST/
input_file = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E_aligned.fasta'
output_folder = 'TEST/'
process_sequences(input_file, output_folder)

## 2.2_Combine into one DF

```python
output path: /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/2_combination
```

### Test for single file
```python
import os
import pandas as pd
import numpy as np

folder_path = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May/"
folder_path_2 = "/nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May/"

# All file names in the first folder; this test only uses the first one
file_names = os.listdir(folder_path)
target_name = file_names[0]


def _load_bases(path, column):
    """Read a single-record FASTA file into a DataFrame of (position, <column>)."""
    with open(path, 'r') as handle:
        body_lines = handle.readlines()[1:]  # drop the FASTA header line
    bases = ''.join(body_lines).replace('\n', '')
    return pd.DataFrame([{'position': pos, column: base} for pos, base in enumerate(bases)])


# Same file name is looked up in both folders
df1 = _load_bases(os.path.join(folder_path, target_name), 'nucleotide_martin')
df2 = _load_bases(os.path.join(folder_path_2, target_name), 'nucleotide_origin')

# Align the two sequences on their 0-based position
merged_df = pd.merge(df1, df2, on='position')

### Decision columns

martin = merged_df['nucleotide_martin']
origin = merged_df['nucleotide_origin']

# 1 where both versions carry the same character
merged_df['label'] = np.where(martin == origin, 1, 0)
# 1 where both versions are '-' or both versions are 'n'
merged_df['label2'] = np.where(((martin == '-') & (origin == '-')) |
                               ((martin == 'n') & (origin == 'n')), 1, 0)
# 1 where the martin version is '-' or 'n'
merged_df['label_mar'] = np.where((martin == '-') | (martin == 'n'), 1, 0)
# 1 where the origin version is '-' or 'n'
merged_df['label_ori'] = np.where((origin == '-') | (origin == 'n'), 1, 0)

merged_df.to_csv(target_name, sep='\t', index=False)
```

### Unencapsulated
```python
import os
import pandas as pd
import numpy as np

folder_path = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May/"
folder_path_2 = "/nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May/"

# All file names in the first folder
file_names = os.listdir(folder_path)

# (column name, folder) pairs, read in this order for every file name
sources = (('nucleotide_martin', folder_path), ('nucleotide_origin', folder_path_2))

for current_name in file_names:
    frames = {}
    for column, source_dir in sources:
        with open(os.path.join(source_dir, current_name), 'r') as file:
            lines = file.readlines()
        # Drop the FASTA header line and join the remaining lines
        sequence = ''.join(lines[1:]).replace('\n', '')
        frames[column] = pd.DataFrame(
            [{'position': i, column: base} for i, base in enumerate(sequence)])

    # Align the two sequences on their 0-based position
    merged_df = pd.merge(frames['nucleotide_martin'], frames['nucleotide_origin'], on='position')

    ### Decision columns

    martin = merged_df['nucleotide_martin']
    origin = merged_df['nucleotide_origin']

    # 1 where both versions carry the same character
    merged_df['label_same'] = np.where(martin == origin, 1, 0)
    # 1 where both versions are '-' or both versions are 'n'
    merged_df['label_marked'] = np.where(((martin == '-') & (origin == '-')) |
                                         ((martin == 'n') & (origin == 'n')), 1, 0)
    # 1 where the martin version is '-' or 'n'
    merged_df['label_mar'] = np.where((martin == '-') | (martin == 'n'), 1, 0)
    # 1 where the origin version is '-' or 'n'
    merged_df['label_ori'] = np.where((origin == '-') | (origin == 'n'), 1, 0)

    merged_df.to_csv(current_name, sep='\t', index=False)
```

In [15]:
# Count the rows whose 'label' is 1; .get(1, 0) returns 0 instead of raising
# KeyError when no row carries that value.
count_label_1 = merged_df['label'].value_counts().get(1, 0)
print("label equal to 1:", count_label_1)

label列值为1的个数: 25005


### Final version!!!
```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.2_combination_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.2_combination.py'
```

In [None]:
import os
import pandas as pd
import numpy as np

def read_sequence(file_path):
    """Read a single-record FASTA file into a per-base DataFrame.

    The first line of the file is treated as the header and discarded; all
    remaining lines are joined into one sequence.

    Args:
        file_path: Path of the FASTA file.

    Returns:
        A DataFrame with the columns ``position`` (0-based index of the base)
        and ``nucleotide`` (the character at that index). Both columns are
        present even when the file holds no sequence characters.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    sequence = ''.join(lines[1:]).replace('\n', '')

    # Column-wise construction keeps both columns for an empty sequence, so a
    # later merge on 'position' does not fail on a header-only file.
    return pd.DataFrame({
        'position': range(len(sequence)),
        'nucleotide': list(sequence),
    })


def main(output_folder):
    """Build one per-position comparison table for every file present in both input folders.

    For each file name in the first folder that also exists in the second
    folder, both sequences are read with ``read_sequence``, merged on
    ``position`` and written as a tab-separated file of the same name into
    ``output_folder``. Files whose output already exists are left untouched.

    Output columns: ``position``, ``nucleotide_martin``, ``nucleotide_origin``,
    ``label_same`` (1 if both characters are equal), ``label_marked`` (1 if
    both are '-' or 'n'), ``label_mar`` / ``label_ori`` (1 if the respective
    version is '-' or 'n').

    Args:
        output_folder: Directory for the result tables; created if missing.
    """
    folder_path_1 = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May/"
    folder_path_2 = "/nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May/"
    marked = ['-', 'n']

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(folder_path_1):
        file_path_2 = os.path.join(folder_path_2, file_name)
        output_path = os.path.join(output_folder, file_name)

        # Decide whether to skip BEFORE reading or merging anything: either the
        # counterpart is missing or the result was already produced.
        if not os.path.exists(file_path_2) or os.path.exists(output_path):
            continue

        df1 = read_sequence(os.path.join(folder_path_1, file_name)).rename(
            columns={'nucleotide': 'nucleotide_martin'})
        df2 = read_sequence(file_path_2).rename(
            columns={'nucleotide': 'nucleotide_origin'})

        # Inner join: only positions present in both sequences are kept
        merged_df = pd.merge(df1, df2, on='position')

        martin_marked = merged_df['nucleotide_martin'].isin(marked)
        origin_marked = merged_df['nucleotide_origin'].isin(marked)

        # Decision columns
        merged_df['label_same'] = np.where(
            merged_df['nucleotide_martin'] == merged_df['nucleotide_origin'], 1, 0)
        merged_df['label_marked'] = np.where(martin_marked & origin_marked, 1, 0)
        merged_df['label_mar'] = np.where(martin_marked, 1, 0)
        merged_df['label_ori'] = np.where(origin_marked, 1, 0)

        merged_df.to_csv(output_path, sep='\t', index=False)

if __name__ == "__main__":
    # Write the per-file comparison tables into the 2_combination folder
    main("/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/2_combination")

## 2.3_calculate percentage

### Part1_for_all_pos

### Test for single file
```python
import os
import pandas as pd
import numpy as np

folder_path = "TEST/"

# All file names in the folder; only the first file is inspected here
file_names = os.listdir(folder_path)
first_file_path = os.path.join(folder_path, file_names[0])

df = pd.read_csv(first_file_path, sep='\t')

# .get(1, 0) returns 0 instead of raising KeyError when a column has no row equal to 1
count_label_1 = df['label'].value_counts().get(1, 0)
print("label equal to 1:", count_label_1)
count_label_2 = df['label2'].value_counts().get(1, 0)
print("label equal to 1:", count_label_2)
count_label_3 = df['label_mar'].value_counts().get(1, 0)
print("label equal to 1:", count_label_3)
count_label_4 = df['label_ori'].value_counts().get(1, 0)
print("label equal to 1:", count_label_4)
```

### Test for all files in a folder
```python
import os
import pandas as pd

folder_path = "TEST/"

# All file names in the folder
file_names = os.listdir(folder_path)

# Running count of rows equal to 1, per label column, over all files
label_columns = ('label', 'label2', 'label_mar', 'label_ori')
totals = dict.fromkeys(label_columns, 0)

for file_name in file_names:
    df = pd.read_csv(os.path.join(folder_path, file_name), sep='\t')
    for column in label_columns:
        totals[column] += df[column].value_counts().get(1, 0)

# Percentages are taken relative to 29903 rows per file
# NOTE(review): 'label_mar' / 'label_ori' count every row where that version is
# marked, so the "Only ..." wording below may overstate exclusivity — confirm.
denominator = 29903 * len(file_names)
print("Both versions have the same nucleotide type:", totals['label'] / denominator * 100, ' %')
print("They are all marked:", totals['label2'] / denominator * 100, ' %')
print("Only the martin version is marked:", totals['label_mar'] / denominator * 100, ' %')
print("Only the original version is marked:", totals['label_ori'] / denominator * 100, ' %')
```

```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.3_Calculate_percentage_all_pos_errorChecking_error.txt -o /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/2.3_calculate_percentage/result_for_all_pos.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.3_Calculate_percentage_all_pos.py'
```

In [54]:
import os
import pandas as pd

folder_path = "TEST/"
batch_size = 1000  # number of files handled per batch

# All file names in the folder
file_names = os.listdir(folder_path)

# Running count of rows equal to 1, per label column, over all files
label_columns = ['label', 'label2', 'label_mar', 'label_ori']
running_totals = {column: 0 for column in label_columns}

# Walk through the file list one batch at a time
for start_idx in range(0, len(file_names), batch_size):
    for file_name in file_names[start_idx:start_idx + batch_size]:
        df = pd.read_csv(os.path.join(folder_path, file_name), sep='\t')
        for column in label_columns:
            running_totals[column] += df[column].value_counts().get(1, 0)

# Percentages are taken relative to 29903 rows per file
total_files = len(file_names)
total_records = 29903 * total_files
percentage_1 = round(running_totals['label'] / total_records * 100, 3)
percentage_2 = round(running_totals['label2'] / total_records * 100, 3)
percentage_3 = round(running_totals['label_mar'] / total_records * 100, 3)
percentage_4 = round(running_totals['label_ori'] / total_records * 100, 3)

print("Both versions have the same nucleotide type:", percentage_1, '%')
print("Both versions are marked:", percentage_2, '%')
print("Only the martin version is marked:", percentage_3, '%')
print("Only the original version is marked:", percentage_4, '%')

Both versions have the same nucleotide type: 95.106 %
Both versions are marked: 0.279 %
Only the martin version is marked: 0.903 %
Only the original version is marked: 4.824 %


### Part2_for_err_pos

### Test for single file
```python
import os
import pandas as pd

# Read the 'A' table (one row per ID / Position pair)
a_file_path = '/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/output_modified.txt'  # replace with the path of the 'A' data
a_data = pd.read_csv(a_file_path, delimiter='\t')

# Folder holding the per-ID 'B' tables, and the columns copied from them
b_folder_path = "TEST/"  # replace with the path of the 'B' folder
b_column_names = ['position', 'nucleotide_martin', 'nucleotide_origin', 'label', 'label2', 'label_mar', 'label_ori']

# Output file path
output_file_path = 'output_data.txt'  # replace with the path of the output file

# Walk through every row of the 'A' data
for index, row in a_data.iterrows():
    # ID and Position of the current row
    id_value = row['ID']
    position_value = row['Position']
    
    # Path of the 'B' file that belongs to this ID
    b_file_path = os.path.join(b_folder_path, f'{id_value}.txt')
    
    # Skip rows whose 'B' file does not exist
    if not os.path.isfile(b_file_path):
        # print(f'File {b_file_path} not found.')
        continue
    
    # Load the 'B' table (re-read for every row, even when the ID repeats)
    b_data = pd.read_csv(b_file_path, delimiter='\t')
    
    # Select the 'B' row whose 'position' equals this row's 'Position'
    # NOTE(review): presumably both columns use the same indexing base — confirm
    # against how the 'B' tables and the 'A' file number their positions.
    matched_row = b_data[b_data['position'] == position_value]
    
    # Report and skip when no row matches
    if matched_row.empty:
        print(f'No matching row found in {b_file_path} for ID {id_value} and position {position_value}.')
        continue
    
    # Columns to copy into the 'A' data
    columns_to_add = matched_row[b_column_names]
    
    # Copy the first matching row's values into the current 'A' row
    a_data.loc[index, b_column_names] = columns_to_add.values.tolist()[0]

# Keep only the columns of interest and save the result
a_data = a_data[['ID', 'Position', 'nucleotide_martin', 'nucleotide_origin', 
                 'label', 'label2', 'label_mar', 'label_ori']]
a_data.to_csv(output_file_path, sep='\t', index=False)

# Completion message
print(f"数据已保存到 {output_file_path}")
```

```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.3_Calculate_percentage_err_pos_errorChecking_error.txt -o /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/2.3_calculate_percentage/result_for_err_pos.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.3_Calculate_percentage_err_pos.py'
```

In [67]:
import os
import pandas as pd

def read_file(file_path, delimiter='\t'):
    """Load a delimited text file (tab-separated by default) as a pandas DataFrame."""
    frame = pd.read_csv(file_path, delimiter=delimiter)
    return frame

def fetch_file_path(folder_path, file_id, extension='.txt'):
    """Return ``folder_path`` joined with the file name ``<file_id><extension>``."""
    file_name = "{}{}".format(file_id, extension)
    return os.path.join(folder_path, file_name)

def match_row(df, column, value):
    """Return the rows of ``df`` whose ``column`` equals ``value`` (possibly none)."""
    is_match = df[column] == value
    return df.loc[is_match]

def append_columns(df_source, df_target, index, columns):
    """Copy the first row of ``df_source[columns]`` into row ``index`` of ``df_target``.

    ``df_target`` is modified in place; nothing is returned.
    """
    first_row_values = df_source[columns].values.tolist()[0]
    df_target.loc[index, columns] = first_row_values

def save_file(df, file_path, columns, delimiter='\t', index=False):
    """Write the selected ``columns`` of ``df`` to ``file_path`` as delimited text."""
    selected = df[columns]
    selected.to_csv(file_path, sep=delimiter, index=index)

# Specify paths and column names
a_file_path = '/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/output_modified.txt'
b_folder_path = "TEST/"
b_column_names = ['position', 'nucleotide_martin', 'nucleotide_origin', 'label', 'label2', 'label_mar', 'label_ori']
output_file_path = 'output_data.txt'

# Load the data from the 'A' file
a_data = read_file(a_file_path)

# Remember the most recently loaded 'B' table so that consecutive rows with
# the same ID do not re-read the same file from disk.
loaded_b_path = None
b_data = None

# Iterate through each row of the 'A' data
for index, row in a_data.iterrows():
    # Fetch the path of the corresponding 'B' file
    b_file_path = fetch_file_path(b_folder_path, row['ID'])

    # Continue to the next iteration if the 'B' file doesn't exist
    if not os.path.isfile(b_file_path):
        continue

    # Load the 'B' data only when it differs from the table already in memory
    if b_file_path != loaded_b_path:
        b_data = read_file(b_file_path)
        loaded_b_path = b_file_path

    # Find the matching row in the 'B' data
    # NOTE(review): presumably 'Position' and 'position' use the same indexing
    # base — confirm against how both tables number their positions.
    matched_row = match_row(b_data, 'position', row['Position'])

    # Continue to the next iteration if no matching row was found
    if matched_row.empty:
        print(f'No matching row found in {b_file_path} for ID {row["ID"]} and position {row["Position"]}.')
        continue

    # Append the necessary columns to the 'A' data
    append_columns(matched_row, a_data, index, b_column_names)

# Save the modified 'A' data
save_file(a_data, output_file_path, ['ID', 'Position'] + b_column_names)

print(f"Data saved to {output_file_path}")

Data saved to output_data.txt


In [14]:
### calculate_percentage
# Load the saved table and drop every row that has a missing value
test = pd.read_csv(output_file_path, sep='\t').dropna()

# Number of rows equal to 1.0, per label column
label_columns = ['label', 'label2', 'label_mar', 'label_ori']
hits = {column: sum(test[column] == 1.0) for column in label_columns}

# Percentages relative to the number of remaining rows
total_records = len(test)
percentage_1 = round(hits['label'] / total_records * 100, 3)
percentage_2 = round(hits['label2'] / total_records * 100, 3)
percentage_3 = round(hits['label_mar'] / total_records * 100, 3)
percentage_4 = round(hits['label_ori'] / total_records * 100, 3)

print("Both versions have the same nucleotide type:", percentage_1, '%')
print("Both versions are marked:", percentage_2, '%')
print("Only the martin version is marked:", percentage_3, '%')
print("Only the original version is marked:", percentage_4, '%')

Both versions have the same nucleotide type: 100.0 %
Both versions are marked: 0.0 %
Only the martin version is marked: 0.0 %
Only the original version is marked: 0.0 %
