## 1_Extract_assembled_sequences

In [None]:
import os
import gzip
import argparse

base_path = "/nfs/research/zi/mhunt/Viridian_wf_paper/Vdn_all_ena/Reads/"

output_files = {
    'D': "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_D_new.txt",
    'S': "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S_new.txt",
    'E': "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_E_new.txt"
}

def process_folder(folder_path, output_file):
    try:
        subfolders = [subfolder for subfolder in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, subfolder))]
    except PermissionError:
        print(f"Permission denied when trying to access the directory: {folder_path}")
        return 

    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)
        if subfolder == "vdn.v1.0.0":
            file_path = os.path.join(subfolder_path, "consensus.fa.gz")
            try:
                if os.path.exists(file_path):
                    with gzip.open(file_path, 'rt') as f_in:
                        content = f_in.read()
                        with open(output_file, "a") as f_out:
                            f_out.write(content)
            except Exception as e:
                print(f"Error with file {file_path}: {e}")
        else:
            process_folder(subfolder_path, output_file)

def main(folder):
    output_file = output_files[folder]
    with open(output_file, "w") as f_out:
        folder_path = os.path.join(base_path, folder)
        process_folder(folder_path, output_file)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process some files.")
    parser.add_argument('folder', choices=['D', 'S', 'E'], help="The folder to process")
    args = parser.parse_args()

    main(args.folder)


## 1.2_Filter (Screen_of_MAPLE-treated_samples)

In [None]:
import os

file_path = "/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/MAPLE0.3.2_rateVar_errors_realData_checkingErrors_new_all_estimatedErrors.txt"
data_set = set()

with open(file_path, 'r') as file:
    for line in file:
        if line.startswith(">"):
            data_set.add(line.strip())
            
def process_file(input_filename, output_dir):
    with open(input_filename, "r") as file:
        current_name = None
        current_content = []
        for line in file:
            # This line is a sequence name
            if line.startswith(">"):
                 # If there was a sequence in data_set before, then save it to a file now
                if current_name is not None and current_name.split(".")[0] in data_set:
                    output_path = os.path.join(output_dir, current_name.split(".")[0][1:] + ".txt")
                    with open(output_path, "w") as out_file:
                        out_file.write(current_name + "\n" + "".join(current_content))
                
                # Update the name of the current sequence and clear the content list for next time
                current_name = line.strip()
                current_content = []
            else:
                # This line is part of the sequence and is added to the current content
                current_content.append(line)

        # Process end-of-file sequences
        if current_name is not None and current_name.split(".")[0] in data_set:
            output_path = os.path.join(output_dir, current_name.split(".")[0][1:] + ".txt")
            with open(output_path, "w") as out_file:
                out_file.write(current_name + "\n" + "".join(current_content))

input_file = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_D.txt"
output_folder = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_D"
process_file(input_file, output_folder)

## combine all files
output_file = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_D/all_files_combined.fasta'

txt_files = [f for f in os.listdir(output_folder) if f.endswith('.txt')]

with open(output_file, 'w') as output:
    for txt_file in txt_files:
        file_path = os.path.join(output_folder, txt_file)
        with open(file_path, 'r') as f:
            content = f.read()
            output.write(content)
            output.write('\n')  # 在每个文件的内容后面添加换行符，以便区分

## 1.2_Sequence_alignment

## 1. split

#### 1. Origin
```bash
sh bash_MAPLE_part.sh
```
```python
save path: /nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May
```

In [None]:
import os
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Process some files.')
    parser.add_argument('--input_file', '-i', required=True, help='Path to input file')
    parser.add_argument('--output_folder', '-o', required=True, help='Path to output folder')

    return parser.parse_args()

def main():
    args = parse_args()

    file_path = "/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/MAPLE0.3.2_rateVar_errors_realData_checkingErrors_new_all_estimatedErrors.txt"
    input_file = args.input_file
    output_folder = args.output_folder

    data_set = set()

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith(">"):
                data_set.add(line.strip())

    current_sequence = ''
    current_name = None

    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                if current_sequence and current_name in data_set:
                    output_file_path = os.path.join(output_folder, current_name[1:] + '.txt')
                    with open(output_file_path, 'w') as output_file:
                        output_file.write(current_name + '\n')
                        output_file.write(current_sequence)
                current_name = line
                current_sequence = ''
            else:
                current_sequence += line

    # 处理最后一个序列
    if current_sequence and current_name in data_set:
        output_file_path = os.path.join(output_folder, current_name[1:] + '.txt')
        with open(output_file_path, 'w') as output_file:
            output_file.write(current_name + '\n')
            output_file.write(current_sequence)

if __name__ == "__main__":
    main()


#### 2. Martin

```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2_Martin_part_E_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.Martin_for_E.py'
```

```bash
bsub -M 2000 -e /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2_Martin_part_S_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Part2_Compare/2.Martin_for_S.py'
```

```python
save path: /nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May
```

In [None]:
from Bio import SeqIO
import os

def process_sequences(input_file, output_folder):
    for record in SeqIO.parse(input_file, "fasta"):
        sequence_name = record.id
        sequence_data = str(record.seq)
        sequence_name = sequence_name.replace(".masked", "")  # 移除".masked"后缀
        save_sequence(sequence_name, sequence_data, output_folder)


def save_sequence(sequence_name, sequence_data, output_folder):
    output_file = os.path.join(output_folder, f"{sequence_name}.txt")

    with open(output_file, 'w') as f:
        f.write(f">{sequence_name}\n")
        f.write(sequence_data)


input_file = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/output_S_aligned.fasta'
output_folder = '/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May'
process_sequences(input_file, output_folder)

## 2.MAFFT alignment

```bash
bsub sh /nfs/research/goldman/zihao/errorsProject_1/Part1_2_for_assemble/Aligned.sh
```

## 2.2_Combine into one DF

In [None]:
import os
import pandas as pd
import numpy as np

def read_sequence(file_path):
    '''This function reads a FASTA file and returns a pandas DataFrame where
    each row corresponds to the base and its position in the sequence'''
    
    with open(file_path, 'r') as file:
        lines = file.readlines()

    sequence = ''.join(lines[1:]).replace('\n', '')
    sequence_list = [{'position': i, 'nucleotide': base} for i, base in enumerate(sequence)]

    return pd.DataFrame(sequence_list)


def main(output_folder):
    folder_path_1 = "/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/Aligned_split_May/"
    folder_path_2 = "/nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May/"

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    file_names = os.listdir(folder_path_1)

    for file_name in file_names:
        file_path_1 = os.path.join(folder_path_1, file_name)
        df1 = read_sequence(file_path_1)
        df1.rename(columns={'nucleotide': 'nucleotide_martin'}, inplace=True)

        file_path_2 = os.path.join(folder_path_2, file_name)

        # Check if the file exists in folder_path_2, if not, skip to the next file
        if not os.path.exists(file_path_2):
            del df1
            continue

        df2 = read_sequence(file_path_2)
        df2.rename(columns={'nucleotide': 'nucleotide_origin'}, inplace=True)

        merged_df = pd.merge(df1, df2, on='position')

        # Adding decision columns
        merged_df['label_same'] = np.where(merged_df['nucleotide_martin'] == merged_df['nucleotide_origin'], 1, 0)
        merged_df['label_marked'] = np.where(
            (merged_df['nucleotide_martin'].isin(['-', 'n'])) & 
            (merged_df['nucleotide_origin'].isin(['-', 'n'])), 1, 0)
        merged_df['label_mar'] = np.where(merged_df['nucleotide_martin'].isin(['-', 'n']), 1, 0)
        merged_df['label_ori'] = np.where(merged_df['nucleotide_origin'].isin(['-', 'n']), 1, 0)
        
        # Check if the file already exists in the output folder, if yes, skip to the next file
        if os.path.exists(os.path.join(output_folder, file_name)):
            del df1
            del df2
            del merged_df
            continue

        # Save the file to the specified output folder
        merged_df.to_csv(os.path.join(output_folder, file_name), sep='\t', index=False)

        # To manage memory, clear variables that are no longer needed
        del df1
        del df2
        del merged_df

if __name__ == "__main__":
    main("/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/2_combination")

## 2.3_calculate percentage

### Part1_for_all_pos

In [54]:
import os
import pandas as pd

folder_path = "TEST/"
batch_size = 1000  # 每批次处理的文件数

# 获取文件夹中的所有文件名
file_names = os.listdir(folder_path)

# 初始化计数变量
count_label_1_total = 0
count_label_2_total = 0
count_label_3_total = 0
count_label_4_total = 0

# 按批次处理文件
num_batches = (len(file_names) + batch_size - 1) // batch_size  # 计算批次数量
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = (batch_idx + 1) * batch_size
    batch_files = file_names[start_idx:end_idx]

    # 遍历批次中的每个文件
    for file_name in batch_files:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, sep='\t')
        
        # 计算每个文件中的label计数
        count_label_1 = df['label'].value_counts().get(1, 0)
        count_label_2 = df['label2'].value_counts().get(1, 0)
        count_label_3 = df['label_mar'].value_counts().get(1, 0)
        count_label_4 = df['label_ori'].value_counts().get(1, 0)
        
        # 累加计数到总计数变量
        count_label_1_total += count_label_1
        count_label_2_total += count_label_2
        count_label_3_total += count_label_3
        count_label_4_total += count_label_4

# 打印总计数
total_files = len(file_names)
total_records = 29903 * total_files
percentage_1 = round(count_label_1_total / total_records * 100, 3)
percentage_2 = round(count_label_2_total / total_records * 100, 3)
percentage_3 = round(count_label_3_total / total_records * 100, 3)
percentage_4 = round(count_label_4_total / total_records * 100, 3)

print("Both versions have the same nucleotide type:", percentage_1, '%')
print("Both versions are marked:", percentage_2, '%')
print("Only the martin version is marked:", percentage_3, '%')
print("Only the original version is marked:", percentage_4, '%')

Both versions have the same nucleotide type: 95.106 %
Both versions are marked: 0.279 %
Only the martin version is marked: 0.903 %
Only the original version is marked: 4.824 %


### Part2_for_err_pos

In [67]:
import os
import pandas as pd

def read_file(file_path, delimiter='\t'):
    """Reads a file into a pandas DataFrame."""
    return pd.read_csv(file_path, delimiter=delimiter)

def fetch_file_path(folder_path, file_id, extension='.txt'):
    """Builds the file path from the folder path and file id."""
    return os.path.join(folder_path, f'{file_id}{extension}')

def match_row(df, column, value):
    """Finds rows in a DataFrame that match a specific value in a specific column."""
    return df[df[column] == value]

def append_columns(df_source, df_target, index, columns):
    """Appends columns from a source DataFrame to a target DataFrame."""
    df_target.loc[index, columns] = df_source[columns].values.tolist()[0]

def save_file(df, file_path, columns, delimiter='\t', index=False):
    """Saves a DataFrame to a file."""
    df[columns].to_csv(file_path, sep=delimiter, index=index)

# Specify paths and column names 
a_file_path = '/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/output_modified.txt'
b_folder_path = "TEST/"
b_column_names = ['position', 'nucleotide_martin', 'nucleotide_origin', 'label', 'label2', 'label_mar', 'label_ori']
output_file_path = 'output_data.txt'

# Load the data from the 'A' file
a_data = read_file(a_file_path)

# Iterate through each row of the 'A' data
for index, row in a_data.iterrows():
    # Fetch the path of the corresponding 'B' file
    b_file_path = fetch_file_path(b_folder_path, row['ID'])
    
    # Continue to the next iteration if the 'B' file doesn't exist
    if not os.path.isfile(b_file_path):
        continue

    # Load the data from the 'B' file
    b_data = read_file(b_file_path)

    # Find the matching row in the 'B' data
    matched_row = match_row(b_data, 'position', row['Position'])

    # Continue to the next iteration if no matching row was found
    if matched_row.empty:
        print(f'No matching row found in {b_file_path} for ID {row["ID"]} and position {row["Position"]}.')
        continue

    # Append the necessary columns to the 'A' data
    append_columns(matched_row, a_data, index, b_column_names)

# Save the modified 'A' data
save_file(a_data, output_file_path, ['ID', 'Position'] + b_column_names)

print(f"Data saved to {output_file_path}")

Data saved to output_data.txt


In [14]:
### calculate_percentage
test = pd.read_csv(output_file_path, sep='\t')
test = test.dropna()

# 初始化计数变量
count_label_1_total = sum(test['label'] == 1.0)
count_label_2_total = sum(test['label2'] == 1.0)
count_label_3_total = sum(test['label_mar'] == 1.0)
count_label_4_total = sum(test['label_ori'] == 1.0)

# 打印总计数
total_records = len(test)
percentage_1 = round(count_label_1_total / total_records * 100, 3)
percentage_2 = round(count_label_2_total / total_records * 100, 3)
percentage_3 = round(count_label_3_total / total_records * 100, 3)
percentage_4 = round(count_label_4_total / total_records * 100, 3)

print("Both versions have the same nucleotide type:", percentage_1, '%')
print("Both versions are marked:", percentage_2, '%')
print("Only the martin version is marked:", percentage_3, '%')
print("Only the original version is marked:", percentage_4, '%')

Both versions have the same nucleotide type: 100.0 %
Both versions are marked: 0.0 %
Only the martin version is marked: 0.0 %
Only the original version is marked: 0.0 %
