### 1. Split processing

```bash
bsub -M 20000 -e /nfs/research/goldman/zihao/compViridian_2/1_extract_sequence/pg_for_extract_2_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/compViridian_2/1_extract_sequence/pg_for_extract_2.py'
```

In [1]:
import os

# Concatenate the per-sequence .txt files found in `folder_path` into at most
# ten combined FASTA files (output_1.fasta, output_2.fasta, ...) written to
# `output_folder`.
folder_path = '/nfs/research/goldman/zihao/Datas/p2_comp_viridian/1_extract_sequence'
output_folder = '/nfs/research/goldman/zihao/Datas/p2_comp_viridian/1_extract_sequence/split_files'
# NOTE: not read by the logic below; kept so the cell's variables are unchanged.
files_per_txt = 10

# Get the list of txt files in the folder. Sorted because os.listdir() returns
# names in arbitrary order, which would make the grouping differ between runs.
txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))

# Limit the number of output files to a maximum of 10
num_output_files = min(len(txt_files), 10)

# Number of input files per output file (ceiling division). Left at 0 when no
# input was found, so an empty folder no longer raises ZeroDivisionError.
files_per_output = -(-len(txt_files) // num_output_files) if num_output_files else 0

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Split and write to the combined files
for i in range(num_output_files):
    start_index = i * files_per_output
    group = txt_files[start_index:start_index + files_per_output]

    # Ceiling division can use up all inputs before the last index is reached
    # (e.g. 11 inputs -> 6 groups of <= 2); stop rather than create empty files.
    if not group:
        break

    output_file_path = os.path.join(output_folder, f'output_{i + 1}.fasta')
    with open(output_file_path, 'w') as output_file:
        for file_name in group:
            with open(os.path.join(folder_path, file_name), 'r') as input_file:
                output_file.write(input_file.read())
            # Separator so consecutive records never share a line.
            output_file.write('\n')

print('File splitting and writing completed.')

KeyboardInterrupt: 

### 2. MAFFT alignment
```bash
sh /nfs/research/goldman/zihao/compViridian_2/2_alignment/Aligned.sh
```

### 3. Split the aligned file

```bash
sh bash_for_split_aligned.sh
```

In [None]:
import os
import argparse

# Build `data_set`: the stripped text of every line starting with ">" in the
# file below. process_file() uses it to decide which records to keep.
file_path = "/nfs/research/goldman/zihao/errorsProject_1/MAPLE/new_version_MAY/MAPLE0.3.2_rateVar_errors_realData_checkingErrors_new_all_estimatedErrors.txt"

with open(file_path, 'r') as handle:
    data_set = {row.strip() for row in handle if row.startswith(">")}

def _write_selected_record(header, body_lines, output_dir, allowed_headers):
    """Write one FASTA record to ``<output_dir>/<id>.txt`` if it is selected.

    The record is selected when the header text before its first "." is a
    member of ``allowed_headers``. ``<id>`` is that same text without the
    leading ">". Does nothing when ``header`` is None (no record read yet).
    """
    if header is None:
        return
    key = header.split(".")[0]
    if key not in allowed_headers:
        return
    output_path = os.path.join(output_dir, key[1:] + ".txt")
    with open(output_path, "w") as out_file:
        out_file.write(header + "\n" + "".join(body_lines))


def process_file(input_filename, output_dir, allowed_headers=None):
    """Split a FASTA file into one ``.txt`` file per selected record.

    Args:
        input_filename: Path of the plain-text FASTA file to read.
        output_dir: Directory receiving the per-record files; created if it
            does not exist.
        allowed_headers: Container of header prefixes (including the leading
            ">") to keep. Defaults to the module-level ``data_set``.

    Each output file holds the stripped header line followed by the record's
    sequence lines exactly as read. Lines before the first header are ignored.
    """
    if allowed_headers is None:
        allowed_headers = data_set

    # Create the target directory up front so the first write cannot fail
    # with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    with open(input_filename, 'r') as file:
        current_name = None
        current_content = []
        for line in file:
            if line.startswith(">"):
                # A new header closes the record collected so far.
                _write_selected_record(current_name, current_content, output_dir, allowed_headers)
                current_name = line.strip()
                current_content = []
            else:
                current_content.append(line)

        # The last record has no following header to trigger its write.
        _write_selected_record(current_name, current_content, output_dir, allowed_headers)


if __name__ == "__main__":
    # Command-line entry point; both options are mandatory.
    cli = argparse.ArgumentParser(description="Process some files.")
    cli.add_argument(
        "-i",
        "--input_file",
        required=True,
        help="The input FASTA file to be processed",
    )
    cli.add_argument(
        "-o",
        "--output_dir",
        required=True,
        help="The output directory to store processed files",
    )
    cli_args = cli.parse_args()

    process_file(cli_args.input_file, cli_args.output_dir)


In [None]:
import os
import argparse
import gzip
            
def _write_record(header, body_lines, output_dir):
    """Write one FASTA record to ``<output_dir>/<id>.txt``.

    ``<id>`` is the header text before its first ".", without the leading
    ">". Does nothing when ``header`` is None (no record read yet).
    """
    if header is None:
        return
    output_path = os.path.join(output_dir, header.split(".")[0][1:] + ".txt")
    with open(output_path, "w") as out_file:
        out_file.write(header + "\n" + "".join(body_lines))


def process_file(input_filename, output_dir):
    """Split a gzip-compressed FASTA file into one ``.txt`` file per record.

    Args:
        input_filename: Path of the gzip file, read in text mode.
        output_dir: Directory receiving the per-record files; created if it
            does not exist.

    Each output file holds the stripped header line followed by the record's
    sequence lines exactly as read. Lines before the first header are ignored.
    """
    # Create the target directory up front so the first write cannot fail
    # with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    with gzip.open(input_filename, 'rt') as file:
        current_name = None
        current_content = []
        for line in file:
            if line.startswith(">"):
                # A new header closes the record collected so far.
                _write_record(current_name, current_content, output_dir)
                current_name = line.strip()
                current_content = []
            else:
                current_content.append(line)

        # The last record has no following header to trigger its write.
        _write_record(current_name, current_content, output_dir)


if __name__ == "__main__":
    # Command-line entry point; declare both mandatory options from a table.
    cli = argparse.ArgumentParser(description="Process some files.")
    option_table = (
        ("-i", "--input_file", "The input file to be processed"),
        ("-o", "--output_dir", "The output directory to store processed files"),
    )
    for short_flag, long_flag, help_text in option_table:
        cli.add_argument(short_flag, long_flag, required=True, help=help_text)

    cli_args = cli.parse_args()
    process_file(cli_args.input_file, cli_args.output_dir)

### 4. Combine into one DataFrame
- **Input**:
```text
viridian version: "/nfs/research/goldman/zihao/Datas/p2_comp_viridian/2_alignment/Aligned_split/"
colman version: "/nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May/"
```
- **Output**:
```text
output path: /nfs/research/goldman/zihao/Datas/p2_comp_viridian/3_combination
```
- **Script location**:
```text
/nfs/research/goldman/zihao/compViridian_2/3_combination/pg_combination.py
```

### Code block:
###### Run the program
```bash
sh /nfs/research/goldman/zihao/compViridian_2/3_combination/bash_combination.sh
```

```python
import os
import pandas as pd
import numpy as np
import argparse

def read_sequence(file_path):
    """Read a single-record FASTA file into a per-position DataFrame.

    The first line of the file is treated as the header and skipped; all
    remaining lines are joined (newlines removed) into one sequence.

    Args:
        file_path: Path of the FASTA file.

    Returns:
        A DataFrame with one row per character of the sequence and two
        columns: ``position`` (1-based index) and ``nucleotide`` (the
        character). An empty sequence gives an empty DataFrame that still
        has both columns, so it can be renamed and merged like any other.
    """
    with open(file_path, 'r') as file:
        file.readline()  # skip the header line
        sequence = file.read().replace('\n', '')

    # Build the frame column-wise instead of creating one dict per base.
    return pd.DataFrame({
        'position': range(1, len(sequence) + 1),
        'nucleotide': list(sequence),
    })


def process_batch(file_names, folder_path_1, folder_path_2, output_folder):
    """Compare same-named sequence files from two folders, position by position.

    For every name in ``file_names`` that exists in ``folder_path_2`` and has
    no result in ``output_folder`` yet, the two sequences are joined on
    ``position`` and written as a tab-separated file of the same name.

    Output columns:
        position, nucleotide_martin (from ``folder_path_1``),
        nucleotide_origin (from ``folder_path_2``), and
        label_mar    1 if nucleotide_martin is '-', 'n' or an ambiguity code
        label_ori    1 if nucleotide_origin is '-' or 'n'
        label_masked 1 if both of the above hold
        label_same   1 if both characters are equal and not masked,
                     2 if either one is masked,
                     0 if they differ and neither is masked

    Args:
        file_names: File names to process (relative to both input folders).
        folder_path_1: Folder providing the ``nucleotide_martin`` sequences.
        folder_path_2: Folder providing the ``nucleotide_origin`` sequences.
        output_folder: Existing folder that receives the result files.
    """
    martin_masked_codes = ['-', 'n', 'm', 'r', 'w', 's', 'y', 'k', 'v', 'h', 'd', 'b']
    origin_masked_codes = ['-', 'n']

    for file_name in file_names:
        file_path_2 = os.path.join(folder_path_2, file_name)
        output_path = os.path.join(output_folder, file_name)

        # Decide whether to skip before parsing anything: a rerun then costs
        # two existence checks per finished file instead of a full
        # read/merge/label pass whose result would be thrown away.
        if not os.path.exists(file_path_2) or os.path.exists(output_path):
            continue

        df1 = read_sequence(os.path.join(folder_path_1, file_name))
        df1 = df1.rename(columns={'nucleotide': 'nucleotide_martin'})
        df2 = read_sequence(file_path_2)
        df2 = df2.rename(columns={'nucleotide': 'nucleotide_origin'})

        # Inner join: only positions present in both sequences are kept.
        merged_df = pd.merge(df1, df2, on='position')

        martin_masked = merged_df['nucleotide_martin'].isin(martin_masked_codes)
        origin_masked = merged_df['nucleotide_origin'].isin(origin_masked_codes)
        identical = merged_df['nucleotide_martin'] == merged_df['nucleotide_origin']

        merged_df['label_masked'] = np.where(martin_masked & origin_masked, 1, 0)
        merged_df['label_mar'] = np.where(martin_masked, 1, 0)
        merged_df['label_ori'] = np.where(origin_masked, 1, 0)
        merged_df['label_same'] = np.where(
            identical,
            np.where(martin_masked, 2, 1),
            np.where(martin_masked | origin_masked, 2, 0),
        )

        merged_df.to_csv(output_path, sep='\t', index=False)


def main(output_folder, batch_num,
         folder_path_1="/nfs/research/goldman/zihao/Datas/p2_comp_viridian/2_alignment/Aligned_split/",
         folder_path_2="/nfs/research/goldman/zihao/Datas/p1/File_5_consensus/Decompress/Aligned_split_May/",
         batch_size=5000):
    """Process one fixed-size batch of the files listed in ``folder_path_1``.

    Args:
        output_folder: Folder for the result files; created if missing.
        batch_num: 1-based index of the batch to process.
        folder_path_1: Folder whose file names define the work list.
        folder_path_2: Folder holding the same-named files to compare against.
        batch_size: Number of files per batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``batch_num`` is
            not between 1 and the number of batches.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns names in arbitrary order. Sorting gives every
    # separately started job the same partition, so batch k always means the
    # same files and no file is covered twice or missed.
    file_names = sorted(os.listdir(folder_path_1))
    batches = [file_names[i:i + batch_size] for i in range(0, len(file_names), batch_size)]

    if batch_num < 1 or batch_num > len(batches):
        raise ValueError('batch_num is out of range. It should be between 1 and {}'.format(len(batches)))

    process_batch(batches[batch_num - 1], folder_path_1, folder_path_2, output_folder)


if __name__ == "__main__":
    # Command-line entry point: two positional arguments, declared from a table.
    cli = argparse.ArgumentParser(description='Process a batch of FASTA files.')
    positional_specs = (
        ('output_folder', str, 'The output folder.'),
        ('batch_num', int, 'The batch number to process.'),
    )
    for arg_name, arg_type, arg_help in positional_specs:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)

    cli_args = cli.parse_args()
    main(cli_args.output_folder, cli_args.batch_num)
```