# Treat_error_positions

***
### 1. Decompress and save coverage [Done]

- **Idea**: Decompress the coverage file, compute for each position the **RATIO** (NB_COVERAGE / MEAN_nb_coverage), and save the processed result
```python
    # Excerpt: `f` is a file-like object defined outside this snippet
    # (presumably the decompressed coverage file -- TODO confirm).
    # Lines starting with '##' are skipped; every other line is kept as data.
    lines = [line for line in f if not line.startswith('##')]
    # Split each remaining line into its tab-separated fields.
    data = [line.strip().split('\t') for line in lines[0:]]
    df = pd.DataFrame(data)
    # The first field is split on commas into three values: Position, N, Coverage.
    df[['Position', 'N', 'Coverage']] = df.iloc[:, 0].str.split(',', expand=True)
    # Drop the 1st and 3rd columns by position (the raw combined field and,
    # when each line holds a single field, the 'N' column).
    df = df.drop(df.columns[[0, 2]], axis=1)
    # Total and mean coverage, stored on every row as temporary columns.
    df['SUM'] = df['Coverage'].astype(int).sum()
    df['MEAN'] = df['SUM']/len(df)
    # RATIO = coverage at the position / mean coverage.
    # NOTE(review): the mean is truncated to an integer before dividing -- confirm this is intended.
    df['RATIO'] = df['Coverage'].astype(int)/df['MEAN'].astype(int)
    # The temporary helper columns are removed again.
    df = df.drop(['SUM','MEAN'], axis=1)
```
- **Input**: Downloaded files
- **Output**:
```python
output_directories = ['/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/Decompress/*_coverage.txt']
```

##### Code block:
```bash
# Submit the decompression script through bsub; -M sets the job memory limit
# (units depend on the cluster configuration) and -e the file receiving stderr.
# The trailing backslash keeps the options and the job command in ONE bsub call.
# NOTE(review): the script path contains an embedded "python3 " segment -- confirm the real file name.
bsub -M 2000 \
  -e /nfs/research/goldman/zihao/errorsProject_1/Coverage/Decompress_errorChecking_error.txt 'python3 /nfs/research/goldman/zihao/errorsProject_1/Coverage/python3 0_Apr.21_Decompress_and_save.py'
```

---
### 2. Handle with output.ipynb

```python
import os
import pandas as pd

file_path = "/nfs/research/goldman/zihao/errorsProject_1/MAPLE/MapleRealErrorsVariation_errorEstimation_estimatedErrors.txt" ### !!!Needs to be modified!!!
output_folder = "/homes/zihao/P1/" ### !!!Needs to be modified!!!

# Entries whose third field (the error rate) is below this value are dropped
min_error_rate = 0.5

# Stream the input once and write one "ID<TAB>Position" row per retained entry.
# Lines starting with ">" name the ID that the following data lines belong to.
with open(file_path, "r") as input_file, \
        open(os.path.join(output_folder, "output_modified.txt"), "w") as output_file:
    # Write the column headers to the output file
    output_file.write("ID\tPosition\n")

    # ID of the most recent ">" header line (None until the first one is seen)
    current_id = None

    for line in input_file:
        if line.startswith(">"):
            # Update current_id if a new ID is encountered
            current_id = line[1:].strip()
            continue

        tokens = line.split()
        # Blank lines (e.g. a trailing newline) carry no data; without this
        # guard tokens[0] would raise IndexError and abort the whole run.
        if not tokens:
            continue

        position = int(tokens[0])
        percentage = float(tokens[2])

        # Skip entries below the threshold
        if percentage < min_error_rate:
            continue

        # Write the processed data to the output file
        output_file.write(f"{current_id}\t{position}\n")

print(f"Processing complete and file written to {output_folder}.")
```

- **Idea**: Reformat the MAPLE estimated-errors output into an `ID`/`Position` table, dropping entries whose error rate is less than 0.5
- **Input**: MapleRealErrorsVariation_errorEstimation_estimatedErrors.txt
- **Output**: output_modified.txt

***
### 3. Data Processing.ipynb

```python
import pandas as pd
import os

def search_position_value_and_get_column_4(file_path, id_position):
    """Return the third field of the first row whose ``Position`` equals *id_position*.

    The file is tab-separated and its first line is a header that must contain
    a ``Position`` column. Blank lines are ignored. Rows too short to hold the
    ``Position`` field, or that match but have no third field, are skipped and
    the search continues.

    Args:
        file_path: Path of the tab-separated file to scan.
        id_position: Integer position to look for.

    Returns:
        The third tab-separated field (as a string) of the first matching row,
        or ``None`` when no row matches.

    Raises:
        ValueError: If the header has no ``Position`` column or a row's
            position field is not an integer.
    """
    with open(file_path, "r") as f:
        file_header = f.readline().strip().split("\t")
        position_column_index = file_header.index("Position")
        # Despite the function name, the returned value is the third field (index 2).
        column_4_index = 2

        for line in f:
            stripped = line.strip()
            # A blank line would otherwise reach int("") and raise ValueError.
            if not stripped:
                continue

            line_values = stripped.split("\t")
            if position_column_index >= len(line_values):
                continue
            if int(line_values[position_column_index]) != id_position:
                continue
            if column_4_index < len(line_values):
                return line_values[column_4_index]
    return None


def process_data(data_file_path, directory_to_search, output_file_path):
    """Append a ``MEAN_err`` column to a tab-separated ID/position table.

    For every data row (first field: ID, second field: position) the ``.txt``
    files in *directory_to_search* whose name contains the ID are scanned with
    ``search_position_value_and_get_column_4``; the first non-``None`` result
    becomes the row's ``MEAN_err`` value. Rows without a match keep ``None``,
    which is written out as the literal text ``None``.

    Args:
        data_file_path: Tab-separated input file with a header line.
        directory_to_search: Directory holding the per-ID ``.txt`` files.
        output_file_path: Where the extended table is written.
    """
    with open(data_file_path, "r") as f:
        header = f.readline().strip().split("\t")
        header.append("MEAN_err")
        # The trailing None is the placeholder for the looked-up value.
        data = [line.strip().split("\t") + [None] for line in f]

    # Rows with fewer than two fields expose the None placeholder at index 1
    # and are never looked up. List the directory once -- and only if some row
    # needs it -- instead of once per row.
    if any(row[1] is not None for row in data):
        candidate_files = [
            name for name in os.listdir(directory_to_search) if name.endswith(".txt")
        ]
    else:
        candidate_files = []

    for row in data:
        id_now = str(row[0])
        id_position = row[1]
        if id_position is None:
            continue

        for filename in candidate_files:
            if id_now not in filename:
                continue
            candidate_path = os.path.join(directory_to_search, filename)
            result = search_position_value_and_get_column_4(candidate_path, int(id_position))
            if result is not None:
                row[-1] = result
                break

    with open(output_file_path, "w") as f:
        f.write("\t".join(header) + "\n")
        for row in data:
            f.write("\t".join(str(x) for x in row) + "\n")
```

```python
# np.nan is used below; this cell previously relied on numpy without importing it
import numpy as np

directory_to_search = "/homes/zihao/P1/TEST_1/" ### !!!Needs to be modified!!!
data_file_path = "output_modified.txt" ### !!!Needs to be modified!!!
output_file_path = "processed_data.txt" ### !!!Needs to be modified!!!

# Add the MEAN_err column to every (ID, Position) row
process_data(data_file_path, directory_to_search, output_file_path)

# Handling files without corresponding IDs
df = pd.read_csv(output_file_path, sep='\t')
## Delete rows where MEAN_err is missing (process_data writes it as the text "None")
df = df.replace("None", np.nan)
df = df.dropna(subset=["MEAN_err"])

## Calculate the mean from the position
df["MEAN_err"] = df["MEAN_err"].astype(float)
# NOTE(review): mean(numeric_only=True) drops non-numeric columns, so a textual ID
# column will not be present in the file written below -- confirm that is intended.
df_err = df.groupby(by=["Position"], as_index=False).mean(numeric_only=True)
# The result overwrites processed_data.txt (the row index is written as the first column)
df_err.to_csv(output_file_path, sep='\t')
```

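- **Idea**: For each (ID, Position) row, look up the coverage ratio at that position in the coverage file whose name contains the ID, append it as a `MEAN_err` column, then average `MEAN_err` per position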
- **Input**: output_modified.txt
- **Output**: processed_data.txt

***
***
# Treat_all_positions

#### Eliminate IDs that do not correspond to coverage data
```python
import os
import shutil
import pandas as pd

# Load the IDs to keep. They are matched against file names, so compare them
# as strings and ignore missing values (a non-string ID makes `in` raise TypeError).
df_err = pd.read_csv("processed_data.txt", sep='\t') ### !!!Needs to be modified!!!
id_set = set(df_err['ID'].dropna().astype(str))

# Set the folder path
folder_path = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/Decompress/'

# Create the directory for the copied files (no error if it already exists)
target_dir = os.path.join(folder_path, '../AAA')
os.makedirs(target_dir, exist_ok=True)

# Copy files that match the IDs to the new directory
for filename in os.listdir(folder_path):
    source_path = os.path.join(folder_path, filename)
    # shutil.copy only handles regular files, so skip sub-directories
    if not os.path.isfile(source_path):
        continue
    # Check if the filename contains any ID in the set
    if any(id_str in filename for id_str in id_set):
        shutil.copy(source_path, os.path.join(target_dir, filename))


```

```python
import os
import glob
import pandas as pd
import numpy as np

# Set folder path
folder_path = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/AAA/'

# Get all files in the folder (sorted so that repeated runs process them in the same order)
files = sorted(glob.glob(os.path.join(folder_path, '*')))

# Set the output file name
output_file = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/processed_data_all.txt'  ### !!!Needs to be modified!!!

# Running position-wise sum of RATIO and the number of files that contributed.
# Keeping only a sum and a count gives the exact mean over all files without
# holding every file in memory.
ratio_sum = None
n_files = 0

for file in files:
    try:
        ratios = pd.read_csv(file, sep='\t')['RATIO'].to_numpy(dtype=float)
    except (OSError, KeyError, ValueError) as err:
        # Unreadable file, missing RATIO column or non-numeric values:
        # report it and carry on instead of silently swallowing the error.
        print(f"Skipping {file}: {err!r}")
        continue

    if ratio_sum is None:
        # The first usable file fixes the expected number of positions
        ratio_sum = ratios.copy()
    elif ratios.shape != ratio_sum.shape:
        print(f"Skipping {file}: {ratios.shape[0]} positions, expected {ratio_sum.shape[0]}")
        continue
    else:
        ratio_sum += ratios
    n_files += 1

if ratio_sum is None:
    raise RuntimeError(f"No usable coverage file with a RATIO column found in {folder_path}")

mean_ratios = ratio_sum / n_files

# Write the position-wise averages once, after all files have been processed
with open(output_file, 'w') as f:
    # Write the header
    f.write('MEAN_pos\n')
    f.write('\n'.join(map(str, mean_ratios.tolist())))
    f.write('\n')
```      

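- **Idea**: Average the `RATIO` column position by position across all coverage files in `AAA/`
- **Input**: Coverage files copied to `/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/AAA/`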
- **Output**: processed_data_all.txt

# Visualization

```python

import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import pandas as pd
from plotly.offline import iplot, plot

# Input tables: ratios at error positions and ratios at all positions
df_err = pd.read_csv("processed_data.txt", sep='\t')  ### !!!Needs to be modified!!!
df_all = pd.read_csv('/homes/zihao/P1/processed_data_all.txt', sep='\t')  ### !!!Needs to be modified!!!


def _axis_style(axis_title):
    """Return the axis settings shared by the x and y axes, with the given title."""
    return dict(
        title=axis_title,
        showline=True, showgrid=False,
        linewidth=2, linecolor='gray', ticks='outside',
        tickfont=dict(family='Arial', size=12, color='black'),
    )


# Visualization: one distribution per dataset, drawn on the same figure
hist_data = [df_err['MEAN_err'].tolist(), df_all['MEAN_pos'].tolist()]
group_labels = ['error position', 'all position'] # name of the dataset
colors = ['#7FA6EE', '#B8F7D4']

fig = ff.create_distplot(hist_data, group_labels, curve_type='normal', colors=colors, bin_size=.5)

# Layout options, listed in the order they are handed to update_layout
layout_options = {
    'title': "Blank for now",
    'yaxis': _axis_style('Frequency'),
    'xaxis': _axis_style('The ratio of coverage vs mean coverage'),
    'legend': dict(traceorder="normal", font=dict(size=12)),
    'plot_bgcolor': 'white',
    'yaxis_gridcolor': 'lightgray',
    'yaxis_gridwidth': 0.5,
    'xaxis_gridcolor': 'lightgray',
    'xaxis_gridwidth': 0.5,
}
fig.update_layout(**layout_options)
# fig.show()
plot(fig, filename='my_plot.html') ### !!!Needs to be modified!!!
```