- Save to {processed_data_cov.txt}

## TODO
- add comment

In [None]:
import pandas as pd

# Data Processing

In [54]:
# Load one tab-separated coverage table and preview its first rows.
df = pd.read_csv(
    "TEST_1/EPI_ISL_1080221_coverage.txt",
    delimiter="\t",
)

df.head()

Unnamed: 0,Position,Coverage,RATIO
0,1,0,0.0
1,2,0,0.0
2,3,0,0.0
3,4,0,0.0
4,5,0,0.0


In [25]:
import os

class DataProcessor:
    """Look up a per-file value for each input row and average it by position.

    The input table (``data_file_path``) is tab-separated with one header
    line; field 0 of each row is an identifier and field 1 a position. For
    every row, the ``.txt`` files in ``directory_to_search`` whose names
    contain the identifier are searched for that position, and the values
    found are averaged per position into ``output_file_path``.
    """

    def __init__(self, data_file_path, directory_to_search, output_file_path):
        self.data_file_path = data_file_path
        self.directory_to_search = directory_to_search
        self.output_file_path = output_file_path

    @staticmethod
    def search_position_value_and_get_column_4(file_path, id_position, value_column_index=2):
        """Return the value stored on the row whose ``Position`` equals ``id_position``.

        Args:
            file_path: Tab-separated file whose header contains ``Position``.
            id_position: Integer position to look for.
            value_column_index: Zero-based index of the field to return.
                Despite the method name, the default is index 2.

        Returns:
            The matching field as a string, or ``None`` when no row matches
            or the matching row has no non-empty value in that field.

        Raises:
            ValueError: If the header has no ``Position`` column.
        """
        with open(file_path, "r") as f:
            # Only the line terminator is removed: a bare strip() would also
            # eat a leading tab (an unnamed first column) and shift every
            # header name one column to the left of the data.
            header_names = [name.strip() for name in f.readline().rstrip("\r\n").split("\t")]
            position_column_index = header_names.index("Position")

            for line in f:
                fields = line.rstrip("\r\n").split("\t")
                if position_column_index >= len(fields):
                    continue
                try:
                    position_value = int(fields[position_column_index])
                except ValueError:
                    # Blank or malformed row: nothing to compare against.
                    continue
                if position_value != id_position or value_column_index >= len(fields):
                    continue
                value = fields[value_column_index].strip()
                if value:
                    return value
        return None

    @staticmethod
    def calculate_mean_error(data):
        """Average the last field of each row, grouped by the row's field 1.

        Rows whose last field is ``None`` or the string ``"None"`` are
        ignored.

        Args:
            data: Iterable of row sequences; ``row[1]`` is the grouping key
                and ``row[-1]`` a value convertible with ``float``.

        Returns:
            Dict mapping each key to the mean of its values, in first-seen
            order.
        """
        totals = {}
        counts = {}

        for row in data:
            raw_value = row[-1]
            if raw_value is None or raw_value == "None":
                continue

            position = row[1]
            error = float(raw_value)
            if position in totals:
                totals[position] += error
                counts[position] += 1
            else:
                totals[position] = error
                counts[position] = 1

        return {position: totals[position] / counts[position] for position in totals}

    def process_data(self):
        """Run the lookup for every input row and write per-position means.

        The output file gets a ``Position\\tMEAN_err`` header followed by one
        line per position that had at least one value.
        """
        with open(self.data_file_path, "r") as f:
            f.readline()  # the header line carries no data
            # The trailing None is the slot for the looked-up value.
            rows = [line.strip().split("\t") + [None] for line in f]

        # List the directory once instead of once per row; sorting makes the
        # choice deterministic when several file names contain the same id.
        if rows:
            txt_files = sorted(
                name
                for name in os.listdir(self.directory_to_search)
                if name.endswith(".txt")
            )
        else:
            txt_files = []

        for row in rows:
            sample_id = str(row[0])
            position = row[1]
            # A row with a single field has the None placeholder at index 1.
            if position is None:
                continue

            for filename in txt_files:
                if sample_id not in filename:
                    continue
                file_path = os.path.join(self.directory_to_search, filename)
                result = self.search_position_value_and_get_column_4(file_path, int(position))
                if result is not None:
                    row[-1] = result
                    break

        # Remove rows with "None" or None in the MEAN_err column
        rows = [row for row in rows if row[-1] is not None and row[-1] != "None"]

        mean_error_by_position = self.calculate_mean_error(rows)

        with open(self.output_file_path, "w") as f:
            f.write("Position\tMEAN_err\n")
            for position, mean_error in mean_error_by_position.items():
                f.write(f"{position}\t{mean_error}\n")

In [27]:
if __name__ == '__main__':
    # Input table, directory holding the per-sample .txt files, and result path.
    data_file_path = "/homes/zihao/EBI_INTER/Project_1/output_modified.txt"
    directory_to_search = "/homes/zihao/DATAS/TEST/"
    output_file_path = "processed_data_cov.txt"

    processor = DataProcessor(
        data_file_path=data_file_path,
        directory_to_search=directory_to_search,
        output_file_path=output_file_path,
    )
    processor.process_data()

## Old version_MAY:
```python
import os
import numpy as np

def search_position_value_and_get_column_4(file_path, id_position, value_column_index=2):
    """Return the value stored on the row whose ``Position`` equals ``id_position``.

    Args:
        file_path: Tab-separated file whose header contains ``Position``.
        id_position: Integer position to look for.
        value_column_index: Zero-based index of the field to return. Despite
            the function name, the default is index 2.

    Returns:
        The matching field as a string, or ``None`` when no row matches or
        the matching row has no non-empty value in that field.

    Raises:
        ValueError: If the header has no ``Position`` column.
    """
    with open(file_path, "r") as f:
        # Only the line terminator is removed: a bare strip() would also eat
        # a leading tab (an unnamed first column) and shift every header name
        # one column to the left of the data.
        header_names = [name.strip() for name in f.readline().rstrip("\r\n").split("\t")]
        position_column_index = header_names.index("Position")

        for line in f:
            fields = line.rstrip("\r\n").split("\t")
            if position_column_index >= len(fields):
                continue
            try:
                position_value = int(fields[position_column_index])
            except ValueError:
                # Blank or malformed row: nothing to compare against.
                continue
            if position_value != id_position or value_column_index >= len(fields):
                continue
            value = fields[value_column_index].strip()
            if value:
                return value
    return None

def calculate_mean_error(data):
    """Average the last field of each row, grouped by the row's field 1.

    Rows whose last field is ``None`` or the string ``"None"`` are ignored;
    previously a real ``None`` reached ``float()`` and raised ``TypeError``.

    Args:
        data: Iterable of row sequences; ``row[1]`` is the grouping key and
            ``row[-1]`` a value convertible with ``float``.

    Returns:
        Dict mapping each key to the mean of its values, in first-seen order.
    """
    totals = {}
    counts = {}

    for row in data:
        raw_value = row[-1]
        if raw_value is None or raw_value == "None":
            continue

        position = row[1]
        error = float(raw_value)
        if position in totals:
            totals[position] += error
            counts[position] += 1
        else:
            totals[position] = error
            counts[position] = 1

    return {position: totals[position] / counts[position] for position in totals}

def process_data(data_file_path, directory_to_search, output_file_path):
    """Look up a value for every input row and write per-position means.

    Args:
        data_file_path: Tab-separated table with one header line; field 0 of
            each row is an identifier and field 1 a position.
        directory_to_search: Directory whose ``.txt`` files are searched; a
            file is a candidate when its name contains the row's identifier.
        output_file_path: Destination file; receives a ``Position\\tMEAN_err``
            header and one line per position that had at least one value.
    """
    with open(data_file_path, "r") as f:
        f.readline()  # the header line carries no data
        # The trailing None is the slot for the looked-up value.
        rows = [line.strip().split("\t") + [None] for line in f]

    # List the directory once instead of once per row; sorting makes the
    # choice deterministic when several file names contain the same id.
    if rows:
        txt_files = sorted(
            name for name in os.listdir(directory_to_search) if name.endswith(".txt")
        )
    else:
        txt_files = []

    for row in rows:
        sample_id = str(row[0])
        position = row[1]
        # A row with a single field has the None placeholder at index 1.
        if position is None:
            continue

        for filename in txt_files:
            if sample_id not in filename:
                continue
            file_path = os.path.join(directory_to_search, filename)
            result = search_position_value_and_get_column_4(file_path, int(position))
            if result is not None:
                row[-1] = result
                break

    # Remove rows with "None" or None in the MEAN_err column
    rows = [row for row in rows if row[-1] is not None and row[-1] != "None"]

    # Calculate the mean error for each position
    mean_error_by_position = calculate_mean_error(rows)

    # Write the result to the output file
    with open(output_file_path, "w") as f:
        f.write("Position\tMEAN_err\n")
        for position, mean_error in mean_error_by_position.items():
            f.write(f"{position}\t{mean_error}\n")
            
# Input table, directory holding the per-sample .txt files, and result path.
data_file_path = "output_modified.txt"
directory_to_search = "/homes/zihao/DATAS/TEST_1/"
output_file_path = "processed_data_new.txt"

process_data(
    data_file_path=data_file_path,
    directory_to_search=directory_to_search,
    output_file_path=output_file_path,
)
```

## Old version_APR:
```python
import os
import pandas as pd
import numpy as np

def search_position_value_and_get_column_4(file_path, id_position, value_column_index=2):
    """Return the value stored on the row whose ``Position`` equals ``id_position``.

    Args:
        file_path: Tab-separated file whose header contains ``Position``.
        id_position: Integer position to look for.
        value_column_index: Zero-based index of the field to return. Despite
            the function name, the default is index 2.

    Returns:
        The matching field as a string, or ``None`` when no row matches or
        the matching row has no non-empty value in that field.

    Raises:
        ValueError: If the header has no ``Position`` column.
    """
    with open(file_path, "r") as f:
        # Only the line terminator is removed: a bare strip() would also eat
        # a leading tab (an unnamed first column) and shift every header name
        # one column to the left of the data.
        header_names = [name.strip() for name in f.readline().rstrip("\r\n").split("\t")]
        position_column_index = header_names.index("Position")

        for line in f:
            fields = line.rstrip("\r\n").split("\t")
            if position_column_index >= len(fields):
                continue
            try:
                position_value = int(fields[position_column_index])
            except ValueError:
                # Blank or malformed row: nothing to compare against.
                continue
            if position_value != id_position or value_column_index >= len(fields):
                continue
            value = fields[value_column_index].strip()
            if value:
                return value
    return None


def process_data(data_file_path, directory_to_search, output_file_path):
    """Append a looked-up ``MEAN_err`` field to every row of the input table.

    Args:
        data_file_path: Tab-separated table with one header line; field 0 of
            each row is an identifier and field 1 a position.
        directory_to_search: Directory whose ``.txt`` files are searched; a
            file is a candidate when its name contains the row's identifier.
        output_file_path: Destination file; receives the input header plus
            ``MEAN_err`` and every input row, with ``None`` written for rows
            that had no match.
    """
    with open(data_file_path, "r") as f:
        header = f.readline().strip().split("\t")
        header.append("MEAN_err")
        # The trailing None is the slot for the looked-up value.
        rows = [line.strip().split("\t") + [None] for line in f]

    # List the directory once instead of once per row; sorting makes the
    # choice deterministic when several file names contain the same id.
    if rows:
        txt_files = sorted(
            name for name in os.listdir(directory_to_search) if name.endswith(".txt")
        )
    else:
        txt_files = []

    for row in rows:
        sample_id = str(row[0])
        position = row[1]
        # A row with a single field has the None placeholder at index 1.
        if position is None:
            continue

        for filename in txt_files:
            if sample_id not in filename:
                continue
            file_path = os.path.join(directory_to_search, filename)
            result = search_position_value_and_get_column_4(file_path, int(position))
            if result is not None:
                row[-1] = result
                break

    with open(output_file_path, "w") as f:
        f.write("\t".join(header) + "\n")
        for row in rows:
            f.write("\t".join(str(x) for x in row) + "\n")

# Step 1: input table, directory of per-sample .txt files, and result path.
data_file_path = "output_modified.txt"
directory_to_search = "/homes/zihao/DATAS/TEST_1/"
output_file_path = "processed_data_1.txt"

process_data(
    data_file_path=data_file_path,
    directory_to_search=directory_to_search,
    output_file_path=output_file_path,
)

### Step 2
# Handle rows that found no corresponding file: their MEAN_err was written
# as the string "None", so turn it into NaN and drop those rows.
df = pd.read_csv("processed_data_1.txt", sep='\t')
df = df.replace("None", np.nan).dropna(subset=["MEAN_err"])

# Average MEAN_err per position and overwrite the step-1 file with the result.
df["MEAN_err"] = df["MEAN_err"].astype(float)
df_err = df.groupby(["Position"], as_index=False).mean(numeric_only=True)
df_err.to_csv("processed_data_1.txt", sep='\t', index=False)
```