# MethylScore output filtering 

In [1]:
# import packages 
import os
import awkward as ak
import shutil

## 1) Filter for rows in which the samples within each cluster share the same condition
Since the BED files have a different number of columns in each row, they can't be imported as a dataframe with pandas.

I read the file line by line and ignore the first 5 and the last 3 columns, which are the ones present in every row. What is left are the per-cluster numbers and the per-cluster sample lists. I therefore count the remaining columns and keep only the second half of them (the sample lists), which is what I filter on. First I remove the cluster prefix (e.g. "1:" or "2:"), then I split each list into its individual samples (separated by ",").
Then a for loop goes through each cluster and takes the condition from the first character of its first sample (e.g. "A" for sample "A2"). If every sample in the cluster has that same condition, `keep_line` stays `True` and the loop moves on to the next cluster; otherwise the row is discarded. If all clusters in a row pass, the line ("row") is appended to the output, which is written to the `1_filtered_clusters.txt` file in the same folder as the input file.

In [2]:
def process_input_file(input_file_path):
    """Keep the rows in which every cluster holds samples of one condition only.

    The first 5 and the last 3 tab-separated columns of each row are ignored.
    Of the columns in between, only the second half is inspected; each of
    those entries is expected to look like ``<cluster>:<sample>,<sample>,...``.
    A sample's condition is its first character (e.g. "A" for "A2"). Rows
    that pass are written unchanged to ``1_filtered_clusters.txt`` in the
    directory of the input file.

    Args:
        input_file_path: Path to the tab-separated input file.
    """
    kept_rows = []

    with open(input_file_path, 'r') as handle:
        for raw_row in handle:
            fields = raw_row.strip().split('\t')

            # Variable-width middle part of the row; its second half holds
            # the per-cluster sample lists.
            middle = fields[5:-3]
            half = len(middle) // 2
            sample_lists = [entry.split(':')[1].split(',') for entry in middle[-half:]]

            # A row passes when, in every cluster, each sample starts with
            # the same character as that cluster's first sample.
            if all(
                sample[0] == samples[0][0]
                for samples in sample_lists
                for sample in samples
            ):
                kept_rows.append(raw_row)

    # The result is stored next to the input file
    destination = os.path.join(os.path.dirname(input_file_path), '1_filtered_clusters.txt')
    with open(destination, 'w') as outfile:
        outfile.writelines(kept_rows)

# MethylScore DMR tables to filter, one per pairwise comparison folder
input_files = [
    '6_DMRsCG/1_NvsA/DMRs.CG.bed',
    '6_DMRsCG/2_NvsR/DMRs.CG.bed',
    '6_DMRsCG/3_AvsR/DMRs.CG.bed',
]

# Run the cluster filter on every table
for input_file in input_files:
    process_input_file(input_file)


# 2) Filter for biologically useful results
Some rows contain clusters that only hold 2 samples in total, which is not biologically relevant. I only want to keep comparisons that are supported by a minimum of 6 samples, hence the following filtering step:

In [3]:
def process_input_file(input_file_path):
    """Keep the rows whose 5th column contains at least 6 digit characters.

    The tab-separated input is read line by line. For every row the digit
    characters in the 5th column are counted (presumably one digit per sample
    that was assigned to a cluster -- confirm against the MethylScore output
    format), and rows with a count of 6 or more are written to
    ``2_filtered_min_6samples.txt`` in the directory of the input file.
    Blank lines are skipped. An empty input produces an empty output file.

    Args:
        input_file_path: Path to the tab-separated input file.

    Raises:
        ValueError: If a non-blank row has fewer than 5 columns.
    """
    # Determine the output file path based on the input file path
    output_file_path = os.path.join(os.path.dirname(input_file_path), '2_filtered_min_6samples.txt')

    kept_rows = []
    with open(input_file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            columns = line.strip().split('\t')  # Split the line into columns

            # A blank line (e.g. at the end of the file) carries no data
            if columns == ['']:
                continue

            if len(columns) < 5:
                raise ValueError(
                    f"{input_file_path}, line {line_number}: "
                    f"expected at least 5 columns, found {len(columns)}"
                )

            # Number of digit characters in the 5th column
            digit_count = sum(char.isdigit() for char in columns[4])

            # Keep rows with a digit count of 6 or higher
            if digit_count >= 6:
                kept_rows.append('\t'.join(columns) + '\n')

    # The output is only written once the whole input was read successfully
    with open(output_file_path, 'w') as outfile:
        outfile.writelines(kept_rows)

# Output of the cluster filter, one file per pairwise comparison folder
input_files = [
    '6_DMRsCG/1_NvsA/1_filtered_clusters.txt',
    '6_DMRsCG/2_NvsR/1_filtered_clusters.txt',
    '6_DMRsCG/3_AvsR/1_filtered_clusters.txt',
]

# Run the minimum-sample filter on every file
for input_file in input_files:
    process_input_file(input_file)


# 3) Filter for significantly differentially methylated regions

In [4]:
def process_input_file(input_file_path):
    """Keep the rows whose last column contains the text "CG".

    The tab-separated input is read line by line, and every row whose last
    column contains "CG" is written to ``3_sig_diff_meth.txt`` in the
    directory of the input file. Blank lines never match and are dropped.
    An input without any matching row produces an empty output file.

    Args:
        input_file_path: Path to the tab-separated input file.
    """
    # Determine the output file path based on the input file path
    output_file_path = os.path.join(os.path.dirname(input_file_path), '3_sig_diff_meth.txt')

    kept_rows = []
    with open(input_file_path, 'r') as file:
        for line in file:
            # Split the line into columns
            columns = line.strip().split('\t')

            # str.split always returns at least one element, so the last
            # column exists even for a blank line (where it is '').
            if "CG" in columns[-1]:
                kept_rows.append('\t'.join(columns) + '\n')

    # Write the filtered rows to the output file
    with open(output_file_path, 'w') as outfile:
        outfile.writelines(kept_rows)

# Output of the minimum-sample filter, one file per pairwise comparison folder
input_files = [
    '6_DMRsCG/1_NvsA/2_filtered_min_6samples.txt',
    '6_DMRsCG/2_NvsR/2_filtered_min_6samples.txt',
    '6_DMRsCG/3_AvsR/2_filtered_min_6samples.txt',
]

# Run the "CG" filter on every file
for input_file in input_files:
    process_input_file(input_file)


Create a copy of each `2_filtered_min_6samples.txt` file and name it after the folder of the condition comparison it was created for (e.g. `1_NvsA.txt`).

In [5]:

# Files to duplicate, one per pairwise comparison folder
input_files = [
    '6_DMRsCG/1_NvsA/2_filtered_min_6samples.txt',
    '6_DMRsCG/2_NvsR/2_filtered_min_6samples.txt',
    '6_DMRsCG/3_AvsR/2_filtered_min_6samples.txt',
]

for input_file in input_files:
    # Folder that contains the file, e.g. '6_DMRsCG/1_NvsA'
    folder_name = os.path.split(input_file)[0]

    # The copy is named after that folder, e.g. '1_NvsA.txt'
    new_file_name = f"{os.path.basename(folder_name)}.txt"
    destination_file_path = os.path.join(folder_name, new_file_name)

    # The copy is placed in the same folder as the original
    shutil.copyfile(input_file, destination_file_path)
