# MethylScore output filtering 

In [1]:
# import packages 
import os
import awkward as ak
import shutil
import pandas as pd 
import numpy as np

## 1) Filter for clusters whose samples all belong to the same condition
Since the rows of the BED files have different numbers of columns, the files can't be imported directly as a dataframe with pandas.



I open the file and tell it to ignore the first 5 and the last 3 columns. Those are the ones that are uniform throughout the file. I am left with the clusters (numbers) and the cluster lists (where the samples are listed). Hence, I calculate the number of elements and keep only half of them (only the listed samples), which I then filter on. First, I remove the prefixes such as "1:" or "2:", and afterwards I split all samples into individual elements (split by ",").
Then I created a for loop that goes through each cluster, takes each sample (e.g. "A2") and extracts its condition (e.g. "A"). If the condition ("A") is the same for all the samples that the cluster contains, then keep_line=True and it goes on to the next cluster. If all the clusters in one row are True, the line ("row") is appended to the "matched_clusters.txt" file.

In [2]:
# Minimum number of samples that must be represented in a line to keep it.
_MIN_SAMPLES = 6
# Minimum number of distinct conditions that must be represented in a line.
_MIN_CONDITIONS = 2


def _split_cluster_fields(fields):
    """Turn fields like ``['1:a', '2:b']`` into ``{1: 'a', 2: 'b'}``."""
    parsed = {}
    for field in fields:
        parts = field.split(':')
        parsed[int(parts[0])] = parts[1]
    return parsed


def _cluster_count_if_kept(columns):
    """Return the number of clusters if the line passes every filter, else None.

    ``columns`` is one tab-split input line. The layout read here is: the
    cluster string at index 4 (one digit per clustered sample), followed by
    ``n`` ``id:methylation`` fields and then ``n`` ``id:sample,sample,...``
    fields, where ``n`` is the highest digit in the cluster string.
    """
    # Blank or truncated lines have no cluster string to inspect.
    if len(columns) < 5:
        return None

    cluster_ids = [int(char) for char in columns[4] if char.isdigit()]
    if len(cluster_ids) < _MIN_SAMPLES:
        return None
    n_clusters = max(cluster_ids)

    # An empty methylation value is treated as missing (NaN).
    methylation = {
        cluster_id: float(text) if text else np.nan
        for cluster_id, text in _split_cluster_fields(
            columns[5:5 + n_clusters]).items()
    }
    samples = _split_cluster_fields(
        columns[5 + n_clusters:5 + 2 * n_clusters])

    covered_samples = 0
    conditions = set()
    for cluster_id, listing in samples.items():
        names = listing.split(',')
        # The condition is the first character of the sample name.
        condition = names[0][:1]
        # Discard the line when a cluster mixes conditions (or has an empty
        # sample name, for which no condition can be derived).
        if not condition or not all(name.startswith(condition) for name in names):
            return None
        # Clusters without a methylation value do not count towards the
        # represented samples/conditions.
        if np.isnan(methylation.get(cluster_id, np.nan)):
            continue
        covered_samples += len(names)
        conditions.add(condition)

    if covered_samples < _MIN_SAMPLES or len(conditions) < _MIN_CONDITIONS:
        return None
    return n_clusters


def _sort_and_deduplicate(kept):
    """Order kept lines by position and keep one line per region.

    ``kept`` is a list of ``(columns, n_clusters)`` pairs. Lines are sorted by
    the first three columns and, for lines sharing those three columns, only
    the one with the most clusters is retained (the last one in input order
    on ties). Returns the surviving ``columns`` lists in sorted order.
    """
    if not kept:
        return []

    table = pd.DataFrame({
        'chrom': [columns[0] for columns, _ in kept],
        'start': [columns[1] for columns, _ in kept],
        'end': [columns[2] for columns, _ in kept],
        'n_clusters': [n_clusters for _, n_clusters in kept],
    })
    # Sort numerically where a column is entirely numeric (so "2" < "10"),
    # and fall back to text ordering otherwise (e.g. "Chr1", "X").
    for name in ('chrom', 'start', 'end'):
        try:
            table[name] = pd.to_numeric(table[name])
        except (ValueError, TypeError):
            pass

    table = table.sort_values(by=['chrom', 'start', 'end', 'n_clusters'],
                              ascending=True)
    table = table.drop_duplicates(subset=['chrom', 'start', 'end'], keep='last')
    # The default RangeIndex still maps each surviving row to its entry in kept.
    return [kept[position][0] for position in table.index]


def process_input_file(input_file_path, output_file_path=None):
    """Filter a tab-separated DMR file and write the surviving lines.

    A line is kept when
      * its cluster string (column index 4) contains at least 6 digits,
      * every cluster lists samples of a single condition (the first
        character of the sample name),
      * at least 6 samples belong to clusters with a methylation value, and
      * those clusters cover at least 2 different conditions.

    Kept lines are sorted by their first three columns; when several lines
    share those three columns only the one with the most clusters is written.
    Lines are written tab-separated, exactly as read (no padding spaces).

    Parameters
    ----------
    input_file_path : str
        Path of the input file.
    output_file_path : str, optional
        Where to write the result. When omitted, the file is written next to
        the input and named after characters 11-14 of ``input_file_path``
        (e.g. ``'6_DMRsCG/1_NvsA/DMRs.CG.bed'`` -> ``'6_DMRsCG/1_NvsA/NvsA.txt'``).
        NOTE(review): that default only yields a meaningful name for paths
        shaped like the example; pass ``output_file_path`` for anything else.

    Returns
    -------
    None
        The result is written to disk. An empty file is written when no line
        passes the filters.
    """
    kept = []
    with open(input_file_path, 'r') as file:
        for line in file:
            columns = line.strip().split('\t')
            n_clusters = _cluster_count_if_kept(columns)
            if n_clusters is not None:
                kept.append((columns, n_clusters))

    if output_file_path is None:
        output_file_path = os.path.join(os.path.dirname(input_file_path), f'{input_file_path[11:15]}.txt')

    with open(output_file_path, 'w') as outfile:
        outfile.writelines('\t'.join(columns) + '\n'
                           for columns in _sort_and_deduplicate(kept))


# DMR files to filter, one per pairwise comparison
input_files = [
    '6_DMRsCG/1_NvsA/DMRs.CG.bed',
    '6_DMRsCG/2_NvsR/DMRs.CG.bed',
    '6_DMRsCG/3_AvsR/DMRs.CG.bed',
]

# Run the filter on every comparison; each call writes its own output file
for input_file in input_files:
    process_input_file(input_file)
    

