In [None]:
import pickle

# Location of the pickle file holding a dictionary of preprocessed Illumina data.
file_path = 'data/no_strand_all_points_dict.pickle'

# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open(file_path, mode='rb') as pickle_handle:
    epi_signatures_all = pickle.load(pickle_handle)

print(epi_signatures_all)

In [None]:
import os
import pandas as pd

def read_bed_files(input_path):
    """Load every ``.bed`` file of a folder into a dictionary of DataFrames.

    Files are visited in sorted name order. Each file is parsed as a
    tab-separated table without a header row, and only the columns at
    positions 0, 1, 2 and 13 are kept, renamed to ``Chr``, ``Start``, ``End``
    and ``Methylation``.

    Parameters
    ----------
    input_path : str
        Folder that is scanned (non-recursively) for names ending in ``.bed``.

    Returns
    -------
    dict
        Maps each file name without its extension to the corresponding
        four-column DataFrame.
    """
    # Positional column index in the bed file -> name used in the notebook.
    column_names = {0: 'Chr', 1: 'Start', 2: 'End', 13: 'Methylation'}
    tables_by_basename = {}

    for entry in sorted(os.listdir(input_path)):
        # Anything that is not a .bed file is ignored.
        if not entry.endswith('.bed'):
            continue

        raw_table = pd.read_csv(os.path.join(input_path, entry), sep='\t', header=None)
        selected = raw_table[list(column_names)].reset_index(drop=True)

        basename = os.path.splitext(entry)[0]
        tables_by_basename[basename] = selected.rename(columns=column_names)

    return tables_by_basename



    

In [None]:
# Folder that holds the .bed files; this is a placeholder and must be replaced with the real location.
input_folder_path = 'path/to/folder/with/beds'
# Dictionary mapping each .bed file's basename to its table, as returned by read_bed_files.
result = read_bed_files(input_folder_path)

In [None]:
import numpy as np

# Clean the 'Methylation' column of every loaded bed table in place:
# '.' placeholders and any other non-numeric entries become NaN, and the
# remaining values are divided by 100.
# NOTE: this cell is not idempotent - every additional run divides by 100
# again, so reload the bed files before executing it a second time.
for bed_name, bed_table in result.items():
    methylation_values = pd.to_numeric(
        bed_table['Methylation'].replace('.', np.nan),
        errors='coerce',
    )
    # The table object is shared with `result`, so assigning the column is enough.
    bed_table['Methylation'] = methylation_values / 100

In [None]:
# Attach the nanopore methylation values to the signature table of the matching disorder.
epi_signatures_all_with_sample = epi_signatures_all.copy()

# Store the 'MRXSCJ' signature under the key 'MRXCJS' instead
# (presumably to match the spelling used in the bed file names - TODO confirm).
epi_signatures_all_with_sample['MRXCJS'] = epi_signatures_all_with_sample.pop('MRXSCJ')

for disorder in epi_signatures_all_with_sample:
    # Drop the first two columns and the last column of the signature table.
    signature_table = epi_signatures_all_with_sample[disorder].iloc[:, 2:-1]

    for file_basename in result:
        print(file_basename)
        # A bed file belongs to a disorder when its basename starts with the disorder key.
        # NOTE(review): a plain prefix test also matches keys that are prefixes of
        # other keys - verify against the actual file naming scheme.
        if not file_basename.startswith(disorder):
            continue
        # Append the sample's methylation values as a new column named after the file.
        sample_column = {file_basename: result[file_basename]['Methylation'].tolist()}
        signature_table = signature_table.assign(**sample_column)

    epi_signatures_all_with_sample[disorder] = signature_table
            

In [None]:

# Columns from position 35 onwards of the 'Sotos' table are the samples that were read in
# (assumes the first 35 columns are the reference data - TODO confirm for other inputs).
sample_columns = epi_signatures_all_with_sample['Sotos'].columns[35:]

# Number of samples that were read.
n_of_samples = len(sample_columns)

# Strip the 'Sotos_' marker so that only the sample names remain.
samples = [column.replace('Sotos_', '') for column in sample_columns]


In [None]:
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score


# Train one linear SVM per disorder (disorder vs. everything else) and apply it
# to the trailing `n_of_samples` rows of each transposed signature table.
#
# Results:
#   pred_df         - one row of predicted labels per disorder
#   filter_df       - one row of decision-function values per disorder
#   order_disorders - disorder names in the same row order as the two tables

# With zero samples, `iloc[:-0]` would select an empty training set and
# `iloc[-0:]` would select every row as test data, so fail early and clearly.
if n_of_samples <= 0:
    raise ValueError('n_of_samples must be positive: no samples to classify.')

# Weights for the classes (loop-invariant, so defined once).
class_weights = {0: 1, 1: 10}  # Example weights, adjust as needed

# Dictionary keys whose signature columns carry a different label.
label_overrides = {'MRXCJS': 'MRXSCJ'}

# Rows are collected in plain lists and turned into DataFrames once after the
# loop, instead of growing a DataFrame with pd.concat on every iteration.
prediction_rows = []
decision_rows = []
order_disorders = []

for disorder in epi_signatures_all_with_sample:
    if disorder == 'Controls':
        continue

    # After transposing, every row is one sample and the row label is its class name.
    X = epi_signatures_all_with_sample[disorder].T
    y = X.index.tolist()

    # Binary target: 1 for rows labelled with this disorder, 0 for everything else.
    positive_label = label_overrides.get(disorder, str(disorder))
    new_y = [1 if item == positive_label else 0 for item in y]

    # Convert 'NA' values to NaN and fill NaN values with column averages.
    # NOTE(review): the averages are computed over all rows, including the
    # rows that are later used for testing - confirm this is intended.
    X = X.replace('NA', np.nan)
    X = X.astype(float)
    X = X.fillna(X.mean())

    # Training uses every row except the last `n_of_samples` (the Illumina data);
    # testing uses the last `n_of_samples` rows (the nanopore samples).
    X_train = X.iloc[:-n_of_samples, :]
    X_test = X.iloc[-n_of_samples:, :]

    y_train = new_y[:-n_of_samples]
    y_test = new_y[-n_of_samples:]  # not used below; kept so the name stays available

    # Create and train the classifier for the current disorder.
    linear_classifier = svm.SVC(kernel='linear', class_weight=class_weights)
    linear_classifier.fit(X_train, y_train)

    # Predicted labels for the test samples.
    linear_pred = linear_classifier.predict(X_test)
    # Decision-function values; these are used afterwards to pick each sample's class.
    decision_values = linear_classifier.decision_function(X_test)

    prediction_rows.append(linear_pred)
    decision_rows.append(decision_values)
    # Remember the order in which the disorder-specific SVMs were trained and tested.
    order_disorders.append(disorder)

pred_df = pd.DataFrame(prediction_rows)
filter_df = pd.DataFrame(decision_rows)
        


In [None]:
# Label the decision-value table in place: rows get the disorder names in the order
# their SVMs were run, columns get the sample names.
filter_df.index = order_disorders
filter_df.columns = samples


In [None]:
# Decision-function values below this cut-off are reported as 'Control'.
control_cutoff = 0.35

# For every sample, find the disorder whose SVM gave the highest decision-function
# value (the highest confidence score) and print the resulting class.
for sample in range(n_of_samples):
    sample_scores = filter_df.iloc[:, sample]
    max_value = sample_scores.max()
    best_disorder = sample_scores.idxmax()

    # Any value below the cut-off (which includes very negative values such as -1000)
    # means no disorder-specific SVM was confident enough.
    row_name = 'Control' if max_value < control_cutoff else best_disorder

    print(samples[sample])
    print('Max value:', max_value, 'Assigned Class:', row_name)
    print("")