In [2]:
import pandas as pd
# Source: supplementary table
# "updated_methylation_data_illumina_nanopore_samples_controls".
# Load every worksheet of the workbook at once; sheet_name=None makes
# read_excel return a dict keyed by sheet name.
file_path = 'data/methylation_data_illumina_nanopore_samples_controls.xlsx'
all_sheets_dict = pd.read_excel(file_path, sheet_name=None, engine='openpyxl')

# Snapshot of the sheet names as they were right after loading.
sheet_names = list(all_sheets_dict)

# The first sheet only holds information about the data and its description,
# so it is removed before any analysis.
if len(sheet_names) > 0:
    all_sheets_dict.pop(sheet_names[0])




In [3]:
# Strip the two leading columns (Chr and Pos) from every sheet.
# NOTE: re-running this cell removes two more columns each time, so run it
# exactly once after loading.
for name in list(all_sheets_dict):
    all_sheets_dict[name] = all_sheets_dict[name].iloc[:, 2:]



In [4]:
# Shallow copy: a new dict that still points at the same DataFrame objects.
epi_signatures_all_with_sample = dict(all_sheets_dict)


In [20]:
# Sample names are the column labels from position 35 onwards (per the
# original note, the first 35 columns hold the Illumina data and the
# remaining ones are our own samples).
# Any disorder sheet could be used for this; 'Sotos' is used here.
sotos_columns = epi_signatures_all_with_sample['Sotos'].columns
samples = sotos_columns[35:]
n_of_samples = len(samples)

In [21]:
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score


# Train one binary (disorder vs. rest) linear SVM per signature sheet and
# score the held-out samples with it.
#
# Reads:  epi_signatures_all_with_sample (dict: sheet name -> DataFrame),
#         n_of_samples (number of trailing columns used as the test set).
# Writes: pred_df         - one row per disorder with the predicted 0/1 labels,
#         filter_df       - one row per disorder with the decision-function values,
#         order_disorders - disorder names, in the row order of both frames.
probability_df = {}  # initialised but never filled in this cell
order_disorders = []

# Rows are collected in plain lists and turned into DataFrames once after the
# loop; growing a DataFrame with pd.concat on every iteration is quadratic and
# concatenating onto an empty frame triggers a pandas FutureWarning.
prediction_rows = []
decision_rows = []

# Sheets whose name differs from the label used in their column headers.
label_for_sheet = {'MRXCJS': 'MRXSCJ'}

# weights for classes: errors on the positive (disorder) class cost 10x more
class_weights = {0: 1, 1: 10}  # Example weights, adjust as needed

for disorder in epi_signatures_all_with_sample:
    # The 'Controls' sheet does not get a classifier of its own.
    if disorder == 'Controls':
        continue

    # Transpose so that each original column becomes one row;
    # the former column labels become the row index.
    X = epi_signatures_all_with_sample[disorder].T
    y = X.index.tolist()

    # Binary target: 1 where the row label equals the disorder label, else 0.
    # NOTE(review): pandas de-duplicates repeated header names when reading
    # ('Name', 'Name.1', ...); confirm the labels here really match exactly.
    positive_label = label_for_sheet.get(disorder, str(disorder))
    new_y = [1 if item == positive_label else 0 for item in y]

    # Convert 'NA' values to NaN and fill NaN values with column averages.
    # NOTE(review): the averages are computed over ALL rows, i.e. the test
    # rows contribute to the values imputed into the training rows.
    X = X.replace('NA', np.nan)
    X = X.astype(float)
    X = X.fillna(X.mean())

    # Training set: everything except the last n_of_samples rows.
    # Test set: the last n_of_samples rows.
    X_train = X.iloc[:-n_of_samples, :]
    X_test = X.iloc[-n_of_samples:, :]

    y_train = new_y[:-n_of_samples]
    y_test = new_y[-n_of_samples:]

    # Create and train the SVM classifier for the current disorder
    linear_classifier = svm.SVC(kernel='linear', class_weight=class_weights)
    linear_classifier.fit(X_train, y_train)

    # Hard 0/1 predictions for the test rows
    linear_pred = linear_classifier.predict(X_test)
    prediction_rows.append(linear_pred)

    # Decision function values - these are used later to determine the
    # class finally assigned to each sample
    decision_values = linear_classifier.decision_function(X_test)
    decision_rows.append(decision_values)

    # Remember the order in which the disorder specific SVMs were trained
    order_disorders.append(disorder)

# One row per disorder, one column per test sample (positional labels).
pred_df = pd.DataFrame(prediction_rows)
filter_df = pd.DataFrame(decision_rows)
        


In [22]:
# Label the decision-value table: rows are the disorders (in training order),
# columns are the sample names.
filter_df = (
    filter_df
    .set_axis(order_disorders, axis=0)
    .set_axis(samples, axis=1)
)


In [23]:
# For every sample, find the disorder whose SVM produced the highest
# decision-function value (i.e. the most confident classifier) and report it.
# If even the best value stays below the threshold, the sample is reported
# as 'Control' instead.

# Minimum decision-function value required to accept a disorder call.
MIN_DECISION_VALUE = 0.35

for sample_idx in range(n_of_samples):
    # Decision values of all disorder SVMs for this one sample.
    sample_scores = filter_df.iloc[:, sample_idx]
    max_value = sample_scores.max()
    row_name = sample_scores.idxmax()

    # Print the sample name, then the maximum value and the assigned class
    print( samples[sample_idx])
    if max_value < MIN_DECISION_VALUE:
        row_name = 'Control'
    print('Max value:', max_value, 'Assigned Class:', row_name)
    print("")

EPI_02
Max value: 1.7448036456753149 Assigned Class: Kabuki

EPI_12
Max value: 2.6949779584364766 Assigned Class: BAFopathy

EPI_01
Max value: 1.9670317715684396 Assigned Class: Kabuki

EPI_04
Max value: 1.7015318023465773 Assigned Class: Sotos

EPI_09
Max value: 1.5981773952390732 Assigned Class: Sotos

EPI_11
Max value: 1.195381005727954 Assigned Class: CdLS

EPI_03
Max value: 0.8956162497033491 Assigned Class: Sotos

EPI_05
Max value: 1.6270706700424018 Assigned Class: WDSTS

EPI_07
Max value: 0.6518481493884436 Assigned Class: CdLS

EPI_17
Max value: 0.6804077585634687 Assigned Class: RSTS

EPI_08
Max value: 2.2814865114554372 Assigned Class: Kabuki

EPI_06
Max value: 0.6120176138137641 Assigned Class: WDSTS

EPI_14
Max value: 0.7713233728515383 Assigned Class: KDVS

EPI_16
Max value: 1.1395882463732487 Assigned Class: Williams

EPI_10
Max value: 0.704043421336225 Assigned Class: WDSTS

EPI_13
Max value: -0.5125794806589321 Assigned Class: Control

EPI_15
Max value: 0.8362412884543