In [None]:
# Machine learning classification of m6A-modified vs. unmodified RNA from Nanopore current signals

In [13]:
import numpy as np
import pandas as pd
import pickle
import h5py
import random
from tombo import tombo_helper
from tqdm.notebook import tqdm

In [2]:
# Directories holding the fast5 files of each sample, keyed by sample name.
# 'unm' and 'm6A' live in the same sequencing-run folder; 'vivo' and 'unm long'
# come from two other runs under the MinION data root.
_run_190926 = '/Volumes/DATA_3/190926_DrKuroyanagi_sams_345/190926_DrKuroyanagi_sams_345/20190926_0543_MN17290_FAK89523_0ee12f5a'
_minion_root = '/Volumes/DATA_2/sams/MinION'

fast5s = {
    'unm': _run_190926 + '/fast5_pass_single_unm',
    'm6A': _run_190926 + '/fast5_pass_single_m6A',
    'vivo': _minion_root + '/20190326_0351_190326_DrKuroyanagi_w12457/RAW/fast5',
    'unm long': _minion_root + '/20190326_0350_190326_DrKuroyanagi_sams3_sams4/RAW/fast5_filtered',
}

# Number of positions kept on each side of a target site
# (the extracted window therefore spans 2 * region + 1 positions).
region = 50

In [3]:
# m6A sites: target position on each transcript, keyed by transcript name
plotSites = {
    'sams-3b': 1262,
    'sams-3c': 1538,
    'sams-4b': 1265,
    'sams-4c': 1388,
    'sams-4d': 1545,
    'sams-5b': 1315,
}

# tombo groups: name of the corrected group to read for each sample.
# Two group names are in use, each shared by two samples.
_group_3b5 = 'RawGenomeCorrected_sams3b5'
_group_3abc4bdef5 = 'RawGenomeCorrected_sams3abc4bdef5'
tombo_groups = {
    'unm': _group_3b5,
    'm6A': _group_3b5,
    'vivo': _group_3abc4bdef5,
    'unm long': _group_3abc4bdef5,
}


In [15]:
# extract current data
#
# For every sample and every target site, build one feature row per read that
# spans the whole window around the site. A row is the concatenation of the
# 'norm_mean', 'norm_stdev' and 'length' event fields over the window. Rows are
# labelled with the transcript ('sams') and the sample ('RNA'), and everything
# is stacked into the `current` DataFrame.

# per-(sample, transcript) frames; concatenated once at the end
frames = []
for sample_name, fast5_path in fast5s.items():

    group = tombo_groups[sample_name]
    # HDF5 path of the event table under this sample's corrected group
    events_key = 'Analyses/' + group + '/BaseCalled_template/Events'

    # load tombo-annotated reads
    tmb = tombo_helper.TomboReads([fast5_path], corrected_group=group)

    # each transcript
    for transcript, position in plotSites.items():

        # print now
        print(transcript + ' in ' + sample_name)

        # get reads on a sams gene
        reads = tmb.get_cs_reads(chrm=transcript, strand='+')

        # shuffle reads
        random.shuffle(reads)

        # each read
        rows = []
        for read in tqdm(reads, desc='reads', leave=False):

            # target region, as offsets from the start of the read
            plotStart = position - read.start - region - 1
            plotEnd = position - read.start + region

            # skip reads that do not cover the whole window
            if plotStart < 0 or read.end < (position + region):
                continue

            # Read the event table once and close the file right away; the
            # loop visits a very large number of files, so handles must not
            # be left open.
            with h5py.File(read.fn, 'r') as f5:
                events = f5[events_key][()]

            window = events[plotStart:plotEnd]
            rows.append(np.concatenate([window['norm_mean'],
                                        window['norm_stdev'],
                                        window['length']]))

        # nothing usable for this transcript in this sample
        if not rows:
            continue

        site_frame = pd.DataFrame(rows)

        # label transcript and sample
        site_frame['sams'] = transcript
        site_frame['RNA'] = sample_name

        frames.append(site_frame)

# Single concatenation (row indexes are kept as they are in each frame).
current = pd.concat(frames) if frames else pd.DataFrame()

[07:00:38] Parsing Tombo index file(s).


sams-3b in unm


HBox(children=(IntProgress(value=0, description='reads', max=7954, style=ProgressStyle(description_width='init…

sams-3c in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4b in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4c in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4d in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-5b in unm


HBox(children=(IntProgress(value=0, description='reads', max=13736, style=ProgressStyle(description_width='ini…

[07:03:23] Parsing Tombo index file(s).


sams-3b in m6A


HBox(children=(IntProgress(value=0, description='reads', max=3295, style=ProgressStyle(description_width='init…

sams-3c in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4b in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4c in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4d in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-5b in m6A


HBox(children=(IntProgress(value=0, description='reads', max=5376, style=ProgressStyle(description_width='init…

[07:05:09] Parsing Tombo index file(s).


sams-3b in vivo


HBox(children=(IntProgress(value=0, description='reads', max=110, style=ProgressStyle(description_width='initi…

sams-3c in vivo


HBox(children=(IntProgress(value=0, description='reads', max=29, style=ProgressStyle(description_width='initia…

sams-4b in vivo


HBox(children=(IntProgress(value=0, description='reads', max=49, style=ProgressStyle(description_width='initia…

sams-4c in vivo


HBox(children=(IntProgress(value=0, description='reads', max=58, style=ProgressStyle(description_width='initia…

sams-4d in vivo


HBox(children=(IntProgress(value=0, description='reads', max=32, style=ProgressStyle(description_width='initia…

sams-5b in vivo


HBox(children=(IntProgress(value=0, description='reads', max=18, style=ProgressStyle(description_width='initia…

[07:05:26] Parsing Tombo index file(s).


sams-3b in unm long


HBox(children=(IntProgress(value=0, description='reads', max=128267, style=ProgressStyle(description_width='in…

sams-3c in unm long


HBox(children=(IntProgress(value=0, description='reads', max=58852, style=ProgressStyle(description_width='ini…

sams-4b in unm long


HBox(children=(IntProgress(value=0, description='reads', max=53515, style=ProgressStyle(description_width='ini…

sams-4c in unm long


HBox(children=(IntProgress(value=0, description='reads', max=53504, style=ProgressStyle(description_width='ini…

sams-4d in unm long


HBox(children=(IntProgress(value=0, description='reads', max=59187, style=ProgressStyle(description_width='ini…

sams-5b in unm long


HBox(children=(IntProgress(value=0, description='reads', max=5, style=ProgressStyle(description_width='initial…

In [16]:
# Persist the extracted feature table as a pickle and report its dimensions.
pickle_path = 'fast5_current_m6A_sams-345_100nt.pickle'
with open(pickle_path, mode='wb') as handle:
    pickle.dump(current, handle)
print(current.shape)

(373772, 305)


In [None]:
# Also export the feature table as CSV.
current.to_csv(path_or_buf='fast5_current_m6A_sams-345_100nt.csv')

# Load Nanopore data

In [1]:
# packages
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Tuning by TPE
from hyperopt import hp, tpe, Trials, fmin, space_eval

In [2]:
# Reload the pickled feature table and report its dimensions.
pickle_path = 'fast5_current_m6A_sams-345_100nt.pickle'
with open(pickle_path, mode='rb') as handle:
    current = pickle.load(handle)
print(current.shape)

(373772, 305)


In [3]:
# data set for sams-3b+5b
# set range: positional indexes of the 303 feature columns used as model input
length = range(0,303)

# keep the two training transcripts and the two labelled samples
is_site = current['sams'].isin(['sams-3b', 'sams-5b'])
is_labelled = current['RNA'].isin(['unm', 'm6A'])
df = current[is_site & is_labelled]

# unm or m6A: 0 = unm, 1 = m6A (only these two values remain after the filter)
Y = (df['RNA'] == 'm6A').astype(int).values

# current
X = df.iloc[:,length].values

# training set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

# downsampling: balance the training classes by subsampling the larger one.
# A dedicated seeded generator keeps the training set reproducible, in line
# with the seeded split above, without touching numpy's global random state.
rng = np.random.RandomState(0)
X_train_m6A = X_train[Y_train == 1]
X_train_unm = X_train[Y_train == 0]
n_per_class = min(len(X_train_unm), len(X_train_m6A))
if len(X_train_unm) > n_per_class:
    X_train_unm = X_train_unm[rng.choice(len(X_train_unm), n_per_class, replace=False)]
if len(X_train_m6A) > n_per_class:
    X_train_m6A = X_train_m6A[rng.choice(len(X_train_m6A), n_per_class, replace=False)]
X_train = np.concatenate([X_train_unm, X_train_m6A])
# labels follow the concatenation order: unm rows first, then m6A rows
Y_train = np.concatenate([np.zeros(len(X_train_unm), dtype=int), np.ones(len(X_train_m6A), dtype=int)])

# size
X_test.shape

(5443, 303)

In [4]:
# data set for unm long
X_unmlong3b = current[(current['sams'] == 'sams-3b') & (current['RNA'] == 'unm long')].iloc[:,length].values
X_unmlong3c = current[(current['sams'] == 'sams-3c') & (current['RNA'] == 'unm long')].iloc[:,length].values
X_unmlong4b = current[(current['sams'] == 'sams-4b') & (current['RNA'] == 'unm long')].iloc[:,length].values
X_unmlong4c = current[(current['sams'] == 'sams-4c') & (current['RNA'] == 'unm long')].iloc[:,length].values
X_unmlong4d = current[(current['sams'] == 'sams-4d') & (current['RNA'] == 'unm long')].iloc[:,length].values

In [5]:
# data set for vivo
X_vivo3b = current[(current['sams'] == 'sams-3b') & (current['RNA'] == 'vivo')].iloc[:,length].values
X_vivo3c = current[(current['sams'] == 'sams-3c') & (current['RNA'] == 'vivo')].iloc[:,length].values
X_vivo4b = current[(current['sams'] == 'sams-4b') & (current['RNA'] == 'vivo')].iloc[:,length].values
X_vivo4c = current[(current['sams'] == 'sams-4c') & (current['RNA'] == 'vivo')].iloc[:,length].values
X_vivo4d = current[(current['sams'] == 'sams-4d') & (current['RNA'] == 'vivo')].iloc[:,length].values
X_vivo5b = current[(current['sams'] == 'sams-5b') & (current['RNA'] == 'vivo')].iloc[:,length].values

# Load packages

In [13]:
import mkl
# Ask MKL to use 10 threads; the value the call returns is echoed as this cell's output.
mkl.set_num_threads(10)

1

In [6]:
# load models
# Decision tree
from sklearn.tree import DecisionTreeClassifier
# Random forest
from sklearn.ensemble import RandomForestClassifier
# Logistic regression
from sklearn.linear_model import LogisticRegression
# KNN
from sklearn.neighbors import KNeighborsClassifier
# SVC
from sklearn.svm import SVC
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# GaussianNB
from sklearn.naive_bayes import GaussianNB
# LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# MLPClassifier
from sklearn.neural_network import MLPClassifier
# XGBoost
import xgboost as xgb
# LightGBM
import lightgbm as lgbm

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Compare classifiers by tuned parameters

In [7]:
# Gradient boosting: classifier to tune and its hyperopt search space
name = 'GradientBoostingClassifier'
classifier = GradientBoostingClassifier
params = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 1),
    max_depth=hp.choice('max_depth', range(1, 20)),
    min_samples_leaf=hp.choice('min_samples_leaf', range(1, 20)),
    max_features=hp.uniform('max_features', 0.01, 1),
)

In [9]:
# XGBoost: classifier to tune and its hyperopt search space
name = 'XGBoost'
classifier = xgb.XGBClassifier
params = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 1),
    max_depth=hp.choice('max_depth', np.arange(1, 20, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 10, dtype=int)),
    colsample_bytree=hp.uniform('colsample_bytree', 0.2, 1),
    subsample=hp.uniform('subsample', 0.2, 1),
    # fixed, not searched
    n_estimators=100,
)

In [18]:
# LightGBM: classifier to tune and its hyperopt search space
name = 'LightGBM'
classifier = lgbm.LGBMClassifier
params = {
            'learning_rate':    hp.uniform('learning_rate', 0.01, 1),
            'max_depth':        hp.choice('max_depth', np.arange(1, 20, 1, dtype=int)),
            'min_child_weight': hp.choice('min_child_weight', np.arange(1, 10, 1, dtype=int)),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1),
            'subsample':        hp.uniform('subsample', 0.2, 1),
            # LightGBM only applies `subsample` (bagging) when the bagging
            # frequency is non-zero; with the default of 0 the searched
            # `subsample` value has no effect on the model.
            'subsample_freq':   1,
            # fixed, not searched
            'n_estimators':     100
}

In [14]:
# Decision tree: classifier to tune and its hyperopt search space
name = 'DecisionTree'
classifier = DecisionTreeClassifier
params = dict(
    max_depth=hp.choice('max_depth', np.arange(1, 20, dtype=int)),
    max_features=hp.choice('max_features', np.arange(1, 20, dtype=int)),
    min_samples_split=hp.choice('min_samples_split', np.arange(2, 20, dtype=int)),
    min_samples_leaf=hp.choice('min_samples_leaf', np.arange(1, 20, dtype=int)),
)

In [16]:
# Random forest: classifier to tune and its hyperopt search space
name = 'RandomForest'
classifier = RandomForestClassifier
params = dict(
    max_depth=hp.choice('max_depth', np.arange(1, 20, dtype=int)),
    max_features=hp.choice('max_features', np.arange(1, 20, dtype=int)),
    min_samples_split=hp.choice('min_samples_split', np.arange(2, 20, dtype=int)),
    min_samples_leaf=hp.choice('min_samples_leaf', np.arange(1, 20, dtype=int)),
)

In [19]:
# fitting parameters
# Tune `classifier` over `params` with TPE, refit the best configuration on the
# full training set, save the model and write a three-line summary
# (Name / Size / Score) to '<base_name>.txt'.

# file name
base_name = 'Hyperopt_' + name + '_m6A_Nanopore_current_100nt_sams-3b5b'

# Hold out part of the training data for model selection. The test set is
# used only for the final report, so the reported test accuracy is not biased
# by having been the tuning target.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=0)

# function to minimize
def objective(args):
    """Return minus the validation accuracy of ``classifier(**args)`` fitted on the tuning split."""
    model = classifier(**args)
    model.fit(X_fit, Y_fit)
    return -1 * model.score(X_val, Y_val)

# save steps
trials = Trials()

# tuning
best = fmin(
    objective,
    params,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=1
)


# best params, refit on the whole training set
clf = classifier(**space_eval(params, best))
clf.fit(X_train, Y_train)

# save model
with open(base_name + '.pickle', 'wb') as f:
    pickle.dump(clf, f)


# training set
accuracyTrain = clf.score(X_train, Y_train)

# test set: overall accuracy, then the fraction predicted as m6A within each true class
accuracyTest = clf.score(X_test, Y_test)

predictTestUnm = clf.predict(X_test[Y_test == 0])
scoreTestUnm = np.count_nonzero(predictTestUnm == 1) / len(predictTestUnm)

predictTestm6A = clf.predict(X_test[Y_test == 1])
scoreTestm6A = np.count_nonzero(predictTestm6A == 1) / len(predictTestm6A)

# unmodified long and vivo: fraction of reads predicted as m6A (non-zero class).
# Keys double as column labels in the output file and fix the column order.
eval_sets = {
    'scoreUnmlong3b': X_unmlong3b,
    'scoreUnmlong3c': X_unmlong3c,
    'scoreUnmlong4b': X_unmlong4b,
    'scoreUnmlong4c': X_unmlong4c,
    'scoreUnmlong4d': X_unmlong4d,
    'scoreVivo3b': X_vivo3b,
    'scoreVivo3c': X_vivo3c,
    'scoreVivo4b': X_vivo4b,
    'scoreVivo4c': X_vivo4c,
    'scoreVivo4d': X_vivo4d,
    'scoreVivo5b': X_vivo5b,
}
# an empty set has no defined fraction; report nan instead of failing
eval_scores = [
    np.count_nonzero(clf.predict(X_eval)) / len(X_eval) if len(X_eval) else float('nan')
    for X_eval in eval_sets.values()
]

labels = ['accuracyTrain', 'accuracyTest', 'scoreTestUnm', 'scoreTestm6A'] + list(eval_sets)
sizes = [len(X_train), len(X_test), len(predictTestUnm), len(predictTestm6A)] \
        + [len(X_eval) for X_eval in eval_sets.values()]
scores = [accuracyTrain, accuracyTest, scoreTestUnm, scoreTestm6A] + eval_scores

# write results
# The file is opened only after every score has been computed, so a failure
# during scoring cannot leave an empty or truncated report behind.
out_path = base_name + '.txt'
with open(out_path, mode='w') as f:
    f.write('\n'.join([
        'Name,' + ','.join(labels),
        'Size,' + ','.join(str(value) for value in sizes),
        'Score,' + ','.join(str(value) for value in scores),
    ]))

# importance from Gradient boosting model (or any model exposing feature importances)
if hasattr(clf, 'feature_importances_'):
    np.savetxt('Importance_' + base_name + '.csv', clf.feature_importances_, delimiter=',')


# Compare classifiers by tuned parameters and scaled features

In [20]:
# scaling
# The scaler is fitted on the training features only; the same fitted transform
# is then applied to the test set and to every evaluation set. Each array is
# replaced by its scaled version under the same name.
stdsc = StandardScaler()
X_train = stdsc.fit_transform(X_train)
X_test = stdsc.transform(X_test)

X_unmlong3b, X_unmlong3c, X_unmlong4b, X_unmlong4c, X_unmlong4d = (
    stdsc.transform(features)
    for features in (X_unmlong3b, X_unmlong3c, X_unmlong4b, X_unmlong4c, X_unmlong4d)
)

X_vivo3b, X_vivo3c, X_vivo4b, X_vivo4c, X_vivo4d, X_vivo5b = (
    stdsc.transform(features)
    for features in (X_vivo3b, X_vivo3c, X_vivo4b, X_vivo4c, X_vivo4d, X_vivo5b)
)

In [21]:
# SVM (run on the scaled features): classifier to tune and its hyperopt search space
name = 'SVM'
classifier = SVC
params = dict(
    # loguniform bounds are given in log space
    C=hp.loguniform('C', -6, 2),
    gamma=hp.loguniform('gamma', -6, 2),
    kernel=hp.choice('kernel', ['linear', 'rbf', 'poly']),
    # fixed, not searched
    cache_size=10000,
)

In [23]:
# Logistic regression (run on the scaled features): classifier to tune and its hyperopt search space
name = 'LogisticRegression'
classifier = LogisticRegression
params = dict(
    C=hp.uniform('C', 0.00001, 1000),
    random_state=hp.choice('random_state', np.arange(1, 100, dtype=int)),
)

In [25]:
# KNN (run on the scaled features): classifier to tune and its hyperopt search space
name = 'KNeighbors'
classifier = KNeighborsClassifier
params = dict(
    weights=hp.choice('weights', ['uniform', 'distance']),
    leaf_size=hp.choice('leaf_size', np.arange(5, 50, 5, dtype=int)),
    n_neighbors=hp.choice('n_neighbors', np.arange(1, 30, dtype=int)),
    # p takes the values 1 and 2
    p=hp.choice('p', np.arange(1, 3, dtype=int)),
)

In [27]:
# MLPClassifier (run on the scaled features): classifier to tune and its hyperopt search space
name = 'MLP'
classifier = MLPClassifier
params = dict(
    alpha=hp.loguniform('alpha', np.log(0.0001), np.log(0.9)),
    hidden_layer_sizes=hp.choice('hidden_layer_sizes', np.arange(100, 1000, 50, dtype=int)),
    # NOTE(review): scikit-learn documents `learning_rate` as only used with
    # solver='sgd'; confirm whether searching it alongside 'adam' is intended.
    learning_rate=hp.choice('learning_rate', ['constant', 'adaptive']),
    # fixed, not searched
    activation='relu',
    solver='adam',
)
            

In [28]:
# fitting parameters
# Tune `classifier` over `params` with TPE, refit the best configuration on the
# full training set, save the model and write a three-line summary
# (Name / Size / Score) to '<base_name>.txt'.

# file name
base_name = 'Hyperopt_' + name + '_m6A_Nanopore_current_100nt_sams-3b5b'

# Hold out part of the training data for model selection. The test set is
# used only for the final report, so the reported test accuracy is not biased
# by having been the tuning target.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=0)

# function to minimize
def objective(args):
    """Return minus the validation accuracy of ``classifier(**args)`` fitted on the tuning split."""
    model = classifier(**args)
    model.fit(X_fit, Y_fit)
    return -1 * model.score(X_val, Y_val)

# save steps
trials = Trials()

# tuning
best = fmin(
    objective,
    params,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=1
)


# best params, refit on the whole training set
clf = classifier(**space_eval(params, best))
clf.fit(X_train, Y_train)

# save model
with open(base_name + '.pickle', 'wb') as f:
    pickle.dump(clf, f)


# training set
accuracyTrain = clf.score(X_train, Y_train)

# test set: overall accuracy, then the fraction predicted as m6A within each true class
accuracyTest = clf.score(X_test, Y_test)

predictTestUnm = clf.predict(X_test[Y_test == 0])
scoreTestUnm = np.count_nonzero(predictTestUnm == 1) / len(predictTestUnm)

predictTestm6A = clf.predict(X_test[Y_test == 1])
scoreTestm6A = np.count_nonzero(predictTestm6A == 1) / len(predictTestm6A)

# unmodified long and vivo: fraction of reads predicted as m6A (non-zero class).
# Keys double as column labels in the output file and fix the column order.
eval_sets = {
    'scoreUnmlong3b': X_unmlong3b,
    'scoreUnmlong3c': X_unmlong3c,
    'scoreUnmlong4b': X_unmlong4b,
    'scoreUnmlong4c': X_unmlong4c,
    'scoreUnmlong4d': X_unmlong4d,
    'scoreVivo3b': X_vivo3b,
    'scoreVivo3c': X_vivo3c,
    'scoreVivo4b': X_vivo4b,
    'scoreVivo4c': X_vivo4c,
    'scoreVivo4d': X_vivo4d,
    'scoreVivo5b': X_vivo5b,
}
# an empty set has no defined fraction; report nan instead of failing
eval_scores = [
    np.count_nonzero(clf.predict(X_eval)) / len(X_eval) if len(X_eval) else float('nan')
    for X_eval in eval_sets.values()
]

labels = ['accuracyTrain', 'accuracyTest', 'scoreTestUnm', 'scoreTestm6A'] + list(eval_sets)
sizes = [len(X_train), len(X_test), len(predictTestUnm), len(predictTestm6A)] \
        + [len(X_eval) for X_eval in eval_sets.values()]
scores = [accuracyTrain, accuracyTest, scoreTestUnm, scoreTestm6A] + eval_scores

# write results
# The file is opened only after every score has been computed, so a failure
# during scoring cannot leave an empty or truncated report behind.
out_path = base_name + '.txt'
with open(out_path, mode='w') as f:
    f.write('\n'.join([
        'Name,' + ','.join(labels),
        'Size,' + ','.join(str(value) for value in sizes),
        'Score,' + ','.join(str(value) for value in scores),
    ]))

# importance from Gradient boosting model (or any model exposing feature importances)
if hasattr(clf, 'feature_importances_'):
    np.savetxt('Importance_' + base_name + '.csv', clf.feature_importances_, delimiter=',')


# Compare classifiers by default parameters and scaled features

In [29]:
# set classifiers
# Each display name is paired with a default-parameter estimator; the two
# parallel lists `names` and `classifiers` are derived from these pairs.
_default_models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('K-Nearest Neighbor', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('Adaptive Boosting', AdaBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('Multilayer Perceptron', MLPClassifier()),
    ('XGBoost', xgb.XGBClassifier()),
    ('LightGBM', lgbm.LGBMClassifier()),
]

names = [display_name for display_name, _ in _default_models]
classifiers = [estimator for _, estimator in _default_models]

In [30]:
# compare classifiers
# Fit every default-parameter classifier on the training set and write one CSV
# line per classifier: name, train accuracy, test accuracy, fraction predicted
# as m6A within each true test class, then the fraction predicted as m6A in
# each 'unm long' and 'vivo' set.
out_path = 'sklearn_compareClassifiers_m6A_Nanopore_current_sams-3b5b_100nt.txt'

# evaluation sets, in output column order
eval_sets = (
    X_unmlong3b, X_unmlong3c, X_unmlong4b, X_unmlong4c, X_unmlong4d,
    X_vivo3b, X_vivo3c, X_vivo4b, X_vivo4c, X_vivo4d, X_vivo5b,
)

with open(out_path, mode='w') as f:

    # loop classifiers
    for name, clf in zip(names, classifiers):

        # fitting
        clf = clf.fit(X_train, Y_train)

        # training set, then test set accuracy
        row = [name, clf.score(X_train, Y_train), clf.score(X_test, Y_test)]

        # test set split by true class (unm first, then m6A)
        for true_label in (0, 1):
            predicted = clf.predict(X_test[Y_test == true_label])
            row.append(np.count_nonzero(predicted == 1) / len(predicted))

        # unmodified long, then vivo
        for X_eval in eval_sets:
            row.append(np.count_nonzero(clf.predict(X_eval)) / len(X_eval))

        # write results
        f.write(','.join(str(value) for value in row) + '\n')

        # tracking
        print(name + " done")

Decision Tree done




Random Forest done




Logistic Regression done
K-Nearest Neighbor done
SVM done
Adaptive Boosting done
Gradient Boosting done
Gaussian Naive Bayes done
Linear Discriminant Analysis done
Quadratic Discriminant Analysis done
Multilayer Perceptron done
XGBoost done
LightGBM done


In [None]:
# Downsampling

In [40]:
# Gradient boosting: classifier and hyperopt search space used by the downsampling run
name = 'GradientBoostingClassifier'
classifier = GradientBoostingClassifier
params = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 1),
    max_depth=hp.choice('max_depth', range(1, 20)),
    min_samples_leaf=hp.choice('min_samples_leaf', range(1, 20)),
    max_features=hp.uniform('max_features', 0.01, 1),
)

In [41]:
# downsampling
# Balance the training classes, tune `classifier` over `params` with TPE on the
# balanced data, refit the best configuration, save the model and write a
# three-line summary (Name / Size / Score) to '<base_name>.txt'.

# Dedicated file name for this run, so it does not reuse (and overwrite the
# files of) whatever `base_name` was left over from a previously run cell.
base_name = 'Hyperopt_' + name + '_m6A_Nanopore_current_100nt_sams-3b5b_downsampled'

# Subsample the larger class down to the size of the smaller one. The result
# goes into X_train_down / Y_train_down (the names the rest of this cell
# uses); X_train / Y_train are left untouched so the cell can be re-run.
rng = np.random.RandomState(0)
X_train_m6A = X_train[Y_train == 1]
X_train_unm = X_train[Y_train == 0]
n_per_class = min(len(X_train_unm), len(X_train_m6A))
if len(X_train_unm) > n_per_class:
    X_train_unm = X_train_unm[rng.choice(len(X_train_unm), n_per_class, replace=False)]
if len(X_train_m6A) > n_per_class:
    X_train_m6A = X_train_m6A[rng.choice(len(X_train_m6A), n_per_class, replace=False)]
X_train_down = np.concatenate([X_train_unm, X_train_m6A])
# labels follow the concatenation order: unm rows first, then m6A rows
Y_train_down = np.concatenate([np.zeros(len(X_train_unm), dtype=int), np.ones(len(X_train_m6A), dtype=int)])

# Hold out part of the balanced training data for model selection. The test
# set is used only for the final report, so the reported test accuracy is not
# biased by having been the tuning target.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train_down, Y_train_down, test_size=0.2, stratify=Y_train_down, random_state=0)

# function to minimize
def objective(args):
    """Return minus the validation accuracy of ``classifier(**args)`` fitted on the tuning split."""
    model = classifier(**args)
    model.fit(X_fit, Y_fit)
    return -1 * model.score(X_val, Y_val)

# save steps
trials = Trials()

# tuning
best = fmin(
    objective,
    params,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    verbose=1
)


# best params, refit on the whole balanced training set
clf = classifier(**space_eval(params, best))
clf.fit(X_train_down, Y_train_down)

# save model
with open(base_name + '.pickle', 'wb') as f:
    pickle.dump(clf, f)


# training set
accuracyTrain = clf.score(X_train_down, Y_train_down)

# test set: overall accuracy, then the fraction predicted as m6A within each true class
accuracyTest = clf.score(X_test, Y_test)

predictTestUnm = clf.predict(X_test[Y_test == 0])
scoreTestUnm = np.count_nonzero(predictTestUnm == 1) / len(predictTestUnm)

predictTestm6A = clf.predict(X_test[Y_test == 1])
scoreTestm6A = np.count_nonzero(predictTestm6A == 1) / len(predictTestm6A)

# unmodified long and vivo: fraction of reads predicted as m6A (non-zero class).
# Keys double as column labels in the output file and fix the column order.
eval_sets = {
    'scoreUnmlong3b': X_unmlong3b,
    'scoreUnmlong3c': X_unmlong3c,
    'scoreUnmlong4b': X_unmlong4b,
    'scoreUnmlong4c': X_unmlong4c,
    'scoreUnmlong4d': X_unmlong4d,
    'scoreVivo3b': X_vivo3b,
    'scoreVivo3c': X_vivo3c,
    'scoreVivo4b': X_vivo4b,
    'scoreVivo4c': X_vivo4c,
    'scoreVivo4d': X_vivo4d,
    'scoreVivo5b': X_vivo5b,
}
# an empty set has no defined fraction; report nan instead of failing
eval_scores = [
    np.count_nonzero(clf.predict(X_eval)) / len(X_eval) if len(X_eval) else float('nan')
    for X_eval in eval_sets.values()
]

labels = ['accuracyTrain', 'accuracyTest', 'scoreTestUnm', 'scoreTestm6A'] + list(eval_sets)
sizes = [len(X_train_down), len(X_test), len(predictTestUnm), len(predictTestm6A)] \
        + [len(X_eval) for X_eval in eval_sets.values()]
scores = [accuracyTrain, accuracyTest, scoreTestUnm, scoreTestm6A] + eval_scores

# write results
# The file is opened only after every score has been computed, so a failure
# during scoring cannot leave an empty or truncated report behind.
out_path = base_name + '.txt'
with open(out_path, mode='w') as f:
    f.write('\n'.join([
        'Name,' + ','.join(labels),
        'Size,' + ','.join(str(value) for value in sizes),
        'Score,' + ','.join(str(value) for value in scores),
    ]))

# importance from Gradient boosting model (or any model exposing feature importances)
if hasattr(clf, 'feature_importances_'):
    np.savetxt('Importance_' + base_name + '.csv', clf.feature_importances_, delimiter=',')
