In [33]:
import sys
from multiprocessing import Manager, Lock, Pool, cpu_count
import time

import numpy as np
import pandas as pd
import umap

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV

from bokeh.io import show
from bokeh.palettes import Category20
from bokeh.plotting import figure
from bokeh.io import export_svgs
from bokeh.io import output_notebook
from bokeh.resources import CDN
output_notebook()

In [34]:
def progress(count, total, status=''):
    
    bar_len = 40
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '█' * filled_len + '░' * (bar_len - filled_len)

    sys.stdout.write(f'\r|{bar}| {percents}% ... {status}')
    sys.stdout.flush()

In [35]:
INDEX_COLUMN_NAME = 'sample'
TUMOR_COLUMN_NAME = 'tumor'
HGG_ROW_VALUE = 'hgg'
LGG_ROW_VALUE = 'lgg'
HGG_CSV = '/home/group2/marco/HGG_even_more_ficiurz.csv'
LGG_CSV = '/home/group2/marco/LGG_lot_of_nasty_ficiurz.csv'

In [36]:
hgg = pd.read_csv(filepath_or_buffer=HGG_CSV)
lgg = pd.read_csv(filepath_or_buffer=LGG_CSV)
hgg = hgg.set_index(['sample'])
lgg = lgg.set_index(['sample'])
surv_hgg = hgg.loc[:, 'surv']
surv_lgg = lgg.loc[:, 'surv']
tot_cols = len(list(hgg))
hgg = hgg.iloc[:, 1:(tot_cols-2)]
lgg = lgg.iloc[:, 1:(tot_cols-2)]

In [37]:
tumor_hgg = ['hgg' for _ in range(hgg.shape[0])]
tumor_lgg = ['lgg' for _ in range(lgg.shape[0])]
hgg = hgg.assign(tumor=tumor_hgg)
lgg = lgg.assign(tumor=tumor_lgg)

In [38]:
alldata = pd.concat([hgg,lgg])
alldata = alldata.replace(np.inf, np.nan)
alldata = alldata.dropna(axis=1)
print(alldata.shape)

(267, 5521)


In [127]:
def calc_mcc(exps, test_size, estimator_cls, **kwargs):
    for exp in exps:
        x_train, x_test, y_train, y_test = train_test_split(alldata.iloc[:, :alldata.shape[1]-1], 
                                                            alldata['tumor'],
                                                            test_size=test_size,
                                                            random_state=exp,
                                                            stratify=alldata['tumor'])
        x_train, x_test = (x_train - x_train.mean(axis=0)) / x_train.var(axis=0), (x_test - x_train.mean(axis=0)) / x_test.var(axis=0)
        classifier = estimator_cls(**kwargs)
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        with Lock():
            mccs_man.append(matthews_corrcoef(y_test, y_pred))
#             features_man.append(classifier.feature_importances_)
            features_man.append(classifier.coef_.ravel())
            count.value += 1
            progress(count.value, num_exps, status='({}) Calculating MCCs'.format(estimator_cls.__name__))

In [128]:
def segment(array, parts):
    '''
    Function to segment the array in order to utilize multiprocessing
    '''
    
    avg = len(array) / parts
    last = 0.0

    while last < len(array):
        yield array[int(last):int(last + avg)]
        last += avg

In [129]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [130]:
ESTIMATOR_CLASS = LinearSVC

In [132]:
num_exps = 1000
test_size = 0.2

with Manager() as manager:
    # List of all features
    mccs_man = manager.list()
    features_man = manager.list()
    
    count = manager.Value('i', 0)
    
    t0 = time.time()

    with Pool(cpu_count()) as pool:
        tasks = [pool.apply_async(calc_mcc, args=(part, test_size, ESTIMATOR_CLASS), kwds={'C' : 1e-3}) 
                 for part in segment([*range(num_exps)], cpu_count())]
        [task.get() for task in tasks]
        mccs = [x for x in mccs_man]
        top_features = [x for x in features_man]

    print(f' ... {round(time.time() - t0, 2)}s')

pd.DataFrame(mccs).describe()

|████████████████████████████████████████| 99.8% ... (LinearSVC) Calculating MCCs ... 87.05s


Unnamed: 0,0
count,1000.0
mean,0.732013
std,0.209166
min,-0.030083
25%,0.610218
50%,0.808608
75%,0.877058
max,1.0


In [44]:
from bokeh.models import ColumnDataSource
from bokeh.io import output_file, show
from bokeh.layouts import widgetbox
from bokeh.models.widgets import Slider
from bokeh.models.callbacks import CustomJS

In [121]:
#params
n_neighbors = 15
min_dist = 0.001
n_components = 2
metric = 'manhattan'
mapping = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, metric=metric, random_state=12345)

embedding = mapping.fit_transform(alldata.iloc[:, 1:5520])

thecols = alldata['tumor']

thecols = pd.DataFrame(thecols).replace('hgg', 'blue')
thecols = pd.DataFrame(thecols).replace('lgg', 'yellow')
source = ColumnDataSource(dict(x=embedding[:, 0], y= embedding[:, 1], colors= list(thecols.tumor)))


# thecols = [choose_cols(s) for s in surv_hgg]
p = figure(plot_width=800, plot_height=600, toolbar_location='left')
p.circle(x='x', y='y', size=10, color='colors', source=source)
show(p)

In [143]:
importances = top_features
print(len(importances[0]))
tops = [[0, i] for i in range(5520)]

5520


In [144]:
for importance in importances:
    indices = np.argsort(importance)[::-1]
    for f in range(5520):
        tops[indices[f]][0] += 5520 - f

In [145]:
tops = [*sorted(tops, key=lambda el: el[0], reverse=True)][:10]

In [125]:
for couple in tops:
    print(alldata.columns[couple[1]])

wavelet-HLL_gldm_DependenceNonUniformityNormalized_flair
wavelet-LLL_gldm_LargeDependenceEmphasis_flair
wavelet-HLL_gldm_LargeDependenceEmphasis_flair
wavelet-HLL_gldm_DependenceNonUniformityNormalized_t2
wavelet-HLL_glrlm_RunPercentage_flair
wavelet-HLL_glrlm_RunLengthNonUniformityNormalized_flair
wavelet-HLL_gldm_DependenceVariance_flair
wavelet-LLL_glrlm_LongRunEmphasis_flair
wavelet-LLL_glrlm_ShortRunEmphasis_flair
wavelet-LLL_glrlm_RunPercentage_flair


In [146]:
for couple in tops:
    print(alldata.columns[couple[1]])

wavelet-LLL_glszm_GrayLevelNonUniformityNormalized_t2
wavelet-LLL_glszm_GrayLevelNonUniformityNormalized_flair
wavelet-LLL_glszm_SmallAreaLowGrayLevelEmphasis_t1
wavelet-LLL_glszm_SmallAreaLowGrayLevelEmphasis_t1ce
wavelet-LLL_glszm_SmallAreaLowGrayLevelEmphasis_flair
log-sigma-1-0-mm-3D_glcm_Idmn_flair
wavelet-LLL_glrlm_ShortRunLowGrayLevelEmphasis_t1
wavelet-LLL_glszm_LowGrayLevelZoneEmphasis_flair
wavelet-HLL_glrlm_ShortRunLowGrayLevelEmphasis_flair
wavelet-HLL_glszm_SmallAreaLowGrayLevelEmphasis_flair
