In [None]:
from ctc_utils.functions import def_gen_expr_for_datasets, def_labels_for_datasets, load_data, prune_training_set, loopForDLR, unsupervisedClusteringFaiss
from sklearn.feature_selection import VarianceThreshold
from classifiers.DTC import dtc
from classifiers.LogisticRegression import logreg
from classifiers.LinearSVC import lsvc
from classifiers.kNN import knn
from classifiers.MLP import mlp
from classifiers.NB import gnb
from classifiers.SDGC import sdg
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from collections import  Counter
import numpy as np
from imblearn.over_sampling import RandomOverSampler



# Combine datasets from different protocols

Prepare training and test sets

In [None]:
# Load data
cell_names, gene_names, labels, gene_expr_bin = load_data()

# Row offsets delimiting the eight consecutive dataset blocks, followed by the
# upper bound used for every trailing slice. These are the block starts the
# hand-written slices already relied on (6444, 6697, 9919, ...).
# NOTE(review): presumably the blocks are ordered
# [10xv2, SM2, 10xv3, CL, DR, iD, SW, 10xv2_2] like the test sets below --
# confirm against def_gen_expr_for_datasets / def_labels_for_datasets.
# NOTE(review): 23153 is kept from the original code; if the loaded arrays hold
# more rows than that, the rows beyond it are never used for training -- confirm.
dataset_bounds = [0, 6444, 6697, 9919, 10172, 13394, 16616, 19792, 23153]


def leave_one_dataset_out(values):
    """Return one training split per dataset block, each omitting that block.

    Parameters
    ----------
    values : sequence sliceable by row (e.g. the expression matrix or labels)

    Returns
    -------
    list
        ``len(dataset_bounds) - 1`` splits; split ``k`` contains every row in
        ``[0, dataset_bounds[-1])`` except those of block ``k``.

    Slice stops are exclusive, so the part before the held-out block must stop
    at the block's *start* index. The previous hand-written slices stopped one
    row earlier (e.g. ``[0:6443]`` although the block starts at 6444) and thus
    silently dropped one cell from seven of the eight training splits.
    """
    upper = dataset_bounds[-1]
    last_block = len(dataset_bounds) - 2
    splits = []
    for k, (start, stop) in enumerate(zip(dataset_bounds[:-1], dataset_bounds[1:])):
        if k == 0:
            # First block held out: everything after it.
            splits.append(values[stop:upper])
        elif k == last_block:
            # Last block held out: everything before it.
            splits.append(values[0:start])
        else:
            splits.append(np.concatenate((values[0:start], values[stop:upper])))
    return splits


#Training datasets
training_datasets = leave_one_dataset_out(gene_expr_bin)

# Test Datasets
# [ge_10xv2, ge_SM2, ge_10xv3, ge_CL, ge_DR, ge_iD, ge_SW, ge_10xv2_2]
test_datasets = def_gen_expr_for_datasets(gene_expr_bin)


# Labels for training datasets (same splits as training_datasets)
labels_for__training_datasets = leave_one_dataset_out(labels)

# Labels for test datasets
# [lb_10xv2, lb_SM2, lb_10xv3, lb_CL, lb_DR, lb_iD, lb_SW, lb_10xv2_2]
labels_for_test_datasets = def_labels_for_datasets(labels)


# Scores of the seven classifiers for a single split
acc_matrix = np.zeros(7)

# One row per held-out dataset, one column per classifier
all_matrix = np.zeros(shape = (8,7))

# Classification

Uncomment the commented-out parts to correct the class imbalance.

In [None]:
# Classifier helpers, listed in the column order used for all_matrix.
# Each one is called as f(x_train, y_train, x_test, y_test) and returns a
# (model, accuracy) pair.
classifiers = (knn, logreg, dtc, lsvc, mlp, gnb, sdg)

for i, x_train in enumerate(training_datasets):

    # Split i trains on training_datasets[i] and is evaluated on test_datasets[i].
    y_train = labels_for__training_datasets[i]
    x_test, y_test = test_datasets[i], labels_for_test_datasets[i]

    # ---------------------------------------------------------------------
    # Optional: fix class imbalance by pruning (disabled).
    # Find the majority class and its number of occurrences
    # d = Counter(y_train)
    # max_occurances = max(d, key=d. get)
    # max_num = d.get(max_occurances)
    # half = int(max_num/2)

    # Cut all classes down to half of the majority class
    # x_train, y_train = prune_training_set(x_train, y_train, 3158)

    # Feature selection: the selector is fitted on the training data only and
    # then applied unchanged to the test data.
    sel = VarianceThreshold(threshold=0.16)
    x_train = sel.fit_transform(x_train)
    x_test = sel.transform(x_test)

    # Unsupervised clustering using faiss
    (neighbors_index_ps,
     neighbors_index_ps_test,
     x_train,
     x_test) = unsupervisedClusteringFaiss(x_train, x_test, 16)

    # DLR for each training cell
    x_train = loopForDLR(neighbors_index_ps, x_train)
    # DLR for each test cell
    # NOTE(review): the second argument is x_train, which at this point already
    # holds the DLR output of the line above, and the x_test returned by
    # unsupervisedClusteringFaiss is never read. Confirm against loopForDLR
    # whether x_test (or the pre-DLR x_train) was intended here.
    x_test = loopForDLR(neighbors_index_ps_test, x_train)

    # Optional: fix class imbalance by oversampling (disabled).
    # over_sampling = RandomOverSampler(sampling_strategy = "not majority")
    # # fit and apply the transform
    # x_train, y_train = over_sampling.fit_resample(x_train, y_train)
    # ---------------------------------------------------------------------

    print("classification...")

    # Run every classifier on the same features; `model` ends up holding the
    # model returned by the last helper (sdg).
    accuracies = []
    for classify in classifiers:
        model, acc = classify(x_train, y_train, x_test, y_test)
        accuracies.append(acc)
    acc_knn, acc_logreg, acc_dtc, acc_lsvc, acc_mlp, acc_gnb, acc_sdg = accuracies

    # All accuracies obtained for this split
    all_acc_within_dataset = np.array(accuracies)
    acc_matrix = all_acc_within_dataset

    # Row i of the result matrix belongs to split i
    all_matrix[i] = acc_matrix
    print(all_matrix)

# Save results matrix to disk

In [None]:
# Persist the accuracy matrix; np.save appends ".npy" to a file name that
# does not already end with it.
np.save(file="../results/performance_matrices/ctc_dlr_non_balanced", arr=all_matrix)

# Load results matrix from disk

In [None]:
# Restore the accuracy matrix from disk so the plots below can be produced
# without re-running the classification.
all_matrix = np.load(file="../results/performance_matrices/ctc_dlr_non_balanced.npy")

# Plotting

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Rows of all_matrix: one per training split, named after the excluded dataset.
y_labels = [
    "excl_10xv2",
    "excl_SM2",
    "excl_10xv3",
    "excl_CL",
    "excl_DR",
    "excl_iD",
    "excl_SW",
    "excl_10xv2_2",
]
# Columns of all_matrix: one per classifier.
x_labels = [
    "knn",
    "Logistic Regression",
    "DTC",
    "LinearSVC",
    "MLP",
    "GaussianNB",
    "SDG",
]

# Annotated heatmap of the accuracy matrix, drawn on an explicit axes
fig, ax = plt.subplots()
sns.heatmap(all_matrix, annot=True, fmt=".2f", ax=ax)

# Tick labels
ax.set_xticklabels(x_labels, rotation=90, fontsize=6)
ax.set_yticklabels(y_labels, rotation=0)

fig.savefig("../report_pdf_results/ctc_dlr_non_balanced.pdf", format='pdf', bbox_inches='tight')


In [None]:
# One box per column of all_matrix, i.e. per classifier, summarising its
# accuracy over all training splits.
fig_per_classifier, ax_per_classifier = plt.subplots()
ax_per_classifier.boxplot(all_matrix, labels=x_labels)
plt.xticks(rotation=90)

In [None]:
# One box per row of all_matrix (transposed so rows become columns), i.e. per
# excluded dataset, summarising the accuracy of all classifiers on that split.
fig_per_dataset, ax_per_dataset = plt.subplots()
ax_per_dataset.boxplot(all_matrix.T, labels=y_labels)
plt.xticks(rotation=90)
