# Load stored data

First, set the names of the pickled files to load:

In [None]:
# Locations of the stored inputs, relative to this notebook. The bit-packed
# expression file is named without its extension; ".npy" is appended when the
# data is loaded.
gene_expr_bin_bitpacked_file_name = "../data/gene_expr_bin_bitpacked"
labels_file_name = "../data/labels"
gene_names_file_name = "../data/gene_names"
cell_names_file_name = "../data/cell_names"

Then load the data from those files:

In [None]:
from ctc_utils.functions import load_pickled_cells_data

# Load all four stored artefacts with a single helper call. The ".npy" suffix
# is appended to the bit-packed expression file name at this point.
cell_names, gene_names, labels, gene_expr_bin = load_pickled_cells_data(
    cell_names_file_name,
    gene_names_file_name,
    labels_file_name,
    f"{gene_expr_bin_bitpacked_file_name}.npy",
)

# Partition into separate datasets

Define gene expressions per dataset

In [None]:
from ctc_utils.functions import def_gen_expr_for_datasets, def_labels_for_datasets

# Split the combined expression data into one object per dataset; the targets
# are listed in the order the helper returns them.
(
    ge_10xv2,
    ge_SM2,
    ge_10xv3,
    ge_CL,
    ge_DR,
    ge_iD,
    ge_SW,
    ge_10xv2_2,
) = def_gen_expr_for_datasets(gene_expr_bin)

Define labels per dataset

In [None]:
# Split the combined labels into one object per dataset; the targets are
# listed in the order the helper returns them.
(
    lb_10xv2,
    lb_SM2,
    lb_10xv3,
    lb_CL,
    lb_DR,
    lb_iD,
    lb_SW,
    lb_10xv2_2,
) = def_labels_for_datasets(labels)

# Choose Datasets

In [None]:
# Primary dataset (features, labels) used to build the model.
x_dt, y_dt = ge_SW, lb_SW

# Second dataset (features, labels), for when testing should use a different
# dataset than training.
x_dt_test, y_dt_test = ge_iD, lb_iD

# Apply unsupervised clustering

In [None]:
import numpy as np
from ctc_utils.functions import loopForDLR, unsupervisedClusteringFaiss
from sklearn.feature_selection import VarianceThreshold

# Feature selection: the variance filter is fitted on the primary dataset only
# and the same selected columns are then applied to the second dataset, so both
# matrices end up with identical features.
# NOTE(review): if the expression values are binary (the name `gene_expr_bin`
# suggests so - confirm), a feature's variance is p * (1 - p), and a threshold
# of 0.16 keeps only genes expressed in roughly 20%-80% of the cells.
sel = VarianceThreshold(threshold=0.16)
x_dt_fs = sel.fit_transform(x_dt)
x_dt_test_fs = sel.transform(x_dt_test)

# Unsupervised clustering using faiss. The helper returns neighbour indices for
# both datasets and also rebinds x_dt_fs / x_dt_test_fs to the matrices it
# returns. The third argument (8) is presumably the number of neighbours -
# TODO confirm against ctc_utils.functions.
neighbors_index_ps, neighbors_index_ps_test, x_dt_fs, x_dt_test_fs = unsupervisedClusteringFaiss(x_dt_fs, x_dt_test_fs, 8)
# Calculate DLR for each cell sample; this matrix is used for training the model.
dlr_matrix = loopForDLR(neighbors_index_ps, x_dt_fs)
# Calculate DLR for each cell sample of the second dataset, used for testing the model.
# NOTE(review): this passes the training matrix x_dt_fs, not x_dt_test_fs. That
# is only right if neighbors_index_ps_test holds row indices into the training
# matrix - confirm against unsupervisedClusteringFaiss.
dlr_matrix_test = loopForDLR(neighbors_index_ps_test, x_dt_fs)



# Define datasets for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the DLR rows (and their labels) for testing; the fixed seed
# makes the partition repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    dlr_matrix,
    y_dt,
    test_size=0.2,
    random_state=42,
)

# (Optional) Test with different datasets

Comment out the cell below if you want to evaluate on a held-out partition of the training dataset (the train/test split above) instead of on a different dataset.

In [None]:
# Train on the full primary dataset and test on the second dataset; this
# rebinds the four train/test variables.
x_train, y_train = dlr_matrix, y_dt
x_test, y_test = dlr_matrix_test, y_dt_test

# Oversampling

(Oversampling only takes effect if `x_over` and `y_over` are passed when fitting the model; the cells below fit on `x_train` and `y_train`, so it is not used unless you change them.)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


# over = RandomOverSampler(sampling_strategy=0.1)
# x_over, y_over = over.fit_resample(x_train, y_train)
# under = RandomUnderSampler(sampling_strategy=0.5)
# x_over, y_over = under.fit_resample(x_over, y_over)


# Apply KNN

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix

from classifiers.kNN import knn

# Build the kNN classifier via the project helper. `model` is used below for
# plotting and prediction; the second return value is bound to `acc`
# (presumably an accuracy figure - it is not used in this cell).
model, acc = knn(x_train, y_train, x_test, y_test)

# Confusion matrix on the test data, normalised over the true labels
# (normalize="true") and printed with two decimals.
plot_confusion_matrix(
    model,
    x_test,
    y_test,
    xticks_rotation="vertical",
    normalize="true",
    values_format=".2f",
)
plt.show()

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps the precision and recall columns and counts support
# from the predictions instead of the true labels.
print(classification_report(y_test, model.predict(x_test)))


# Apply Logistic Regression

In [None]:
from classifiers.LogisticRegression import logreg

# Build the logistic-regression classifier via the project helper. `model` is
# used below; the second return value is bound to `acc` (presumably an accuracy
# figure - it is not used in this cell).
model, acc = logreg(x_train, y_train, x_test, y_test)

# Confusion matrix on the test data, normalised over the true labels.
plot_confusion_matrix(
    model,
    x_test,
    y_test,
    xticks_rotation="vertical",
    normalize="true",
    values_format=".2f",
)
plt.show()

# Score reported by the model on the test data.
print(model.score(x_test, y_test))
# f1_score takes (y_true, y_pred). With average="weighted" the per-class scores
# are weighted by the support of the first argument, so the true labels must
# come first.
print(f1_score(y_test, model.predict(x_test), average="weighted"))


# Apply Decision Tree classifier

In [None]:
from classifiers.DTC import dtc
from sklearn.metrics  import f1_score

# Build the decision-tree classifier via the project helper. `model` is used
# below; the second return value is bound to `acc` (presumably an accuracy
# figure - it is not used in this cell).
model, acc = dtc(x_train, y_train, x_test, y_test)

# Confusion matrix on the test data, normalised over the true labels.
plot_confusion_matrix(
    model,
    x_test,
    y_test,
    xticks_rotation="vertical",
    normalize="true",
    values_format=".2f",
)
plt.show()

# Score reported by the model on the test data.
print(model.score(x_test, y_test))
# f1_score takes (y_true, y_pred). With average="weighted" the per-class scores
# are weighted by the support of the first argument, so the true labels must
# come first.
print(f1_score(y_test, model.predict(x_test), average="weighted"))



In [None]:
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

# Two-stage pipeline: reduce the DLR features to 65 components with truncated
# SVD, then classify with a linear SVM.
steps = [('svd', TruncatedSVD(n_components=65)), ('m', svm.LinearSVC())]
model = Pipeline(steps=steps)

model.fit(x_train, y_train)

# Print the pipeline's score on the test data.
print(model.score(x_test, y_test))
# f1_score takes (y_true, y_pred). With average="weighted" the per-class scores
# are weighted by the support of the first argument, so the true labels must
# come first.
print(f1_score(y_test, model.predict(x_test), average="weighted"))

# Confusion matrix on the test data, normalised over the true labels.
plot_confusion_matrix(
    model,
    x_test,
    y_test,
    xticks_rotation="vertical",
    normalize="true",
    values_format=".2f",
)
plt.show()


In [None]:
from sklearn.neural_network import MLPClassifier


# Two-stage pipeline: reduce the DLR features to 65 principal components, then
# classify with a small MLP (hidden layers of 5 and 2 units, L-BFGS solver,
# fixed seed). The first step is labelled 'svd' although it holds a PCA; the
# label is left unchanged so any lookup by step name keeps working.
steps = [
    ('svd', PCA(n_components=65)),
    (
        'm',
        MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=(5, 2),
            random_state=1,
        ),
    ),
]
model = Pipeline(steps=steps)

model.fit(x_train, y_train)

# Print the pipeline's score on the test data.
print(model.score(x_test, y_test))
# f1_score takes (y_true, y_pred). With average="weighted" the per-class scores
# are weighted by the support of the first argument, so the true labels must
# come first.
print(f1_score(y_test, model.predict(x_test), average="weighted"))

# Confusion matrix on the test data, normalised over the true labels.
plot_confusion_matrix(
    model,
    x_test,
    y_test,
    xticks_rotation="vertical",
    normalize="true",
    values_format=".2f",
)
plt.show()