In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from utils import evaluate_embeddings
from utils import mean_average_precision
from utils import get_cell_type_map

# Load data

In [None]:
exp_train = pd.read_csv("/work/alexsong/project/single_cell_analysis/data/ath/machine_learning/expression_train.csv", index_col = 0, header = None)
exp_test = pd.read_csv("/work/alexsong/project/single_cell_analysis/data/ath/machine_learning/expression_test.csv", index_col = 0, header = None)
meta_train = pd.read_csv("/work/alexsong/project/single_cell_analysis/data/ath/machine_learning/meta_data_train.csv", index_col = 0, header = 0)
meta_test = pd.read_csv("/work/alexsong/project/single_cell_analysis/data/ath/machine_learning/meta_data_test.csv", index_col = 0, header = 0)

In [None]:
features_train = exp_train.values.T
features_test = exp_test.values.T
label_names = meta_train.loc[:,"cell_type"].unique()
names_to_id = { key:val for val,key in enumerate(label_names)}
labels_train = meta_train.loc[:,"cell_type"].replace(names_to_id).values
labels_test = meta_test.loc[:,"cell_type"].replace(names_to_id).values

# Train classsifer

In [None]:
clf_svm = SVC(probability=True).fit(features_train, labels_train)
clf_rf = RandomForestClassifier(n_estimators = 500, n_jobs = 42).fit(features_train, labels_train)
clf_knn = KNeighborsClassifier(n_neighbors = 50, n_jobs = 42).fit(features_train, labels_train)
clf_pca = PCA(n_components=100).fit(features_train)

# Predict cell types / embeddings

In [None]:
clfs = [clf_svm, clf_rf, clf_knn]
labels_pred = []
prob_pred = []

for clf in clfs:
    prob_pred_all = clf.predict_proba(features_test)
    labels_pred.append(np.argsort(-prob_pred_all,axis = 1)[:,0])
    prob_pred.append(np.array([prob_pred_all[i,labels_pred[-1][i]] for i in range(labels_pred[-1].shape[0])]))

pca_train_embbedings = clf_pca.fit_transform(features_train)
pca_test_embbedings = clf_pca.fit_transform(features_test)

# Evaluate predicted cell types / embbedings

In [None]:
acc = []
overall_map = []
cell_type_map = []
for y_pred, y_prob_pred, y_test in zip(labels_pred, prob_pred, np.tile(labels_test,(3,1))):
    acc.append(accuracy_score(y_test, y_pred))
    overall_map.append(mean_average_precision(y_test, y_pred, y_prob_pred))
    cell_type_map.append(get_cell_type_map(label_names.shape[0], y_test, y_pred, y_prob_pred))

In [None]:
y_prob_pred

In [27]:
pca_acc, pca_overall_map, pca_cell_type_map = evaluate_embeddings(pca_train_embbedings, pca_test_embbedings, labels_train, labels_test, k = 50, n_cell_types = label_names.shape[0])

In [28]:
pca_acc

0.4058333333333333

In [29]:
pca_overall_map

0.6180454613268721

In [30]:
pca_cell_type_map

array([0.21323967, 0.83016172, 0.62148272, 0.02641477, 0.00138518,
       0.71543177, 0.06871858, 0.74733573, 0.96045126, 0.90009247,
       0.04049868, 0.58664786])