# Figure 5B Performance Bar Graph

In [None]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.impute
import sklearn.ensemble
import scipy.stats
import matplotlib as mpl
import seaborn as sns
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide display settings: render figures at 250 dpi and show at most
# 50 rows when a DataFrame is printed.
mpl.rcParams.update({'figure.dpi': 250})
pd.options.display.max_rows = 50

# Load the dataset obtained by running gene set variation analysis (GSVA) on
# each cell of the 68k PBMC dataset. The file is tab-separated and its first
# row holds the column names.
gsva_scores_path = "/.mounts/labs/reimandlab/private/users/mbayati/MBP1413/batch job gsva/ES_var_C7_68k.tsv"
gsets_df = pd.read_csv(gsva_scores_path, sep="\t", header=0)

In [None]:
# Split the data from the class labels.
# The first 3006 columns are taken as the feature matrix.
# NOTE(review): presumably these are exactly the gene-set score columns and
# `cell_type` (plus any other metadata) sits after them -- confirm, otherwise
# the label would leak into X.
X = gsets_df.iloc[:, 0:3006]
Y = gsets_df.cell_type

# Hold out 20% of the rows for evaluation (the remaining 80% is the training
# set). The split is seeded so the metrics and figures computed from it can
# be regenerated exactly on a re-run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
# Logistic regression classifier: fit on the training split, then record
# macro-averaged precision/recall/F1 and overall accuracy on the test split.
logistic_performance = {"precision": [], "recall": [], "f1": [], "accuracy": []}

# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and for a problem with more than two classes
# the default already fits a multinomial model with the 'sag' solver.
# 'sag' shuffles the data, so it is seeded to keep the scores reproducible.
logistic = LogisticRegression(solver='sag', max_iter=1000, random_state=0)
logistic.fit(X_train, Y_train)
logistic_pred = logistic.predict(X_test)

logistic_performance["precision"].append(precision_score(Y_test, logistic_pred, average="macro"))
logistic_performance["recall"].append(recall_score(Y_test, logistic_pred, average="macro"))
logistic_performance["f1"].append(f1_score(Y_test, logistic_pred, average="macro"))
logistic_performance["accuracy"].append(accuracy_score(Y_test, logistic_pred))
print(logistic_performance)

In [None]:
# Multi-layer perceptron classifier (library default architecture): fit on
# the training split, then record macro-averaged precision/recall/F1 and
# overall accuracy on the test split.
MLP_performance = {"precision": [], "recall": [], "f1": [], "accuracy": []}

# Weight initialisation and mini-batch shuffling are random; seed them so the
# reported scores can be reproduced.
MLP = MLPClassifier(random_state=0)
MLP.fit(X_train, Y_train)
mlp_pred = MLP.predict(X_test)

MLP_performance["precision"].append(precision_score(Y_test, mlp_pred, average="macro"))
MLP_performance["recall"].append(recall_score(Y_test, mlp_pred, average="macro"))
MLP_performance["f1"].append(f1_score(Y_test, mlp_pred, average="macro"))
MLP_performance["accuracy"].append(accuracy_score(Y_test, mlp_pred))
print(MLP_performance)

In [None]:
# Random forest classifier (500 trees, Gini impurity, sqrt(n_features)
# candidates per split): fit on the training split, then record
# macro-averaged precision/recall/F1 and overall accuracy on the test split.
rf_performance = {"precision": [], "recall": [], "f1": [], "accuracy": []}

# Bootstrap sampling and feature sub-sampling are random; seed them so the
# reported scores can be reproduced.
forest = RandomForestClassifier(
    n_estimators=500,
    criterion="gini",
    max_features="sqrt",
    random_state=0,
)
forest.fit(X_train, Y_train)
rf_pred = forest.predict(X_test)

rf_performance["precision"].append(precision_score(Y_test, rf_pred, average="macro"))
rf_performance["recall"].append(recall_score(Y_test, rf_pred, average="macro"))
rf_performance["f1"].append(f1_score(Y_test, rf_pred, average="macro"))
rf_performance["accuracy"].append(accuracy_score(Y_test, rf_pred))
print(rf_performance)

In [None]:
# XGBoost classifier: fit on the training split, then record macro-averaged
# precision/recall/F1 and overall accuracy on the test split.
from sklearn.preprocessing import LabelEncoder

xg_performance = {"precision": [], "recall": [], "f1": [], "accuracy": []}

# Recent xgboost releases require class labels encoded as 0..k-1 and raise on
# anything else (e.g. cell-type names), so the training labels are encoded
# explicitly and the predictions are mapped back afterwards. If the labels
# are already 0..k-1 the encoding is the identity.
xg_label_encoder = LabelEncoder().fit(Y_train)

xg_cl = xgb.XGBClassifier(
    objective="multi:softmax",
    # Derived from the training labels instead of being hard-coded to 10.
    num_class=len(xg_label_encoder.classes_),
    n_estimators=10,
    max_depth=6,
    learning_rate=0.3,
    reg_lambda=1.0,
    reg_alpha=0.0
)
xg_cl.fit(X_train, xg_label_encoder.transform(Y_train))

# Predictions come back as encoded class indices; cast defensively to int
# before decoding so they are comparable with the labels in Y_test.
xgb_pred_encoded = np.asarray(xg_cl.predict(X_test)).astype(int)
xgb_pred = xg_label_encoder.inverse_transform(xgb_pred_encoded)

xg_performance["precision"].append(precision_score(Y_test, xgb_pred, average="macro"))
xg_performance["recall"].append(recall_score(Y_test, xgb_pred, average="macro"))
xg_performance["f1"].append(f1_score(Y_test, xgb_pred, average="macro"))
xg_performance["accuracy"].append(accuracy_score(Y_test, xgb_pred))
print(xg_performance)

In [None]:
# Collect the mean of every metric for each classifier into one table
# (one row per metric, one column per model) and write it to disk.
# Metrics are looked up by key so that each row label is guaranteed to match
# the values beneath it, independent of the order in which the per-model
# dictionaries were populated.
metric_keys = ["precision", "recall", "f1", "accuracy"]
performance_by_column = {
    "logistic_mean": logistic_performance,
    "MLP_mean": MLP_performance,
    "random_forest_mean": rf_performance,
    "xgb_mean": xg_performance,
}

summary_columns = {"metric": ["Precision", "Recall", "F1", "Accuracy"]}
for column_name, performance in performance_by_column.items():
    summary_columns[column_name] = [np.mean(performance[key]) for key in metric_keys]

classification_df = pd.DataFrame(summary_columns)
print(classification_df)

# Tab-separated output; the row index is written as the first column.
classification_df.to_csv("performance_figure5B_gsets.tsv", sep='\t')

In [None]:
# Grouped bar graph of the four metrics for the four classifiers, read back
# from the summary table written to "performance_figure5B_gsets.tsv".
gsets_four = pd.read_csv("performance_figure5B_gsets.tsv", sep="\t", header=0, index_col=0)

# One group of bars per metric row in the table.
ind = np.arange(len(gsets_four))
bar_width = 0.2
fig, ax = plt.subplots(figsize=(25, 10))

# The four bars of a group sit at offsets -0.1, +0.1, +0.3 and +0.5, so the
# group is centred on ind + 0.2, which is where the tick is placed below.
log_bar = ax.bar(ind - 0.1, gsets_four.logistic_mean, width=bar_width, color='#66c2a5', align='center')
mlp_bar = ax.bar(ind + 0.1, gsets_four.MLP_mean, width=bar_width, color='#fc8d62', align='center')
rf_bar = ax.bar(ind + 0.3, gsets_four.random_forest_mean, width=bar_width, color='#8da0cb', align='center')
xgb_bar = ax.bar(ind + 0.5, gsets_four.xgb_mean, width=bar_width, color='#e78ac3', align='center')

ax.set_ylabel("Model Performance", size=38)
# The y axis starts at 0.60, not 0, to make differences between models visible.
ax.set_ylim([0.60, 1])
ax.set_xlabel("Metric", size=38)
ax.set_xticks(ind + 0.2, labels=gsets_four.metric)
ax.tick_params(axis='both', labelsize=32)
ax.legend(
    (log_bar[0], mlp_bar[0], rf_bar[0], xgb_bar[0]),
    ('Logistic Regression', 'MLP', 'Random Forest', 'XGB'),
    prop={'size': 22},
    bbox_to_anchor=(1.23, 1),
    loc='upper right',
)

# Horizontal grid lines only. Calling ax.grid() with style keywords and no
# `axis` argument switches the grid on for BOTH axes, which would undo the
# request to hide the vertical lines, so the styling is restricted to y.
ax.grid(axis='y', color='black', linestyle='-', linewidth=0.7)
ax.xaxis.grid(False)

# Black frame around the plotting area.
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(2)

fig.suptitle('Cell classification using gene sets instead of genes', fontsize=38)
ax.xaxis.labelpad = 20
ax.yaxis.labelpad = 20

vis = fig
vis.savefig("fig5B_bargraph.pdf", bbox_inches='tight')

# Figure 5C Confusion Matrix

In [None]:
# The logistic regression was the best performing algorithm and as a result
# it was used to create this confusion matrix.

# Use every class that occurs in either the true or the predicted labels.
# Taking the labels from the predictions alone would silently drop any class
# the model never predicts, together with all test samples of that class.
class_labels = np.union1d(Y_test, logistic_pred)

# normalize="true" divides each row by the number of true samples of that
# class, so every row holds the fraction of that class assigned to each
# predicted label; rows without any true samples become 0 instead of NaN.
matrix = confusion_matrix(Y_test, logistic_pred, labels=class_labels, normalize="true")
df_matrix = pd.DataFrame(matrix, index=class_labels, columns=class_labels)

sns.set(font_scale=0.65)
# Draw on a fresh figure so the heatmap never lands on a previously created
# axes (e.g. the bar graph) when the cells are run as a plain script.
heatmap_fig, heatmap_ax = plt.subplots()
ax = sns.heatmap(
    df_matrix,
    cmap="YlGnBu",
    annot=True,
    vmin=0,
    vmax=1,
    linewidth=0.5,
    annot_kws={"size": 6},
    ax=heatmap_ax,
)

# Save before showing: some backends release the figure once it is shown.
vis = ax.get_figure()
vis.savefig("Fig5C_confusionmtxgenesets.pdf", bbox_inches='tight')
plt.show()