# Figure 6: Single-Cell Decomposition using Hierarchical Autoencoder (scDHA)

### Using the 1000 most variable genes with binarized expression for classification

In [None]:
library(scDHA)

# Load the comma-separated expression table; the first line holds column names.
df <- read.table("1000_variable_genes.csv", sep = ",", header = TRUE)

# Remove the columns named X, X1 and X2 (columns that are absent are ignored).
# NOTE(review): presumably leftover index columns from earlier exports -- confirm.
df <- df[, !(names(df) %in% c("X", "X1", "X2")), drop = FALSE]

# Keep only rows whose cls_id is neither "CD4+ T Helper2" nor "CD34+".
df <- df[df$cls_id != "CD4+ T Helper2" & df$cls_id != "CD34+", ]

In [None]:
# Feature matrix: the first 1000 columns of df.
X <- as.matrix(df[, 1:1000])

# Labels: column 1001.
# NOTE(review): presumably this is the cls_id column -- confirm the column order.
Y <- df[[1001]]

# Binarize: every positive entry becomes 1, every other entry becomes 0.
X[X > 0] <- 1
X[X <= 0] <- 0

In [None]:
# Reproducible random split: 80% of the rows of X (rounded) go to the training
# set, the remaining rows to the test set. Labels in Y are split the same way.
set.seed(1)
idx <- sample.int(nrow(X), size = round(nrow(X) * 0.80))

# drop = FALSE keeps each subset a matrix even if it contains a single row;
# without it R would silently collapse a one-row subset to a plain vector.
train.x <- X[idx, , drop = FALSE]
train.y <- Y[idx]
test.x <- X[-idx, , drop = FALSE]
test.y <- Y[-idx]

In [None]:
# Run scDHA.class with the training matrix and labels to obtain labels for the
# test matrix.
prediction <- scDHA.class(
  train = train.x,
  train.label = train.y,
  test = test.x
)

# Fraction of test labels that equal the prediction, rounded to two decimals.
acc <- round(sum(test.y == prediction) / length(test.y), 2)
print(paste0("Accuracy = ", acc))

# Store true and predicted labels side by side and write them to disk.
df_predictions <- data.frame(true = test.y, predicted = prediction)
write.csv(df_predictions, file = "predictions_scDHA.csv")

### Using gene sets identified by gene set variation analysis of each cell within the 68k PBMC dataset

In [None]:
# Load the tab-separated table; the first line holds column names.
df <- read.table(
  "/.mounts/labs/reimandlab/private/users/mbayati/MBP1413/batch job gsva/ES_var_C7_68k.tsv",
  sep = "\t",
  header = TRUE
)

# Feature matrix: the first 3006 columns of df.
X <- as.matrix(df[, 1:3006])

# Labels: column 3007.
# NOTE(review): presumably the cell-type label column -- confirm the column order.
Y <- df[[3007]]

In [None]:
# Reproducible random split: 80% of the rows of X (rounded) go to the training
# set, the remaining rows to the test set. Labels in Y are split the same way.
set.seed(1)
idx <- sample.int(nrow(X), size = round(nrow(X) * 0.80))

# drop = FALSE keeps each subset a matrix even if it contains a single row;
# without it R would silently collapse a one-row subset to a plain vector.
train.x <- X[idx, , drop = FALSE]
train.y <- Y[idx]
test.x <- X[-idx, , drop = FALSE]
test.y <- Y[-idx]

In [None]:
# Run scDHA.class with the training matrix and labels to obtain labels for the
# test matrix.
prediction <- scDHA.class(
  train = train.x,
  train.label = train.y,
  test = test.x
)

# Fraction of test labels that equal the prediction, rounded to two decimals.
acc <- round(sum(test.y == prediction) / length(test.y), 2)
print(paste0("Accuracy = ", acc))

# Store true and predicted labels side by side and write them to disk.
df_predictions <- data.frame(true = test.y, predicted = prediction)
write.csv(df_predictions, file = "predictions_scDHA_gs.csv")
