# In this Notebook
This notebook trains the random forest classifier used in the adverSCarial notebook demos and wraps it in the `RFClassifier` function.

In [1]:
library(randomForest)
library(Seurat)
library(stringr)

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.

Attaching SeuratObject



In [2]:
# Read the saved Seurat object and switch its active identities to the
# "manual_annotation" labels.
pbmc <- readRDS("data/datasets/pbmc.rds")
Idents(pbmc) <- "manual_annotation"

In [3]:
# Transposed RNA count table as a data frame, with the active identity of each
# row stored in an extra "type" column.
df_exprs <- as.data.frame(
    t(as.data.frame(pbmc@assays$RNA@counts))
)
df_exprs[["type"]] <- unname(Idents(pbmc))

In [4]:
# Keep a copy of the per-cell metadata table.
# NOTE(review): df_meta is not referenced by any other cell shown here.
df_meta <- pbmc@meta.data

In [5]:
# Make the label column a factor, then rewrite every column name: "-" becomes
# "_" and a "g_" prefix is added (presumably so gene names are usable as
# formula terms). The label column is renamed as well: "type" -> "g_type".
df_exprs[["type"]] <- as.factor(df_exprs[["type"]])
colnames(df_exprs) <- str_replace(
    str_replace_all(colnames(df_exprs), "-", "_"),
    "^", "g_"
)

In [6]:
# Training table and random forest hyper-parameters.
df_exprs_4buildRF <- df_exprs
ntree <- 8       # number of trees
maxnodes <- 75   # value passed as randomForest's maxnodes argument

In [7]:
# Fix the RNG state, then fit the forest: g_type is the response and every
# other column of the training table is a predictor.
set.seed(20)
rf <- randomForest(
    g_type ~ .,
    data = df_exprs_4buildRF,
    ntree = ntree,
    maxnodes = maxnodes
)

In [11]:
# Persist the fitted forest (object name `rf`) for later load().
save(rf, file = "classifiers/rf_scrnaseq")

In [8]:
# Display the fitted forest: call, OOB error estimate and confusion matrix.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs_4buildRF,      ntree = ntree, maxnodes = maxnodes) 
               Type of random forest: classification
                     Number of trees: 8
No. of variables tried at each split: 117

        OOB estimate of  error rate: 35.85%
Confusion matrix:
             Memory CD4 T   B CD14+ Mono NK CD8 T Naive CD4 T FCGR3A+ Mono DC
Memory CD4 T          222  17         10  1    23         188            0  0
B                      15 247          5  1     2          53            0  0
CD14+ Mono             12  14        400  3     2           9           26  2
NK                      3   4          4 72    52           5            0  0
CD8 T                  74  21          8 23    98          48            1  0
Naive CD4 T           120  31          5  0    12         517            1  1
FCGR3A+ Mono            2   1         84  0     0           1           70  1
DC                      2   1         15  1     1           0      

In [9]:
# Classify one cluster of cells with the random forest `rf` and report the
# majority vote.
#
# Args:
#   expr:     expression matrix or data frame with one row per cell; its column
#             names are rewritten like the training table's ("-" -> "_", then a
#             "g_" prefix) before prediction.
#   clusters: vector holding one cluster label per row of `expr`.
#   target:   label of the cluster to classify.
#   seed:     value handed to set.seed() before predicting.
#
# Returns:
#   c(predicted_class, ratio), a character vector because c() coerces the
#   numeric ratio. `ratio` is the share of the target rows assigned to the most
#   frequent predicted class; predicted_class is the string "NA" when that
#   share is below 0.5.
#
# Stops with an error when no row of `clusters` equals `target`.
# Relies on `rf` and on stringr being available in the calling session.
RFClassifier <- function(expr, clusters, target, seed = 1) {
    # NOTE(review): presumably seeded because vote ties are broken at random
    # during prediction -- confirm against the randomForest documentation.
    set.seed(seed)
    colnames(expr) <- str_replace_all(colnames(expr), "-", "_")
    colnames(expr) <- str_replace(colnames(expr), "^", "g_")

    # which() discards NA comparisons instead of producing NA rows.
    target_rows <- which(clusters == target)
    if (length(target_rows) == 0) {
        stop("no cells found for target cluster '", target, "'", call. = FALSE)
    }

    # drop = FALSE keeps a one-cell cluster as a one-row table; without it the
    # row collapses to a bare vector and loses its column structure.
    final_predictions <- predict(rf, expr[target_rows, , drop = FALSE])

    # Tabulate the votes once, most frequent class first.
    votes <- sort(table(final_predictions), decreasing = TRUE)
    ratio <- as.numeric(votes[1]) / sum(as.numeric(votes))
    predicted_class <- names(votes)[1]
    if (ratio < 0.5) {
        predicted_class <- "NA"
    }
    c(predicted_class, ratio)
}

In [10]:
# Majority-vote prediction for the rows labelled "Memory CD4 T", seed 20.
RFClassifier(
    expr = t(as.data.frame(pbmc@assays$RNA@counts)),
    clusters = df_exprs[["g_type"]],
    target = "Memory CD4 T",
    seed = 20
)

In [8]:
# Display the forest currently bound to `rf`.
# NOTE(review): the captured output below reports ntree = 20 and maxnodes = 80,
# not the ntree = 8 / maxnodes = 75 used in the training cell -- it appears to
# come from a different run; re-execute to refresh.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs, ntree = 20,      maxnodes = 80) 
               Type of random forest: classification
                     Number of trees: 20
No. of variables tried at each split: 117

        OOB estimate of  error rate: 29.83%
Confusion matrix:
             Memory CD4 T   B CD14+ Mono  NK CD8 T Naive CD4 T FCGR3A+ Mono DC
Memory CD4 T          138   3          3   0    13         315            0  0
B                       3 284          1   0     0          56            0  0
CD14+ Mono              1   1        461   0     0           8            9  0
NK                      1   1          1 101    32           8            0  0
CD8 T                  29   3          3  16    99         129            0  0
Naive CD4 T            42   3          0   1     3         662            0  0
FCGR3A+ Mono            0   0         60   0     0           4           98  0
DC                      1   0         19   0     0           0            6 

In [39]:
# Same prediction for "Memory CD4 T" with a different seed (19).
RFClassifier(
    expr = t(as.data.frame(pbmc@assays$RNA@counts)),
    clusters = df_exprs[["g_type"]],
    target = "Memory CD4 T",
    seed = 19
)

In [37]:
# Majority-vote prediction for the rows labelled "Memory CD4 T", seed 20.
RFClassifier(
    expr = t(as.data.frame(pbmc@assays$RNA@counts)),
    clusters = df_exprs[["g_type"]],
    target = "Memory CD4 T",
    seed = 20
)

In [40]:
# Share of the "Memory CD4 T" confusion-matrix row that falls in the
# "Naive CD4 T" column: 315 over the row's non-zero counts.
315 / sum(138, 3, 3, 13, 315)

In [41]:
# Keep a handle on the current forest before `rf` is replaced.
rf_prev <- rf

In [9]:
# Restore the objects stored in a previously saved classifier file.
# NOTE(review): presumably this rebinds `rf` -- confirm the file's contents.
load(file = "classifiers/rf_seed20_ntree20_maxnodes80_CLONE")

In [10]:
# Display the forest bound to `rf` after the load() call: call, OOB error
# estimate and confusion matrix.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs_4buildRF,      ntree = ntree, maxnodes = maxnodes) 
               Type of random forest: classification
                     Number of trees: 20
No. of variables tried at each split: 117

        OOB estimate of  error rate: 26.84%
Confusion matrix:
             Naive CD4 T CD14+ Mono Memory CD4 T   B CD8 T FCGR3A+ Mono NK DC
Naive CD4 T          658          3           34   9     7            0  0  0
CD14+ Mono             7        461            0   1     0           11  0  0
Memory CD4 T         288          5          162   3    11            1  2  0
B                     26          4            4 309     1            0  0  0
CD8 T                 96          3           36   3   128            0 13  0
FCGR3A+ Mono           3         58            0   1     0          100  0  0
NK                    11          0            3   1    34            0 95  0
DC                     1         19            1   0     0        

In [43]:
# Majority-vote prediction for the rows labelled "Memory CD4 T", seed 20.
RFClassifier(
    expr = t(as.data.frame(pbmc@assays$RNA@counts)),
    clusters = df_exprs[["g_type"]],
    target = "Memory CD4 T",
    seed = 20
)

In [32]:
# Display the forest currently bound to `rf`.
# NOTE(review): the captured output below reports ntree = 20 and maxnodes = 80,
# not the ntree = 8 / maxnodes = 75 used in the training cell -- it appears to
# come from a different run; re-execute to refresh.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs, ntree = 20,      maxnodes = 80) 
               Type of random forest: classification
                     Number of trees: 20
No. of variables tried at each split: 117

        OOB estimate of  error rate: 29.83%
Confusion matrix:
             Memory CD4 T   B CD14+ Mono  NK CD8 T Naive CD4 T FCGR3A+ Mono DC
Memory CD4 T          138   3          3   0    13         315            0  0
B                       3 284          1   0     0          56            0  0
CD14+ Mono              1   1        461   0     0           8            9  0
NK                      1   1          1 101    32           8            0  0
CD8 T                  29   3          3  16    99         129            0  0
Naive CD4 T            42   3          0   1     3         662            0  0
FCGR3A+ Mono            0   0         60   0     0           4           98  0
DC                      1   0         19   0     0           0            6 

In [16]:
# Persist the forest currently bound to `rf` under a name that records the
# seed / ntree / maxnodes settings.
save(rf, file = "classifiers/rf_seed20_ntree20_maxnodes80")

In [25]:
# Display the forest currently bound to `rf`.
# NOTE(review): the captured output below reports ntree = 20 and maxnodes = 80,
# not the ntree = 8 / maxnodes = 75 used in the training cell -- it appears to
# come from a different run; re-execute to refresh.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs, ntree = 20,      maxnodes = 80) 
               Type of random forest: classification
                     Number of trees: 20
No. of variables tried at each split: 117

        OOB estimate of  error rate: 29.83%
Confusion matrix:
             Memory CD4 T   B CD14+ Mono  NK CD8 T Naive CD4 T FCGR3A+ Mono DC
Memory CD4 T          138   3          3   0    13         315            0  0
B                       3 284          1   0     0          56            0  0
CD14+ Mono              1   1        461   0     0           8            9  0
NK                      1   1          1 101    32           8            0  0
CD8 T                  29   3          3  16    99         129            0  0
Naive CD4 T            42   3          0   1     3         662            0  0
FCGR3A+ Mono            0   0         60   0     0           4           98  0
DC                      1   0         19   0     0           0            6 

In [17]:
# Display the forest currently bound to `rf`.
# NOTE(review): the captured output below reports ntree = 20 and maxnodes = 80,
# not the ntree = 8 / maxnodes = 75 used in the training cell -- it appears to
# come from a different run; re-execute to refresh.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs, ntree = 20,      maxnodes = 80) 
               Type of random forest: classification
                     Number of trees: 20
No. of variables tried at each split: 117

        OOB estimate of  error rate: 29.83%
Confusion matrix:
             Memory CD4 T   B CD14+ Mono  NK CD8 T Naive CD4 T FCGR3A+ Mono DC
Memory CD4 T          138   3          3   0    13         315            0  0
B                       3 284          1   0     0          56            0  0
CD14+ Mono              1   1        461   0     0           8            9  0
NK                      1   1          1 101    32           8            0  0
CD8 T                  29   3          3  16    99         129            0  0
Naive CD4 T            42   3          0   1     3         662            0  0
FCGR3A+ Mono            0   0         60   0     0           4           98  0
DC                      1   0         19   0     0           0            6 

In [13]:
# Display the forest currently bound to `rf`.
# NOTE(review): the captured output below shows the same call text
# (ntree = 20, maxnodes = 80) as other outputs in this notebook but a different
# OOB error (24.49%), so it appears to come from yet another fit.
rf


Call:
 randomForest(formula = g_type ~ ., data = df_exprs, ntree = 20,      maxnodes = 80) 
               Type of random forest: classification
                     Number of trees: 20
No. of variables tried at each split: 117

        OOB estimate of  error rate: 24.49%
Confusion matrix:
             Memory CD4 T   B CD14+ Mono  NK CD8 T Naive CD4 T FCGR3A+ Mono DC
Memory CD4 T          179   1          1   1    11         279            0  0
B                       0 320          0   0     0          22            1  1
CD14+ Mono              0   2        465   0     0           6            7  0
NK                      3   0          0 124    15           2            0  0
CD8 T                  35   2          2  17   129          94            0  0
Naive CD4 T            45   1          1   0     5         659            0  0
FCGR3A+ Mono            0   1         55   0     0           1          105  0
DC                      0   4         11   0     0           1            9 