In [1]:
### Set up environment
## Import libraries
import os
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import anndata
from xgboost import XGBClassifier
import pickle
from sklearn.preprocessing import MinMaxScaler
import time
from sklearn.decomposition import PCA
import harmonypy as hm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Make the analysis directory the working directory: the data-loading cells
# below open their inputs through relative "output/..." paths.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
os.chdir("/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/2022_12_07_new_multilin_panel/")

In [2]:
# ARI group labels, listed in the order they should be ranked.
order_ari_groups = [
    "HSCP", "CD127", "preMegE", "MkP",
    "preCFU-E", "CFU-E", "preGM",
    "Ly6C-1", "Ly6C-2", "Ly6C-3", "Ly6C-4",
    "W", "X", "Y", "Z",
    "IG2", "proNeu1", "proNeu2",
    "preNeu1", "preNeu2", "preNeu3",
    "immNeu1", "immNeu2",
]

# Replace the list with a Series that maps each label to its 1-based position.
order_ari_groups = pd.Series(
    {label: rank for rank, label in enumerate(order_ari_groups, start=1)})

order_ari_groups

HSCP         1
CD127        2
preMegE      3
MkP          4
preCFU-E     5
CFU-E        6
preGM        7
Ly6C-1       8
Ly6C-2       9
Ly6C-3      10
Ly6C-4      11
W           12
X           13
Y           14
Z           15
IG2         16
proNeu1     17
proNeu2     18
preNeu1     19
preNeu2     20
preNeu3     21
immNeu1     22
immNeu2     23
dtype: int64

In [10]:
### Mapped CITE-seq ADT and RNA, all cells
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/without_adt_umi_filtering/"
    "mapped_cite_ari_adt_rna_all_cells.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_rna.fea")

# CITE-seq rows are keyed by the "index" column; flow is used with whatever
# index read_feather returns.
cite = pd.read_feather(path_cite)
cite = cite.set_index("index")
flow = pd.read_feather(path_flow)

# Cell annotations
# NOTE(review): the flow annotation comes from the "adt_alone" groups CSV
# although the flow features are "adt_rna" — presumably the same cells; confirm.
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "without_adt_umi_filtering/cell_anno_cite_ari_all_cells.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_all_cells_mapped_adt_rna = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_all_cells_mapped_adt_rna = tmp_encoder.inverse_transform(
    model_all_cells_mapped_adt_rna.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("model_all_cells_mapped_adt_rna ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_all_cells_mapped_adt_rna)))


[01:16:58] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:16:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 30 extra nodes, 0 pruned nodes, max_depth=6
[01:16:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=6
[01:16:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 44 extra nodes, 0 pruned nodes, max_depth=6
[01:16:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 68 extra nodes, 0 pruned nodes, max_depth=6
[01:16:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 22 extra nodes, 0 pruned nodes, max_depth=6
[01:16:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 36 extra nodes, 0 pruned nodes, max_depth=6
[01:16:59] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=4
[01:16:59] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 84 extra nodes, 0 pruned nodes, max_depth=6
[01:16:59] INFO: ../src/tree/updater_pr

In [11]:
### Pre-mapped CITE-seq ADT alone, all cells
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/without_adt_umi_filtering/"
    "pre_mapped_cite_ari_adt_alone_all_cells.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_alone.fea")

# A "final" column, if present, is renamed to "index" before it becomes the
# row index; flow is used with whatever index read_feather returns.
cite = (
    pd.read_feather(path_cite)
    .rename(columns={"final": "index"})
    .set_index("index"))
flow = pd.read_feather(path_flow)

# Cell annotations
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "without_adt_umi_filtering/cell_anno_cite_ari_all_cells.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_all_cells_pre_mapped_adt_alone = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_all_cells_pre_mapped_adt_alone = tmp_encoder.inverse_transform(
    model_all_cells_pre_mapped_adt_alone.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("model_all_cells_pre_mapped_adt_alone ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_all_cells_pre_mapped_adt_alone)))


[01:18:34] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:18:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 86 extra nodes, 0 pruned nodes, max_depth=6
[01:18:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=6
[01:18:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 92 extra nodes, 0 pruned nodes, max_depth=6
[01:18:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 70 extra nodes, 0 pruned nodes, max_depth=6
[01:18:35] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 22 extra nodes, 0 pruned nodes, max_depth=6
[01:18:35] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 38 extra nodes, 0 pruned nodes, max_depth=6
[01:18:35] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=4
[01:18:35] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 84 extra nodes, 0 pruned nodes, max_depth=6
[01:18:35] INFO: ../src/tree/updater_pr

In [12]:
### Pre-mapped CITE-seq ADT and RNA, all cells
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/without_adt_umi_filtering/"
    "pre_mapped_cite_ari_adt_rna_all_cells.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_rna.fea")

# A "final" column, if present, is renamed to "index" before it becomes the
# row index; flow is used with whatever index read_feather returns.
cite = (
    pd.read_feather(path_cite)
    .rename(columns={"final": "index"})
    .set_index("index"))
flow = pd.read_feather(path_flow)

# Cell annotations
# NOTE(review): the flow annotation comes from the "adt_alone" groups CSV
# although the flow features are "adt_rna" — presumably the same cells; confirm.
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "without_adt_umi_filtering/cell_anno_cite_ari_all_cells.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_all_cells_pre_mapped_adt_rna = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_all_cells_pre_mapped_adt_rna = tmp_encoder.inverse_transform(
    model_all_cells_pre_mapped_adt_rna.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("model_all_cells_pre_mapped_adt_rna ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_all_cells_pre_mapped_adt_rna)))


[01:19:40] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:19:40] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 32 extra nodes, 0 pruned nodes, max_depth=6
[01:19:40] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=6
[01:19:40] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 48 extra nodes, 0 pruned nodes, max_depth=6
[01:19:40] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 68 extra nodes, 0 pruned nodes, max_depth=6
[01:19:40] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 22 extra nodes, 0 pruned nodes, max_depth=6
[01:19:40] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 38 extra nodes, 0 pruned nodes, max_depth=6
[01:19:41] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=4
[01:19:41] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 82 extra nodes, 0 pruned nodes, max_depth=6
[01:19:41] INFO: ../src/tree/updater_pr

In [13]:
### Mapped CITE-seq ADT and RNA, middle-50-percentile ADT UMI subset
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "mapped_cite_ari_adt_rna_middle_50_percentile.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_rna.fea")

# A "final" column, if present, is renamed to "index" before it becomes the
# row index; flow is used with whatever index read_feather returns.
cite = (
    pd.read_feather(path_cite)
    .rename(columns={"final": "index"})
    .set_index("index"))
flow = pd.read_feather(path_flow)

# Cell annotations
# NOTE(review): the flow annotation comes from the "adt_alone" groups CSV
# although the flow features are "adt_rna" — presumably the same cells; confirm.
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "cell_anno_cite_ari_middle_50_percentile_adt_umi.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_mid50_mapped_adt_rna = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_mid50_mapped_adt_rna = tmp_encoder.inverse_transform(
    model_mid50_mapped_adt_rna.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

# Label the score with the model name, as the sibling cells do; a bare
# "ARI score" line cannot be attributed once several runs are printed.
print("model_mid50_mapped_adt_rna ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_mid50_mapped_adt_rna)))


[01:21:03] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:21:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 32 extra nodes, 0 pruned nodes, max_depth=6
[01:21:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=4
[01:21:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 34 extra nodes, 0 pruned nodes, max_depth=6
[01:21:04] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 46 extra nodes, 0 pruned nodes, max_depth=6
[01:21:04] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=4
[01:21:04] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=5
[01:21:04] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4 extra nodes, 0 pruned nodes, max_depth=2
[01:21:04] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 56 extra nodes, 0 pruned nodes, max_depth=6
[01:21:04] INFO: ../src/tree/updater_pru

In [14]:
### Pre-mapped CITE-seq ADT alone, middle-50-percentile ADT UMI subset
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "pre_mapped_cite_ari_adt_alone_middle_50_percentile.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_alone.fea")

# A "final" column, if present, is renamed to "index" before it becomes the
# row index; flow is used with whatever index read_feather returns.
cite = (
    pd.read_feather(path_cite)
    .rename(columns={"final": "index"})
    .set_index("index"))
flow = pd.read_feather(path_flow)

# Cell annotations
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "cell_anno_cite_ari_middle_50_percentile_adt_umi.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_mid50_pre_mapped_adt_alone = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_mid50_pre_mapped_adt_alone = tmp_encoder.inverse_transform(
    model_mid50_pre_mapped_adt_alone.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("model_mid50_pre_mapped_adt_alone ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_mid50_pre_mapped_adt_alone)))


[01:21:34] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=4
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 74 extra nodes, 0 pruned nodes, max_depth=6
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 42 extra nodes, 0 pruned nodes, max_depth=6
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 16 extra nodes, 0 pruned nodes, max_depth=6
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 26 extra nodes, 0 pruned nodes, max_depth=5
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4 extra nodes, 0 pruned nodes, max_depth=2
[01:21:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6
[01:21:34] INFO: ../src/tree/updater_pru

In [15]:
### Pre-mapped CITE-seq ADT and RNA, middle-50-percentile ADT UMI subset
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "pre_mapped_cite_ari_adt_rna_middle_50_percentile.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_rna.fea")

# A "final" column, if present, is renamed to "index" before it becomes the
# row index; flow is used with whatever index read_feather returns.
cite = (
    pd.read_feather(path_cite)
    .rename(columns={"final": "index"})
    .set_index("index"))
flow = pd.read_feather(path_flow)

# Cell annotations
# NOTE(review): the flow annotation comes from the "adt_alone" groups CSV
# although the flow features are "adt_rna" — presumably the same cells; confirm.
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "cell_anno_cite_ari_middle_50_percentile_adt_umi.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_mid50_pre_mapped_adt_rna = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_mid50_pre_mapped_adt_rna = tmp_encoder.inverse_transform(
    model_mid50_pre_mapped_adt_rna.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("model_mid50_pre_mapped_adt_rna ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_mid50_pre_mapped_adt_rna)))


[01:21:56] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 32 extra nodes, 0 pruned nodes, max_depth=6
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=4
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 48 extra nodes, 0 pruned nodes, max_depth=6
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 46 extra nodes, 0 pruned nodes, max_depth=6
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=4
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=5
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4 extra nodes, 0 pruned nodes, max_depth=2
[01:21:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 60 extra nodes, 0 pruned nodes, max_depth=6
[01:21:56] INFO: ../src/tree/updater_pru

In [16]:
### Mapped CITE-seq ADT alone, middle-50-percentile ADT UMI subset
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "mapped_cite_ari_adt_alone_middle_50_percentile.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_alone.fea")

# A "final" column, if present, is renamed to "index" before it becomes the
# row index; flow is used with whatever index read_feather returns.
cite = (
    pd.read_feather(path_cite)
    .rename(columns={"final": "index"})
    .set_index("index"))
flow = pd.read_feather(path_flow)

# Cell annotations
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "middle_50_percentile_adt_umi_filtering/"
    "cell_anno_cite_ari_middle_50_percentile_adt_umi.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_mid50_mapped_adt_alone = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_mid50_mapped_adt_alone = tmp_encoder.inverse_transform(
    model_mid50_mapped_adt_alone.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("model_mid50_mapped_adt_alone ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_mid50_mapped_adt_alone)))


[01:22:24] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=4
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 76 extra nodes, 0 pruned nodes, max_depth=6
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 42 extra nodes, 0 pruned nodes, max_depth=6
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 16 extra nodes, 0 pruned nodes, max_depth=6
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 26 extra nodes, 0 pruned nodes, max_depth=5
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4 extra nodes, 0 pruned nodes, max_depth=2
[01:22:25] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 54 extra nodes, 0 pruned nodes, max_depth=6
[01:22:25] INFO: ../src/tree/updater_pru

In [17]:
### Mapped CITE-seq ADT alone, all cells
# Train an XGBoost classifier on the CITE-seq features, predict the group of
# every annotated flow cell, and report the adjusted Rand index (ARI) between
# the flow annotation and the predictions.

# Feature tables (feather), relative to the working directory
path_cite = (
    "output/cite_inflow_integration_input/without_adt_umi_filtering/"
    "mapped_cite_ari_adt_alone_all_cells.fea")
path_flow = (
    "output/cite_inflow_integration_input/gated_inflow_inputs/"
    "inflow_gated_1k_each_ari_groups_adt_alone.fea")

# CITE-seq rows are keyed by the "index" column; flow is used with whatever
# index read_feather returns.
cite = pd.read_feather(path_cite)
cite = cite.set_index("index")
flow = pd.read_feather(path_flow)

# Cell annotations
cite_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "without_adt_umi_filtering/cell_anno_cite_ari_all_cells.csv")
flow_anno = pd.read_csv(
    "output/cite_inflow_integration_input/"
    "gated_inflow_inputs/inflow_gated_1k_each_ari_groups_adt_alone_groups.csv",
    header=None)

# Two-column (cell, group) annotations. The header-less flow table is renamed
# in place, so flow_anno and tmp_anno_flow are the same object.
tmp_anno_cite = cite_anno[["cell", "ari_group"]].rename(
    columns={"ari_group": "group"})
flow_anno.columns = ["cell", "group"]
tmp_anno_flow = flow_anno

# Integer-encode the CITE-seq group labels for the classifier
tmp_encoder = LabelEncoder()
cells_encoded = tmp_encoder.fit_transform(tmp_anno_cite["group"].values)

# Fit on the CITE-seq rows listed in the annotation (fit returns the estimator).
# NOTE(review): fit and predict both receive raw .values, so the CITE-seq and
# flow tables presumably share the same column order — confirm.
model_all_cells_mapped_adt_alone = XGBClassifier(n_jobs=8, verbosity=3).fit(
    cite.loc[tmp_anno_cite["cell"].values].values,
    cells_encoded)

# Predict the flow cells and decode the integer classes back to group labels
preds_all_cells_mapped_adt_alone = tmp_encoder.inverse_transform(
    model_all_cells_mapped_adt_alone.predict(
        flow.loc[tmp_anno_flow["cell"].values].values))

print("All cells mapped adt alone, ARI score: {}".format(
    adjusted_rand_score(
        tmp_anno_flow["group"].values,
        preds_all_cells_mapped_adt_alone)))


[01:22:45] DEBUG: ../src/gbm/gbtree.cc:156: Using tree method: 2
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 82 extra nodes, 0 pruned nodes, max_depth=6
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=6
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 94 extra nodes, 0 pruned nodes, max_depth=6
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 70 extra nodes, 0 pruned nodes, max_depth=6
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 20 extra nodes, 0 pruned nodes, max_depth=5
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 38 extra nodes, 0 pruned nodes, max_depth=6
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=4
[01:22:46] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 84 extra nodes, 0 pruned nodes, max_depth=6
[01:22:46] INFO: ../src/tree/updater_pr