
Commit

normalize between-batch affinities by rowwise magnitude of within-batch affinities
scottgigante committed Nov 17, 2018
1 parent d63007b commit 718d191
Showing 2 changed files with 58 additions and 17 deletions.
27 changes: 18 additions & 9 deletions graphtools/graphs.py
@@ -911,9 +911,9 @@ class MNNGraph(DataGraph):
         Batch index
     beta : `float`, optional (default: 1)
-        Downweight within-batch affinities by beta
+        Downweight between-batch affinities by beta
-    adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: 'sqrt')
+    adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: None)
         Weights MNN kernel adaptively using the number of cells in
         each sample according to the selected method.
@@ -925,7 +925,7 @@ class MNNGraph(DataGraph):

     def __init__(self, data, sample_idx,
                  knn=5, beta=1, n_pca=None,
-                 adaptive_k='sqrt',
+                 adaptive_k=None,
                  decay=None,
                  bandwidth=None,
                  distance='euclidean',
@@ -1116,7 +1116,7 @@ def build_kernel(self):
                 verbose=self.verbose,
                 random_state=self.random_state,
                 n_jobs=self.n_jobs,
-                initialize=False)
+                initialize=True)
             self.subgraphs.append(graph)  # append to list of subgraphs
         tasklogger.log_complete("subgraphs")

@@ -1126,16 +1126,25 @@ def build_kernel(self):
         else:
             K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]])
             for i, X in enumerate(self.subgraphs):
+                K = set_submatrix(K, self.sample_idx == self.samples[i],
+                                  self.sample_idx == self.samples[i], X.K)
+                within_batch_norm = np.array(np.sum(X.K, 1)).flatten()
                 for j, Y in enumerate(self.subgraphs):
+                    if i == j:
+                        continue
                     tasklogger.log_start(
                         "kernel from sample {} to {}".format(self.samples[i],
                                                              self.samples[j]))
                     Kij = Y.build_kernel_to_data(
                         X.data_nu,
                         knn=self.weighted_knn[i])
-                    if i == j:
-                        # downweight within-batch affinities by beta
-                        Kij = Kij * self.beta
+                    between_batch_norm = np.array(np.sum(Kij, 1)).flatten()
+                    scale = np.minimum(1, within_batch_norm /
+                                       between_batch_norm) * self.beta
+                    if sparse.issparse(Kij):
+                        Kij = Kij.multiply(scale[:, None])
+                    else:
+                        Kij = Kij * scale[:, None]
                     K = set_submatrix(K, self.sample_idx == self.samples[i],
                                       self.sample_idx == self.samples[j], Kij)
                     tasklogger.log_complete(
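
For intuition, here is a minimal standalone sketch of the rowwise rescaling this hunk introduces, using dense NumPy arrays; the random Kii/Kij stand-ins, their shapes, and the beta value are illustrative, not taken from the library:

    import numpy as np

    # Kii: within-batch kernel for sample i; Kij: kernel from sample i
    # to sample j; beta: between-batch downweighting factor (illustrative).
    rng = np.random.default_rng(42)
    Kii = rng.uniform(size=(5, 5))  # stand-in within-batch affinities
    Kij = rng.uniform(size=(5, 3))  # stand-in between-batch affinities
    beta = 0.5

    within_batch_norm = Kii.sum(axis=1)    # rowwise magnitude within batch
    between_batch_norm = Kij.sum(axis=1)   # rowwise magnitude between batches
    # cap each between-batch row at its within-batch magnitude,
    # then downweight uniformly by beta
    scale = np.minimum(1, within_batch_norm / between_batch_norm) * beta
    Kij_scaled = Kij * scale[:, None]

    # each rescaled row sums to at most beta times the within-batch row sum
    assert np.all(Kij_scaled.sum(axis=1) <= beta * within_batch_norm + 1e-12)

Capping the ratio at 1 keeps between-batch affinities from dominating within-batch ones, while beta further downweights them uniformly.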
@@ -1147,11 +1156,11 @@ def symmetrize_kernel(self, K):
         if self.kernel_symm == 'theta' and self.theta is not None and \
                 not isinstance(self.theta, numbers.Number):
             # matrix theta
-            # Gamma can be a matrix with specific values transitions for
+            # Theta can be a matrix with specific values transitions for
             # each batch. This allows for technical replicates and
             # experimental samples to be corrected simultaneously
             tasklogger.log_debug("Using theta symmetrization. "
-                                 "Gamma:\n{}".format(self.theta))
+                                 "Theta:\n{}".format(self.theta))
             for i, sample_i in enumerate(self.samples):
                 for j, sample_j in enumerate(self.samples):
                     if j < i:
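
For reference, a sketch of the theta symmetrization applied here, assuming a dense kernel K and a scalar theta; the formula matches the one exercised in the tests below, and with matrix theta the same interpolation is applied per batch pair:

    import numpy as np

    def theta_symmetrize(K, theta):
        # theta = 1 recovers min(K, K.T) (strict mutual nearest neighbors);
        # theta = 0 recovers max(K, K.T); values in between interpolate
        return theta * np.minimum(K, K.T) + (1 - theta) * np.maximum(K, K.T)

    K = np.array([[0.0, 0.8],
                  [0.2, 0.0]])
    print(theta_symmetrize(K, 0.9))  # off-diagonal entries become 0.26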
48 changes: 40 additions & 8 deletions test/test_mnn.py
@@ -12,6 +12,7 @@
     raises,
     cdist,
 )
+from scipy.linalg import norm


#####################################################
@@ -116,7 +117,7 @@ def test_mnn_graph_float_theta():
     k = 10
     a = 20
     metric = 'euclidean'
-    beta = 0
+    beta = 0.5
     samples = np.unique(sample_idx)

     K = np.zeros((len(X), len(X)))
@@ -133,17 +134,32 @@
             pdxe_ij = pdx_ij / e_ij[:, np.newaxis]  # normalize
             k_ij = np.exp(-1 * (pdxe_ij ** a))  # apply alpha-decaying kernel
             if si == sj:
-                K.iloc[sample_idx == si, sample_idx == sj] = k_ij * \
-                    (1 - beta)  # fill out values in K for NN on diagonal
+                K.iloc[sample_idx == si, sample_idx == sj] = (
+                    k_ij + k_ij.T) / 2
             else:
                 # fill out values in K for NN on diagonal
                 K.iloc[sample_idx == si, sample_idx == sj] = k_ij

+    Kn = K.copy()
+    for i in samples:
+        curr_K = K.iloc[sample_idx == i, sample_idx == i]
+        i_norm = norm(curr_K, 1, axis=1)
+        for j in samples:
+            if i == j:
+                continue
+            else:
+                curr_K = K.iloc[sample_idx == i, sample_idx == j]
+                curr_norm = norm(curr_K, 1, axis=1)
+                scale = np.minimum(
+                    np.ones(len(curr_norm)), i_norm / curr_norm) * beta
+                Kn.iloc[sample_idx == i, sample_idx == j] = (
+                    curr_K.T * scale).T
+
+    K = Kn
     W = np.array((theta * np.minimum(K, K.T)) +
                  ((1 - theta) * np.maximum(K, K.T)))
     np.fill_diagonal(W, 0)
     G = pygsp.graphs.Graph(W)
-    G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta,
+    G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=beta,
                           kernel_symm='theta', theta=theta,
                           distance=metric, sample_idx=sample_idx, thresh=0,
                           use_pygsp=True)
@@ -179,11 +195,27 @@ def test_mnn_graph_matrix_theta():
             pdxe_ij = pdx_ij / e_ij[:, np.newaxis]  # normalize
             k_ij = np.exp(-1 * (pdxe_ij ** a))  # apply alpha-decaying kernel
             if si == sj:
-                K.iloc[sample_idx == si, sample_idx == sj] = k_ij * \
-                    (1 - beta)  # fill out values in K for NN on diagonal
+                K.iloc[sample_idx == si, sample_idx == sj] = (
+                    k_ij + k_ij.T) / 2
             else:
                 # fill out values in K for NN on diagonal
                 K.iloc[sample_idx == si, sample_idx == sj] = k_ij
+    Kn = K.copy()
+    for i in samples:
+        curr_K = K.iloc[sample_idx == i, sample_idx == i]
+        i_norm = norm(curr_K, 1, axis=1)
+        for j in samples:
+            if i == j:
+                continue
+            else:
+                curr_K = K.iloc[sample_idx == i, sample_idx == j]
+                curr_norm = norm(curr_K, 1, axis=1)
+                scale = np.minimum(
+                    np.ones(len(curr_norm)), i_norm / curr_norm) * beta
+                Kn.iloc[sample_idx == i, sample_idx == j] = (
+                    curr_K.T * scale).T
+
+    K = Kn

     K = np.array(K)

@@ -197,7 +229,7 @@ def test_mnn_graph_matrix_theta():
                  ((1 - matrix_theta) * np.maximum(K, K.T)))
     np.fill_diagonal(W, 0)
     G = pygsp.graphs.Graph(W)
-    G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=1 - beta,
+    G2 = graphtools.Graph(X, knn=k + 1, decay=a, beta=beta,
                           kernel_symm='theta', theta=theta,
                           distance=metric, sample_idx=sample_idx, thresh=0,
                           use_pygsp=True)
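
A usage sketch based on the call signature exercised in these tests; the synthetic X and sample_idx here are placeholders, and under the new semantics beta directly downweights between-batch affinities (beta=1 leaves the normalized affinities unchanged, beta=0 removes them):

    import numpy as np
    import graphtools

    # placeholder synthetic data: two batches of 100 points in 50 dimensions
    X = np.random.normal(size=(200, 50))
    sample_idx = np.repeat([0, 1], 100)

    G = graphtools.Graph(X, knn=11, decay=20, beta=0.5,
                         kernel_symm='theta', theta=0.9,
                         distance='euclidean', sample_idx=sample_idx,
                         thresh=0, use_pygsp=True)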
