From 7bf7e90a358d9b0ea2112925b77769b8041ae966 Mon Sep 17 00:00:00 2001 From: Johannes Weytjens Date: Mon, 8 Jul 2019 13:30:45 +0200 Subject: [PATCH 1/5] implement max weighted bipartite matching with networkx --- recordlinkage/network.py | 142 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 131 insertions(+), 11 deletions(-) mode change 100644 => 100755 recordlinkage/network.py diff --git a/recordlinkage/network.py b/recordlinkage/network.py old mode 100644 new mode 100755 index 8cbab09d..3dd2addb --- a/recordlinkage/network.py +++ b/recordlinkage/network.py @@ -1,7 +1,9 @@ import pandas as pd +import networkx as nx from recordlinkage.types import is_pandas_2d_multiindex from recordlinkage.types import is_pandas_multiindex +from recordlinkage.types import is_pandas_like class OneToOneLinking(object): @@ -16,8 +18,7 @@ class OneToOneLinking(object): Parameters ---------- method : str - The method to solve the problem. Only 'greedy' is supported at - the moment. + The method to solve the problem. The options are 'greedy' and 'max_weighted'. Note ---- @@ -31,6 +32,21 @@ def __init__(self, method='greedy'): self.method = method + def _add_similarity_weights(self, links, data): + """Add the similarity weights to the MultiIndex with candidate links.""" + + # calculate the total weight and remove all other columns + initial_columns = data.columns + data["weight"] = data.sum(axis=1) + + # slicing on a multiindex is equivalent to merging on two columns + data = data.drop(columns=initial_columns).reset_index() + links = links.to_frame(index=False).rename(columns={0: "level_0", 1: "level_1"}) + links = links.merge(data, how="left", on=["level_0", "level_1"]).set_index(["level_0", "level_1"]) + links.index.names = [None, None] + + return links + @classmethod def _bool_duplicated(cls, links, level): @@ -50,7 +66,42 @@ def _compute_greedy(self, links): return pd.MultiIndex.from_tuples(result) - def _compute(self, links): + def _compute_max_weighted(self, links, data): + """Compute a one to one linking by maximizing the total similarity weight.""" + + links = self._add_similarity_weights(links, data) + graph = self._to_weighted_bipartite_graph(links) + + max_weighted_graph = self._max_weighted_graph(graph) + max_weighted_dataframe = self._to_max_weighted_dataframe(max_weighted_graph) + + return max_weighted_dataframe + + def _max_weighted_graph(self, graph): + """Calculate the maximally weighted bipartite graph.""" + + # max weight matching + max_weighted_edges = nx.algorithms.matching.max_weight_matching(graph) + + # restore order after matching + max_weighted_edges = self._order_max_weighted_bipartite_graph(graph, max_weighted_edges) + + # create maximally weighted graph + weights = [graph[u][v]["weight"] for u, v in max_weighted_edges] + max_weighted_left = [edge[0] for edge in max_weighted_edges] + max_weighted_right = [edge[1] for edge in max_weighted_edges] + + max_weighted_graph = nx.Graph() + + max_weighted_graph.add_nodes_from(max_weighted_left, bipartite=0) + max_weighted_graph.add_nodes_from(max_weighted_right, bipartite=1) + max_weighted_graph.add_weighted_edges_from( + list(zip(max_weighted_left, max_weighted_right, weights)) + ) + + return max_weighted_graph + + def _compute(self, links, data): if not is_pandas_2d_multiindex(links): if not is_pandas_multiindex(links): raise TypeError("expected pandas.MultiIndex") @@ -58,19 +109,93 @@ def _compute(self, links): raise ValueError( "pandas.MultiIndex has incorrect number of " "levels (expected 2 levels)") + if data and (not 
is_pandas_like(data)):
+            raise TypeError("expected pandas.DataFrame")
 
-        if self.method == 'greedy':
+        if self.method == "greedy":
             return self._compute_greedy(links)
+        elif self.method == "max_weighted":
+            return self._compute_max_weighted(links, data)
         else:
             raise ValueError("unknown matching method {}".format(self.method))
 
-    def compute(self, links):
+    def _order_max_weighted_bipartite_graph(self, graph, max_weighted_edges):
+        """Swaps the order of edges that are swapped after max. weight matching."""
+
+        edges_left = list(set(edge[0] for edge in graph.edges))
+
+        max_weighted_left = [edge[0] for edge in max_weighted_edges]
+        max_weighted_right = [edge[1] for edge in max_weighted_edges]
+
+        for i, value in enumerate(max_weighted_left):
+            if value not in edges_left:
+                max_weighted_left[i], max_weighted_right[i] = (
+                    max_weighted_right[i],
+                    max_weighted_left[i],
+                )
+
+        ordered_max_weighted_edges = list(zip(max_weighted_left, max_weighted_right))
+
+        return ordered_max_weighted_edges
+
+    def _to_weighted_bipartite_graph(self, links):
+        """Convert a pandas DataFrame with MultiIndex and single column weight to a bipartite graph with weighted edges."""
+
+        # add labels to both multiindex levels to ensure no overlap of nodes in the graph
+        links = links.set_index(self._add_node_labels_to_multiindex(links.index))
+        links = links.reset_index()
+
+        # create the graph
+        graph = nx.Graph()
+
+        graph.add_nodes_from(links["level_0"], bipartite=0)
+        graph.add_nodes_from(links["level_1"], bipartite=1)
+
+        graph.add_weighted_edges_from(
+            list(zip(links["level_0"], links["level_1"], links["weight"]))
+        )
+
+        return graph
+
+    def _to_max_weighted_dataframe(self, graph):
+        """Convert a (max weighted) bipartite graph to a DataFrame."""
+
+        max_weighted_dataframe = nx.to_pandas_edgelist(graph)
+
+        # ensure output format is the same as the format of the initial candidate links
+        max_weighted_dataframe = max_weighted_dataframe.set_index(["source", "target"])
+        max_weighted_dataframe.index.names = [None, None]
+        max_weighted_dataframe = max_weighted_dataframe.set_index(
+            self._remove_node_labels_from_multiindex(max_weighted_dataframe.index)
+        ).sort_index(level=0)
+
+        return max_weighted_dataframe
+
+    def _add_node_labels_to_multiindex(self, multiindex, labels=["left_", "right_"]):
+
+        for i, (level, dataset) in enumerate(zip(multiindex.levels, labels)):
+            stringified_level = [dataset + str(value) for value in level]
+            multiindex = multiindex.set_levels(stringified_level, i)
+
+        return multiindex
+
+    def _remove_node_labels_from_multiindex(self, multiindex, labels=["left_", "right_"]):
+
+        for i, (level, label) in enumerate(zip(multiindex.levels, labels)):
+            destringified_level = [int(value.replace(label, "")) for value in level]
+            multiindex = multiindex.set_levels(destringified_level, i)
+
+        return multiindex
+
+    def compute(self, links, data=None):
         """Compute the one-to-one linking.
 
         Parameters
         ----------
         links : pandas.MultiIndex
             The pairs to apply linking to.
+        data : pandas.DataFrame
+            The similarity weights computed for the entire dataset.
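+
+            This parameter is required for the 'max_weighted' method and is
+            ignored by 'greedy'. A minimal sketch of the intended call
+            (assuming `candidate_links` is the pairs MultiIndex and
+            `features` holds their comparison vectors):
+
+            > one_to_one = OneToOneLinking(method="max_weighted")
+            > one_to_one.compute(candidate_links, features)
+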
Returns ------- @@ -79,7 +204,7 @@ def compute(self, links): """ - return self._compute(links) + return self._compute(links, data) class OneToManyLinking(OneToOneLinking): @@ -181,11 +306,6 @@ def compute(self, links): """ - try: - import networkx as nx - except ImportError(): - raise Exception("'networkx' module is needed for this operation") - G = nx.Graph() G.add_edges_from(links.values) connected_components = nx.connected_component_subgraphs(G) From a6c237d99bc1d6c5d2b66350134d34b6228f887d Mon Sep 17 00:00:00 2001 From: Johannes Weytjens Date: Mon, 8 Jul 2019 14:07:32 +0200 Subject: [PATCH 2/5] add test for max weighted matching --- recordlinkage/network.py | 6 +++--- tests/test_network.py | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/recordlinkage/network.py b/recordlinkage/network.py index 3dd2addb..019214f2 100755 --- a/recordlinkage/network.py +++ b/recordlinkage/network.py @@ -109,7 +109,7 @@ def _compute(self, links, data): raise ValueError( "pandas.MultiIndex has incorrect number of " "levels (expected 2 levels)") - if data and (not is_pandas_like(data)): + if (data is not None) and (not is_pandas_like(data)): raise TypeError("expected pandas.DataFrame") if self.method == "greedy": @@ -257,7 +257,7 @@ def _compute_greedy(self, links): source_dupl_bool = self._bool_duplicated(links, self.level) return links[~source_dupl_bool] - def compute(self, links): + def compute(self, links, data=None): """Compute the one-to-many matching. Parameters @@ -272,7 +272,7 @@ def compute(self, links): """ - return self._compute(links) + return self._compute(links, data) class ConnectedComponents(object): diff --git a/tests/test_network.py b/tests/test_network.py index fac8382f..ac4ec70e 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -10,7 +10,7 @@ ConnectedComponents) -def test_one_to_one_linking(): +def test_one_to_one_linking_greedy(): sample = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3), (3, 4), (3, 5), (4, 4), (5, 5), (6, 5), (7, 7), (7, 7), @@ -23,6 +23,22 @@ def test_one_to_one_linking(): ptm.assert_index_equal(sample_one_to_many, expected) +def test_one_to_one_linking_max_weighted(): + + sample_index = pd.MultiIndex.from_tuples([(1, 1), (2, 1), (2, 2), (2, 3), (3, 3)]) + sample_data = {"c1": [0, 1, 0, 1, 0], "c2": [1, 1, 1, 1, 1], "c3": [1, 1, 1, 1, 1]} + sample = pd.DataFrame(data=sample_data, index=sample_index) + + one_to_one = OneToOneLinking(method="max_weighted") + sample_one_to_one = one_to_one.compute(sample_index, sample) + + expected_index = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)]) + expected_data = {"weight": [2, 2, 2]} + expected = pd.DataFrame(data=expected_data, index=expected_index) + + ptm.assert_frame_equal(sample_one_to_one, expected) + + def test_one_to_many_linking(): sample = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3), (3, 4), (3, 5), From 0024f78032d71779d40c7955b847e966173a5886 Mon Sep 17 00:00:00 2001 From: Johannes Weytjens Date: Wed, 27 Nov 2019 11:44:57 +0100 Subject: [PATCH 3/5] use Series instead of DataFrame; generalize add_weights function --- recordlinkage/network.py | 215 +++++++++++++++++++++++++++------------ 1 file changed, 148 insertions(+), 67 deletions(-) diff --git a/recordlinkage/network.py b/recordlinkage/network.py index 019214f2..79db769c 100755 --- a/recordlinkage/network.py +++ b/recordlinkage/network.py @@ -1,9 +1,10 @@ -import pandas as pd import networkx as nx - -from recordlinkage.types import is_pandas_2d_multiindex -from recordlinkage.types import 
is_pandas_multiindex
-from recordlinkage.types import is_pandas_like
+import pandas as pd
+from recordlinkage.types import (
+    is_pandas_2d_multiindex,
+    is_pandas_like,
+    is_pandas_multiindex,
+)
 
 
 class OneToOneLinking(object):
@@ -18,7 +19,7 @@ class OneToOneLinking(object):
     Parameters
     ----------
     method : str
-        The method to solve the problem. The options are 'greedy' and 'max_weighted'.
+        The method to solve the problem. The options are 'greedy' and 'max_weighted'. The 'max_weighted' option solves the assignment problem, i.e. it finds the one-to-one matching with the greatest combined weight of all links. The matching is done with the Blossom algorithm by Jack Edmonds, as implemented in networkx. For more details, see https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.matching.max_weight_matching.html.
 
     Note
     ----
@@ -27,23 +28,79 @@ class OneToOneLinking(object):
 
     """
 
-    def __init__(self, method='greedy'):
+    def __init__(self, method="greedy"):
         super(OneToOneLinking, self).__init__()
 
         self.method = method
 
-    def _add_similarity_weights(self, links, data):
-        """Add the similarity weights to the MultiIndex with candidate links."""
+    def add_weights(self, links, features=None, classifier=None, method="weights"):
+        """Add match weights to the candidate matches.
+
+        Parameters
+        ----------
+        links : pandas.MultiIndex
+            The candidate matches as calculated by a recordlinkage classifier.
+        features : pandas.DataFrame
+            The dataframe with similarity weights as calculated by a recordlinkage.Compare object.
+        classifier : recordlinkage.base.Classifier
+            The classifier used to classify the records in matches and non-matches.
+        method : str
+            The method to assign weights to the candidate matches. The options are 'weights', 'log_weights' and 'probabilities'. The 'weights' method requires the features to be passed; it adds the sum of the similarity weights from features to the links. Both 'log_weights' and 'probabilities' require that the classifier is passed. 'log_weights' adds the matching weight as defined in the Fellegi-Sunter framework. These weights can be negative, but the 'max_weighted' linking strategy cannot handle negative weights, so all matching weights are offset by the largest total negative matching weight to make them greater than or equal to 0. This method is only available for the ECM and NaiveBayes classifiers. 'probabilities' adds the probability that the record pair is a match as a weight. This method is available for every classifier.
+
+        Returns
+        -------
+        pandas.Series
+
+        Example
+        -------
+
+        Consider a MultiIndex with record pairs constructed from datasets A
+        and B. The candidate matches are determined with a classifier. To link a candidate match from A to at most one record of B with the 'max_weighted' method, weights need to be added to the candidate matches. This can be done by using the features or the classifier.
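+
+        With the classifier, the 'log_weights' weight of a candidate pair
+        amounts to (a sketch of what the code below computes; the exact
+        per-outcome values come from classifier.log_weights):
+
+            weight(pair) = sum(log_weights[c][gamma_c] over all columns c)
+
+        where gamma_c is the pair's comparison outcome in column c, so the
+        sum is the pair's Fellegi-Sunter log likelihood ratio.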
+        Given the following setup:
+
+        > indexer = Index()
+        > indexer.full()
+        > record_pairs = indexer.index(A, B)
+        > comparator = Compare(compare.String("A_string", "B_string"))
+        > features = comparator.compute(record_pairs, A, B)
+        > ecm = ECMClassifier()
+        > candidates = ecm.fit_predict(features)
+        > one_to_one = OneToOneLinking(method="max_weighted")
+
+        Weights can be added with the following syntax:
+
+        > candidates_weights = one_to_one.add_weights(candidates, features=features, method="weights")
+        > candidates_log_weights = one_to_one.add_weights(candidates, classifier=ecm, method="log_weights")
+
+        """
+
+        # get the subset of features that corresponds with the multiindex links
+        difference = features.index.difference(links)
+        features = features.drop(index=difference)
+
+        if method == "weights" or method == "log_weights":
+            initial_columns = features.columns
+
+            if method == "weights":
+                weight = features.sum(axis=1)
+
+            elif method == "log_weights":
+                # calculate the total log weight for each row
+                weight = pd.Series(0, index=features.index)
+                for column, weights in classifier.log_weights.items():
+                    weight += features[column].apply(lambda x: weights[x])
+
+                # offset negative values
+                min_weight = weight.min()
+                if min_weight < 0:
+                    weight = weight - min_weight
 
-        # calculate the total weight and remove all other columns
-        initial_columns = data.columns
-        data["weight"] = data.sum(axis=1)
+            # add the weight and remove all other columns
+            features = features.assign(weight=weight)
+            links = features.drop(columns=initial_columns).squeeze()
 
-        # slicing on a multiindex is equivalent to merging on two columns
-        data = data.drop(columns=initial_columns).reset_index()
-        links = links.to_frame(index=False).rename(columns={0: "level_0", 1: "level_1"})
-        links = links.merge(data, how="left", on=["level_0", "level_1"]).set_index(["level_0", "level_1"])
-        links.index.names = [None, None]
+        elif method == "probabilities":
+
+            links = classifier.prob(features)
 
         return links
 
@@ -66,25 +123,25 @@ def _compute_greedy(self, links):
 
         return pd.MultiIndex.from_tuples(result)
 
-    def _compute_max_weighted(self, links, data):
+    def _compute_max_weighted(self, links):
         """Compute a one to one linking by maximizing the total similarity weight."""
 
-        links = self._add_similarity_weights(links, data)
         graph = self._to_weighted_bipartite_graph(links)
-
         max_weighted_graph = self._max_weighted_graph(graph)
-        max_weighted_dataframe = self._to_max_weighted_dataframe(max_weighted_graph)
+        max_weighted_series = self._to_max_weighted_series(max_weighted_graph)
 
-        return max_weighted_dataframe
+        return max_weighted_series
 
     def _max_weighted_graph(self, graph):
-        """Calculate the maximally weighted bipartite graph."""
+        """Calculate the maximally weighted bipartite graph with the Blossom algorithm by Edmonds."""
 
         # max weight matching
         max_weighted_edges = nx.algorithms.matching.max_weight_matching(graph)
 
         # restore order after matching
-        max_weighted_edges = self._order_max_weighted_bipartite_graph(graph, max_weighted_edges)
+        max_weighted_edges = self._order_max_weighted_bipartite_graph(
+            graph, max_weighted_edges
+        )
 
         # create maximally weighted graph
         weights = [graph[u][v]["weight"] for u, v in max_weighted_edges]
@@ -101,26 +158,36 @@ def _max_weighted_graph(self, graph):
 
         return max_weighted_graph
 
-    def _compute(self, links, data):
-        if not is_pandas_2d_multiindex(links):
+    def _compute(self, links):
+
+        if self.method == "greedy":
             if not is_pandas_multiindex(links):
                 raise TypeError("expected pandas.MultiIndex")
-            elif not is_pandas_2d_multiindex(links):
+            if not is_pandas_2d_multiindex(links):
                 raise ValueError(
                     "pandas.MultiIndex has incorrect number of "
-                    "levels (expected 2 levels)")
-        if (data is not None) and (not is_pandas_like(data)):
-            raise TypeError("expected pandas.DataFrame")
+                    "levels (expected 2 levels)"
+                )
 
-        if self.method == "greedy":
             return self._compute_greedy(links)
+
         elif self.method == "max_weighted":
-            return self._compute_max_weighted(links, data)
+            if not is_pandas_like(links):
+                raise TypeError(
+                    "expected pandas.Series with a MultiIndex and weights as values"
+                )
+            if not is_pandas_2d_multiindex(links.index):
+                raise ValueError(
+                    "pandas.MultiIndex has incorrect number of "
+                    "levels (expected 2 levels)"
+                )
+            return self._compute_max_weighted(links)
+
         else:
             raise ValueError("unknown matching method {}".format(self.method))
 
     def _order_max_weighted_bipartite_graph(self, graph, max_weighted_edges):
-        """Swaps the order of edges that are swapped after max. weight matching."""
+        """Swaps the order of edges that are swapped after max weight matching."""
 
         edges_left = list(set(edge[0] for edge in graph.edges))
 
@@ -139,39 +206,45 @@ def _order_max_weighted_bipartite_graph(self, graph, max_weighted_edges):
         return ordered_max_weighted_edges
 
     def _to_weighted_bipartite_graph(self, links):
-        """Convert a pandas DataFrame with MultiIndex and single column weight to a bipartite graph with weighted edges."""
+        """Convert a Series with MultiIndex and weights to a bipartite graph with weighted edges."""
 
+        # do not mutate the passed Series
+        tmp = links.copy()
         # add labels to both multiindex levels to ensure no overlap of nodes in the graph
-        links = links.set_index(self._add_node_labels_to_multiindex(links.index))
-        links = links.reset_index()
+        tmp.index = self._add_node_labels_to_multiindex(tmp.index)
 
         # create the graph
         graph = nx.Graph()
 
-        graph.add_nodes_from(links["level_0"], bipartite=0)
-        graph.add_nodes_from(links["level_1"], bipartite=1)
+        # use get_level_values rather than .levels: .levels only holds the
+        # unique values of each level, so zipping levels with the weights
+        # misaligns the edges as soon as an index value occurs in several pairs
+        left = tmp.index.get_level_values(0)
+        right = tmp.index.get_level_values(1)
+        values = tmp.values
 
-        graph.add_weighted_edges_from(
-            list(zip(links["level_0"], links["level_1"], links["weight"]))
-        )
+        graph.add_nodes_from(left, bipartite=0)
+        graph.add_nodes_from(right, bipartite=1)
+
+        graph.add_weighted_edges_from(list(zip(left, right, values)))
 
         return graph
 
-    def _to_max_weighted_dataframe(self, graph):
-        """Convert a (max weighted) bipartite graph to a DataFrame."""
+    def _to_max_weighted_series(self, graph):
+        """Convert a (max weighted) bipartite graph to a Series."""
 
-        max_weighted_dataframe = nx.to_pandas_edgelist(graph)
+        max_weighted_series = nx.to_pandas_edgelist(graph)
 
         # ensure output format is the same as the format of the initial candidate links
-        max_weighted_dataframe = max_weighted_dataframe.set_index(["source", "target"])
-        max_weighted_dataframe.index.names = [None, None]
-        max_weighted_dataframe = max_weighted_dataframe.set_index(
-            self._remove_node_labels_from_multiindex(max_weighted_dataframe.index)
-        ).sort_index(level=0)
+        max_weighted_series = max_weighted_series.set_index(
+            ["source", "target"]
+        ).squeeze()
+        max_weighted_series.index.names = [None, None]
+        max_weighted_series.index = self._remove_node_labels_from_multiindex(
+            max_weighted_series.index
+        )
 
-        return max_weighted_dataframe
+        return max_weighted_series
 
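+    # Example of the node labelling used by the helpers below: with the
+    # default labels, the candidate pair (3, 3) becomes the edge
+    # ("left_3", "right_3"), so identical index values from the two
+    # datasets can never collapse into a single graph node.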
     def _add_node_labels_to_multiindex(self, multiindex, labels=["left_", "right_"]):
+        """Add labels to a MultiIndex. This is done to distinguish the left and right datasets during the max weighted matching algorithm."""
 
         for i, (level, dataset) in enumerate(zip(multiindex.levels, labels)):
             stringified_level = [dataset + str(value) for value in level]
@@ -179,7 +252,9 @@ def _add_node_labels_to_multiindex(self, multiindex, labels=["left_", "right_"])
 
         return multiindex
 
-    def _remove_node_labels_from_multiindex(self, multiindex, labels=["left_", "right_"]):
+    def _remove_node_labels_from_multiindex(
+        self, multiindex, labels=["left_", "right_"]
+    ):
 
         for i, (level, label) in enumerate(zip(multiindex.levels, labels)):
             destringified_level = [int(value.replace(label, "")) for value in level]
@@ -187,24 +262,22 @@ def _remove_node_labels_from_multiindex(self, multiindex, labels=["left_", "righ
 
         return multiindex
 
-    def compute(self, links, data=None):
+    def compute(self, links):
         """Compute the one-to-one linking.
 
         Parameters
         ----------
-        links : pandas.MultiIndex
-            The pairs to apply linking to.
-        data : pandas.DataFrame
-            The similarity weights computed for the entire dataset.
+        links : pandas.MultiIndex or pandas.Series
+            The pairs to apply linking to. Should be a pandas.MultiIndex for the 'greedy' method and a pandas.Series for the 'max_weighted' method.
 
         Returns
        -------
-        pandas.MultiIndex
-            A one-to-one matched MultiIndex of record pairs.
+        pandas.MultiIndex or pandas.Series
+            A one-to-one matched MultiIndex of record pairs for the 'greedy' method, or a pandas.Series with the one-to-one matched record pairs and their matching weight for the 'max_weighted' method.
 
         """
 
-        return self._compute(links, data)
+        return self._compute(links)
 
 
 class OneToManyLinking(OneToOneLinking):
@@ -247,7 +320,7 @@ class OneToManyLinking(OneToOneLinking):
 
     """
 
-    def __init__(self, level=0, method='greedy'):
+    def __init__(self, level=0, method="greedy"):
         super(OneToManyLinking, self).__init__(method=method)
 
         self.level = level
@@ -257,7 +330,7 @@ def _compute_greedy(self, links):
         source_dupl_bool = self._bool_duplicated(links, self.level)
         return links[~source_dupl_bool]
 
-    def compute(self, links, data=None):
+    def compute(self, links):
         """Compute the one-to-many matching.
Parameters @@ -272,7 +345,7 @@ def compute(self, links, data=None): """ - return self._compute(links, data) + return self._compute(links) class ConnectedComponents(object): @@ -306,11 +379,19 @@ def compute(self, links): """ - G = nx.Graph() - G.add_edges_from(links.values) - connected_components = nx.connected_component_subgraphs(G) + try: + import networkx as nx + except ImportError(): + raise Exception("'networkx' module is needed for this operation") + + graph_pairs = nx.Graph() + graph_pairs.add_edges_from(links.values) + connected_pairs = ( + graph_pairs.subgraph(c).copy() for c in nx.connected_components(graph_pairs) + ) - links_result = [pd.MultiIndex.from_tuples(subgraph.edges()) - for subgraph in connected_components] + links_result = [ + pd.MultiIndex.from_tuples(subgraph.edges()) for subgraph in connected_pairs + ] return links_result From 0f57dfd08a1c04f805084873218469c4ee758a83 Mon Sep 17 00:00:00 2001 From: Johannes Weytjens Date: Wed, 27 Nov 2019 12:48:06 +0100 Subject: [PATCH 4/5] replace old ConnectedComponents with newer version --- recordlinkage/network.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/recordlinkage/network.py b/recordlinkage/network.py index 79db769c..0671e477 100755 --- a/recordlinkage/network.py +++ b/recordlinkage/network.py @@ -379,19 +379,13 @@ def compute(self, links): """ - try: - import networkx as nx - except ImportError(): - raise Exception("'networkx' module is needed for this operation") - - graph_pairs = nx.Graph() - graph_pairs.add_edges_from(links.values) - connected_pairs = ( - graph_pairs.subgraph(c).copy() for c in nx.connected_components(graph_pairs) - ) + G = nx.Graph() + G.add_edges_from(links.values) + connected_components = nx.connected_component_subgraphs(G) links_result = [ - pd.MultiIndex.from_tuples(subgraph.edges()) for subgraph in connected_pairs + pd.MultiIndex.from_tuples(subgraph.edges()) + for subgraph in connected_components ] return links_result From 470985fc1105828cddd56c799776bc0538a079ee Mon Sep 17 00:00:00 2001 From: Johannes Weytjens Date: Wed, 27 Nov 2019 17:12:43 +0100 Subject: [PATCH 5/5] actually change setup and tox --- setup.py | 1 + tox.ini | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 setup.py mode change 100644 => 100755 tox.ini diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 45879d9c..5a80e9f7 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ def read(fname): install_requires=[ "six>=1.10.0", "jellyfish>=0.5.4", + "networkx>=2.0", "numpy>=1.13.0", "pandas>=0.18.0", "scipy>=0.17.1", diff --git a/tox.ini b/tox.ini old mode 100644 new mode 100755 index b9dafbdd..86481adb --- a/tox.ini +++ b/tox.ini @@ -14,7 +14,6 @@ deps= pandaslatest: pandas commands= pip install -e . - pip install networkx pytest --cov-config .coveragerc --cov=recordlinkage --cov-append --cov-report=xml [travis:env]
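
A minimal end-to-end sketch of the 'max_weighted' flow these patches add
(illustrative only: `df_a` and `df_b` are assumed DataFrames with a comparable
"name" column, and the indexing and classifier choices are arbitrary):

    import recordlinkage as rl
    from recordlinkage.network import OneToOneLinking

    # build candidate record pairs (full cross product of df_a and df_b)
    indexer = rl.Index()
    indexer.full()
    pairs = indexer.index(df_a, df_b)

    # comparison vectors: 1 if the names are similar enough, else 0
    comparator = rl.Compare()
    comparator.string("name", "name", threshold=0.85)
    features = comparator.compute(pairs, df_a, df_b)

    # classify candidate matches without training data
    ecm = rl.ECMClassifier()
    candidates = ecm.fit_predict(features)

    # attach weights to the candidates, then solve the assignment problem
    one_to_one = OneToOneLinking(method="max_weighted")
    weighted = one_to_one.add_weights(candidates, features=features, method="weights")
    matches = one_to_one.compute(weighted)  # pandas.Series: (left, right) -> weight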