Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Max weighted matching #107

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
146 changes: 133 additions & 13 deletions recordlinkage/network.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pandas as pd
import networkx as nx
jpweytjens marked this conversation as resolved.
Show resolved Hide resolved

from recordlinkage.types import is_pandas_2d_multiindex
from recordlinkage.types import is_pandas_multiindex
from recordlinkage.types import is_pandas_like


class OneToOneLinking(object):
Expand All @@ -16,8 +18,7 @@ class OneToOneLinking(object):
Parameters
----------
method : str
The method to solve the problem. Only 'greedy' is supported at
the moment.
The method to solve the problem. The options are 'greedy' and 'max_weighted'.
jpweytjens marked this conversation as resolved.
Show resolved Hide resolved

Note
----
Expand All @@ -31,6 +32,21 @@ def __init__(self, method='greedy'):

self.method = method

def _add_similarity_weights(self, links, data):
"""Add the similarity weights to the MultiIndex with candidate links."""

# calculate the total weight and remove all other columns
initial_columns = data.columns
data["weight"] = data.sum(axis=1)

# slicing on a multiindex is equivalent to merging on two columns
data = data.drop(columns=initial_columns).reset_index()
links = links.to_frame(index=False).rename(columns={0: "level_0", 1: "level_1"})
links = links.merge(data, how="left", on=["level_0", "level_1"]).set_index(["level_0", "level_1"])
links.index.names = [None, None]

return links

@classmethod
def _bool_duplicated(cls, links, level):

Expand All @@ -50,27 +66,136 @@ def _compute_greedy(self, links):

return pd.MultiIndex.from_tuples(result)

def _compute(self, links):
def _compute_max_weighted(self, links, data):
"""Compute a one to one linking by maximizing the total similarity weight."""

links = self._add_similarity_weights(links, data)
graph = self._to_weighted_bipartite_graph(links)

max_weighted_graph = self._max_weighted_graph(graph)
max_weighted_dataframe = self._to_max_weighted_dataframe(max_weighted_graph)

return max_weighted_dataframe

def _max_weighted_graph(self, graph):
"""Calculate the maximally weighted bipartite graph."""

# max weight matching
max_weighted_edges = nx.algorithms.matching.max_weight_matching(graph)

# restore order after matching
max_weighted_edges = self._order_max_weighted_bipartite_graph(graph, max_weighted_edges)

# create maximally weighted graph
weights = [graph[u][v]["weight"] for u, v in max_weighted_edges]
max_weighted_left = [edge[0] for edge in max_weighted_edges]
max_weighted_right = [edge[1] for edge in max_weighted_edges]

max_weighted_graph = nx.Graph()

max_weighted_graph.add_nodes_from(max_weighted_left, bipartite=0)
max_weighted_graph.add_nodes_from(max_weighted_right, bipartite=1)
max_weighted_graph.add_weighted_edges_from(
list(zip(max_weighted_left, max_weighted_right, weights))
)

return max_weighted_graph

def _compute(self, links, data):
if not is_pandas_2d_multiindex(links):
if not is_pandas_multiindex(links):
raise TypeError("expected pandas.MultiIndex")
elif not is_pandas_2d_multiindex(links):
raise ValueError(
"pandas.MultiIndex has incorrect number of "
"levels (expected 2 levels)")
if (data is not None) and (not is_pandas_like(data)):
raise TypeError("expected pandas.DataFrame")

if self.method == 'greedy':
if self.method == "greedy":
return self._compute_greedy(links)
elif self.method == "max_weighted":
return self._compute_max_weighted(links, data)
else:
raise ValueError("unknown matching method {}".format(self.method))

def compute(self, links):
def _order_max_weighted_bipartite_graph(self, graph, max_weighted_edges):
"""Swaps the order of edges that are swapped after max. weight matching."""

edges_left = list(set(edge[0] for edge in graph.edges))

max_weighted_left = [edge[0] for edge in max_weighted_edges]
max_weighted_right = [edge[1] for edge in max_weighted_edges]

for i, value in enumerate(max_weighted_left):
if value not in edges_left:
max_weighted_left[i], max_weighted_right[i] = (
max_weighted_right[i],
max_weighted_left[i],
)

ordered_max_weighted_edges = list(zip(max_weighted_left, max_weighted_right))

return ordered_max_weighted_edges

def _to_weighted_bipartite_graph(self, links):
"""Convert a pandas DataFrame with MultiIndex and single column weight to a bipartite graph with weighted edges."""

# add labels to both multiindex levels to ensure no overlap of nodes in the graph
links = links.set_index(self._add_node_labels_to_multiindex(links.index))
links = links.reset_index()

# create the graph
graph = nx.Graph()

graph.add_nodes_from(links["level_0"], bipartite=0)
graph.add_nodes_from(links["level_1"], bipartite=1)

graph.add_weighted_edges_from(
list(zip(links["level_0"], links["level_1"], links["weight"]))
)

return graph

def _to_max_weighted_dataframe(self, graph):
"""Convert a (max weighted) bipartite graph to a DataFrame."""

max_weighted_dataframe = nx.to_pandas_edgelist(graph)

# ensure output format is the same as the format of the initial candidate links
max_weighted_dataframe = max_weighted_dataframe.set_index(["source", "target"])
max_weighted_dataframe.index.names = [None, None]
max_weighted_dataframe = max_weighted_dataframe.set_index(
self._remove_node_labels_from_multiindex(max_weighted_dataframe.index)
).sort_index(level=0)

return max_weighted_dataframe

def _add_node_labels_to_multiindex(self, multiindex, labels=["left_", "right_"]):

for i, (level, dataset) in enumerate(zip(multiindex.levels, labels)):
stringified_level = [dataset + str(value) for value in level]
multiindex = multiindex.set_levels(stringified_level, i)

return multiindex

def _remove_node_labels_from_multiindex(self, multiindex, labels=["left_", "right_"]):

for i, (level, label) in enumerate(zip(multiindex.levels, labels)):
destringified_level = [int(value.replace(label, "")) for value in level]
multiindex = multiindex.set_levels(destringified_level, i)

return multiindex

def compute(self, links, data=None):
jpweytjens marked this conversation as resolved.
Show resolved Hide resolved
"""Compute the one-to-one linking.

Parameters
----------
links : pandas.MultiIndex
The pairs to apply linking to.
data : pandas.DataFrame
jpweytjens marked this conversation as resolved.
Show resolved Hide resolved
The similary weights computed for the entire dataset.

Returns
-------
Expand All @@ -79,7 +204,7 @@ def compute(self, links):

"""

return self._compute(links)
return self._compute(links, data)


class OneToManyLinking(OneToOneLinking):
Expand Down Expand Up @@ -132,7 +257,7 @@ def _compute_greedy(self, links):
source_dupl_bool = self._bool_duplicated(links, self.level)
return links[~source_dupl_bool]

def compute(self, links):
def compute(self, links, data=None):
jpweytjens marked this conversation as resolved.
Show resolved Hide resolved
"""Compute the one-to-many matching.

Parameters
Expand All @@ -147,7 +272,7 @@ def compute(self, links):

"""

return self._compute(links)
return self._compute(links, data)


class ConnectedComponents(object):
Expand Down Expand Up @@ -181,11 +306,6 @@ def compute(self, links):

"""

try:
import networkx as nx
except ImportError():
raise Exception("'networkx' module is needed for this operation")

G = nx.Graph()
G.add_edges_from(links.values)
connected_components = nx.connected_component_subgraphs(G)
Expand Down
18 changes: 17 additions & 1 deletion tests/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
ConnectedComponents)


def test_one_to_one_linking():
def test_one_to_one_linking_greedy():

sample = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3), (3, 4), (3, 5),
(4, 4), (5, 5), (6, 5), (7, 7), (7, 7),
Expand All @@ -23,6 +23,22 @@ def test_one_to_one_linking():
ptm.assert_index_equal(sample_one_to_many, expected)


def test_one_to_one_linking_max_weighted():

sample_index = pd.MultiIndex.from_tuples([(1, 1), (2, 1), (2, 2), (2, 3), (3, 3)])
sample_data = {"c1": [0, 1, 0, 1, 0], "c2": [1, 1, 1, 1, 1], "c3": [1, 1, 1, 1, 1]}
sample = pd.DataFrame(data=sample_data, index=sample_index)

one_to_one = OneToOneLinking(method="max_weighted")
sample_one_to_one = one_to_one.compute(sample_index, sample)

expected_index = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)])
expected_data = {"weight": [2, 2, 2]}
expected = pd.DataFrame(data=expected_data, index=expected_index)

ptm.assert_frame_equal(sample_one_to_one, expected)


def test_one_to_many_linking():

sample = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3), (3, 4), (3, 5),
Expand Down