In [1]:
# https://dbs.uni-leipzig.de/research/projects/benchmark-datasets-for-entity-resolution

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
import re
import networkx as nx
from tqdm import tqdm
from typing import List
from itertools import product

warnings.filterwarnings(action="ignore")

In [2]:
class TextSim:
    def __init__(self, blacklist_tokens: List[str] = None, whitelist_tokens: List[str] = None, thresh: float = None):
        self.blacklist_tokens = blacklist_tokens
        self.whitelist_tokens = whitelist_tokens
        self.thresh = thresh
    
    def jaccard_token_similarity(self, string1: str , string2: str) -> int:
        if self.whitelist_tokens is not None:
            tok1 = [t for t in string1.split(" ") if t in self.whitelist_tokens]
            tok2 = [t for t in string2.split(" ") if t in self.whitelist_tokens]
        elif self.blacklist_tokens is not None:
            tok1 = [t for t in string1.split(" ") if t not in self.blacklist_tokens]
            tok2 = [t for t in string2.split(" ") if t not in self.blacklist_tokens]
        else:
            tok1 = string1.split(" ")
            tok2 = string2.split(" ")

        tok1 = list(set(tok1))
        tok2 = list(set(tok2))
        common = list(set(tok1).intersection(set(tok2)))

        if (len(tok1) == 0) and (len(tok2) == 0):
            return 0
        sim = len(common) / (len(tok1) + len(tok2))
        if self.thresh is not None:
            if sim < self.thresh:
                return 0. 
        return sim


class SimNet:
    def __init__(self, nodes: List[str], sim_func):
        # build graph; node similarities = edge weights
        g = nx.Graph()
        sim_mat = np.zeros((len(nodes), len(nodes)))
        for i in tqdm(range(len(nodes)), desc="edge calculation"):
            for j in range(i + 1, len(nodes)):
                sim = sim_func(nodes[i], nodes[j])
                node_i = nodes[i]
                node_j = nodes[j]
                sim_mat[i][j] = sim
                sim_mat[j][i] = sim
                if sim > 0:
                    g.add_edge(node_i, node_j, weight=sim)
                    g.add_edge(node_j, node_i, weight=sim)
                else:
                    g.add_node(node_i)
                    g.add_node(node_j)
        self.g = g
        self.cliques = list(nx.find_cliques(g))
        self.sim_mat = sim_mat

    def sim_cn_jaccard(self, node_i: str, node_j: str, weighted: bool) -> float:
        # individual neighbors
        neighbors_i = set(self.g.neighbors(node_i))
        neighbors_j = set(self.g.neighbors(node_j))
        union = neighbors_i.union(neighbors_j)
        intersection = neighbors_i.intersection(neighbors_j)
        if weighted:
            weights_total_i = sum([self.g.get_edge_data(node_i, n)["weight"] for n in neighbors_i])
            weights_total_j = sum([self.g.get_edge_data(node_j, n)["weight"] for n in neighbors_j])
            weights_total = weights_total_i + weights_total_j
        else:
            weights_total = len(union)
        # common neighbors
        if weighted:
            weights_common = 0
            for neighbor in intersection:
                # Sum of edge weights from node_i and node_j to the common neighbor
                weight_i = self.g[node_i][neighbor]['weight']
                weight_j = self.g[node_j][neighbor]['weight']
                weights_common += weight_i + weight_j
        else:
            weights_common = len(intersection)
        if weights_total == 0:
            return 0
        return weights_common / weights_total
    
    def sim_clique_jaccard(self, node_i: str, node_j: str) -> float:
        cliques_i = set([i for i, c in enumerate(self.cliques) if node_i in c])
        cliques_j = set([i for i, c in enumerate(self.cliques) if node_j in c])
        union = cliques_i.union(cliques_j)
        intersection = cliques_i.intersection(cliques_j)
        if len(union) == 0:
            return 0
        return len(intersection) / len(union)
    
    def sim_preferential_attachment(self, node_i: str, node_j: str) -> float:
        return self.g.degree(node_i) * self.g.degree(node_j)
    
    def shortest_path(self, node_i: str, node_j: str) -> int:
        try:
            shortest_path = nx.shortest_path(self.g, source=node_i, target=node_j)
            return len(shortest_path)
        except nx.NetworkXNoPath:
            return None
        
    def sim(self, node_i: str, node_j: str) -> float:
        try:
            return self.g[node_i][node_j]['weight']
        except: 
            return 0
        

class SimNetClassifer:
    """
    pd DataFrame with node_name, label, block_id
    """
    def __init__(self, sim_func = None):
        if sim_func == None:
            sim_func = TextSim(thresh=0.1, blacklist_tokens=["", " "]).jaccard_token_similarity
        self.sim_func = sim_func
        self.cls = None

    def fit(self, df_train_eval: pd.DataFrame):
        sim_net = SimNet(nodes=df_train_eval["node_name"], sim_func=self.sim_func)
        data_train_val = self._build_data(sim_net=sim_net, df=df_train_eval)
        x = data_train_val[["sim_clique", "sim_cn", "pa", "shortest_path", "sim"]]
        y = data_train_val["label"]
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
        cls = RandomForestClassifier().fit(X=x_train, y=y_train.values.astype(float))
        pred = cls.predict(x_val)
        print("val acc", accuracy_score(y_pred=pred, y_true=y_val.values.astype(float)))
        print("val precision", precision_score(y_pred=pred, y_true=y_val.values.astype(float)))
        print("val recall", recall_score(y_pred=pred, y_true=y_val.values.astype(float)))
        self.cls = cls
    
    def infer(self, df_train_val: pd.DataFrame, df_infer: pd.DataFrame):
        df_all = pd.concat([df_train_val, df_infer], ignore_index=True)
        sim_net = SimNet(nodes=df_all["node_name"], sim_func=self.sim_func)
        data_infer = self._build_data(sim_net=sim_net, df_from=df_infer, df_to=df_all, with_label=False)
        x = data_infer[["sim_clique", "sim_cn", "pa", "shortest_path", "sim"]]
        data_infer["pred"] = self.cls.predict(x)
        return data_infer

        

    def _build_data(self, sim_net, df: pd.DataFrame = None, df_from: pd.DataFrame = None, df_to: pd.DataFrame = None, with_label: bool = True):
        if df is not None:
            df_from = df
            df_to = df
        data = []
        gb_from = df_from.groupby("block_id")
        gb_to = df_to.groupby("block_id")
        for block_id in tqdm(range(len(df_from["block_id"].unique())), desc="building dataset"):
            block = df_from["block_id"].unique()[block_id]
            nodes_from = gb_from.get_group(block)["node_name"].values
            nodes_to = gb_to.get_group(block)["node_name"].values
            combinations = list(product(nodes_from, nodes_to))
            combinations = [c for c in combinations if c[0] != c[1]]
            combinations_temp = []
            for c in combinations:
                c = tuple(sorted(c))
                if c not in combinations_temp:
                    combinations_temp.append(c)
            combinations = combinations_temp
            if with_label:
                labels_from = gb_from.get_group(block)["label"].values
                labels_to = gb_from.get_group(block)["label"].values
                labels = [l for l in labels_from] + [l for l in labels_to]
                nodes = [n for n in nodes_from] + [n for n in nodes_to]
                labels_map = {}
                for i, node in enumerate(nodes):
                    if node not in labels_map.keys():
                        labels_map[node] = labels[i]
                combinations_labels = [1 if labels_map[c[0]] == labels_map[c[1]] else 0 for c in combinations]
            for i, c in enumerate(combinations):
                data_inner = []
                data_inner.append(sim_net.sim_clique_jaccard(c[0], c[1]))
                data_inner.append(sim_net.sim_cn_jaccard(c[0], c[1], weighted=False))
                data_inner.append(sim_net.sim_preferential_attachment(c[0], c[1]))
                data_inner.append(sim_net.shortest_path(c[0], c[1]))
                data_inner.append(sim_net.sim(c[0], c[1]))
                data_inner.append((c[0], c[1]))
                if with_label:
                    data_inner.append(combinations_labels[i])
                data.append(data_inner)
        # fillna & scale features
        cols = ["sim_clique", "sim_cn", "pa", "shortest_path", "sim", "nodes"]
        if with_label:
            cols.append("label")
        df_data = pd.DataFrame(data, columns=cols)
        df_data["shortest_path"].fillna(df_data["shortest_path"].max() + 1, inplace=True)
        scaler = MinMaxScaler()
        for col in ["pa", "shortest_path"]:
            df_data[col] = scaler.fit_transform(df_data[col].values.reshape(-1, 1))
        return df_data

In [3]:
"""
import requests
from io import BytesIO  

df = pd.read_csv(BytesIO(res.content))
df.to_csv("musicbrainz.csv", index=False)
url = "https://raw.githubusercontent.com/tomonori-masui/entity-resolution/main/data/musicbrainz_200k.csv"
res = requests.get(url)
"""


def clean_year(year):
    year = str(year)
    if year in ["n.a.", "unk."]:
        return "unknown"
    for j in range(50, 100):
        if (year == f"'{j}") or (year == f"{j}"):
            return f"19{j}"
    for i in range(25):
        if i < 10:
            if year == str(i):
                return f"200{i}"
            i = f"0{i}"
        if (year == f"'{i}") or (year == f"{i}") or (year == f"{i}"):
            return f"20{i}"
    return year


# load and clean data, split into data available at training time and data to infer labels on
df = pd.read_csv("musicbrainz.csv")
df = df[df["language"].isin(["German", "ger.", "Ger.", "german", "GERMAN"])].drop_duplicates("title").sort_values("CID")
df = df[df["year"].notna()]
df["year"] = df["year"].apply(clean_year)
df["year"].value_counts().head(50)
df = df[df["title"].notna()]
df["title"] = df["title"].apply(lambda x: x.replace("-", " "))
df["title"] = df["title"].apply(lambda x: x.replace("'", " "))
df["title"] = df["title"].apply(lambda x: x.replace('"', " "))
df["title"] = df["title"].apply(lambda x: re.sub(pattern="\d\d\d", repl="", string=x))
df = df.groupby("year").filter(lambda x: len(x) > 1)
df = df[["CID", "title", "year"]]
df["CID"] = LabelEncoder().fit_transform(df["CID"])
df.columns = ["label", "node_name", "block_id"]
df.index = range(len(df))
df_train_eval, df_inference = train_test_split(df, test_size=0.2)
df_train_eval.index = range(len(df_train_eval))
df_inference.index = range(len(df_inference))

In [4]:
smc = SimNetClassifer()
smc.fit(df_train_eval=df_train_eval)
preds = smc.infer(df_train_val=df_train_eval, df_infer=df_inference)

edge calculation: 100%|██████████| 2564/2564 [00:33<00:00, 76.20it/s] 
building dataset: 100%|██████████| 55/55 [01:42<00:00,  1.87s/it]


val acc 0.9988122660524043
val precision 0.8518518518518519
val recall 0.8984375


edge calculation: 100%|██████████| 3205/3205 [00:56<00:00, 56.83it/s] 
building dataset: 100%|██████████| 48/48 [01:49<00:00,  2.29s/it]


In [5]:
preds[preds["pred"] == 1]["nodes"].tolist()

[('Eduard Mörike   So ging es hin und her …',
  'So ging es hin und her …   Mozart auf der Reise nach Prag'),
 ('Cantata, BWV 9  Es ist das Heil uns kommen her : III. Aria (Tenore)  Wir waren schon zu tief gesunken    Bach Edition, IV: Cantatas II',
  'Wachet auf, ruft uns die Stimme, BWV : VI. Aria (Duetto: Soprano, Basso): Mein Freund ist mein!   Bach Edition, IV: Cantatas II'),
 ('Chapter 1 4: Der andere Minister   Harry Potter und der Halbblutprinz (feat. narrator: Rufus Beck)',
  'J.K. Rowling   Chapter 1 4: Der andere Minister'),
 ('Abschnitt 6', 'Schatzsuche 6'),
 ('Schatzsuche 6', 'WAS IST WAS   Schatzsuche 6'),
 ('Hell   Sprung aus den Wolken (David Duriez Briquerouge mix)',
  'Sprung aus den Wolken (David Duriez Briquerouge mix)   Remixed'),
 ('Loikaemie   Wir sind die Skins', 'Wir sind die Skins'),
 ('Mit der letzten Straßenbahn',
  'Ulrich Tukur & Die Rhythmus Boys   Mit der letzten Straßenbahn'),
 ('L’ÂME IMMORTELLE   Der letzte Akt', 'letzte Akt   Auf Deinen Schwingen'),


In [7]:
# blocking by "contains same name"
"""
with open("names_german.txt", "r", encoding="utf8") as f:
    names = f.read().split("\n")
names_temp = []
for name in names:
    split = name.split(" ")
    if len(split) == 2:
        names_temp.extend(split)
names = list(set(names_temp))

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(vocabulary=names, use_idf=False, lowercase=False).fit(df["title"])
tfidf = pd.DataFrame(vec.transform(df["title"]).todense() > 0, columns=vec.get_feature_names_out())
tfidf = tfidf[tfidf.columns[tfidf.sum() > 1]]

names_contained = []
for i, row in tfidf.iterrows():
    names_contained.append([c for c in tfidf.columns[row.values]])
names_contained
"""

'\nwith open("names_german.txt", "r", encoding="utf8") as f:\n    names = f.read().split("\n")\nnames_temp = []\nfor name in names:\n    split = name.split(" ")\n    if len(split) == 2:\n        names_temp.extend(split)\nnames = list(set(names_temp))\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nvec = TfidfVectorizer(vocabulary=names, use_idf=False, lowercase=False).fit(df["title"])\ntfidf = pd.DataFrame(vec.transform(df["title"]).todense() > 0, columns=vec.get_feature_names_out())\ntfidf = tfidf[tfidf.columns[tfidf.sum() > 1]]\n\nnames_contained = []\nfor i, row in tfidf.iterrows():\n    names_contained.append([c for c in tfidf.columns[row.values]])\nnames_contained\n'