In [156]:
import pandas as pd
import re
import time
import numpy as np
from itertools import combinations
import os
from scripts.LSH import (shingle,
                         build_vocab,
                         one_hot,
                         get_minhash_arr,
                         get_signature,
                         compute_signature_similarity
                         )
from scripts.LSH import LSH

# Load the two bibliographic sources and the ground-truth DBLP/ACM mapping.
acm = pd.read_csv("datasets/ACM.csv")
dblp = pd.read_csv("datasets/DBLP2.csv", encoding="latin1") # reading breaks without the latin1 encoding
perfect_mapping = pd.read_csv("datasets/DBLP-ACM_perfectMapping.csv")

# Pipeline parameters.
# k: passed as the second argument to shingle() (presumably the shingle length -- confirm in scripts.LSH).
k = 5
# NOTE(review): b is not referenced anywhere else in the visible notebook; the LSH
# object is constructed with a literal 2 instead -- confirm which value is intended.
b = 10
# num_hashes: passed as the first argument to get_minhash_arr().
num_hashes = 100


In [157]:
# preprocessing
def preprocess_record(row):
    """Flatten one record into a single normalized text string.

    Args:
        row: A pandas Series (one DataFrame row); only its ``.values`` are read.

    Returns:
        str: The string forms of all non-null values joined by single spaces,
        lower-cased, stripped, and with every run of whitespace (including
        newlines and tabs) collapsed to one space.
    """
    # Null cells are skipped so they do not show up as "nan"/"None" tokens.
    fields = [str(value) for value in row.values if pd.notnull(value)]
    normalized = " ".join(fields).lower().strip()
    return re.sub(r"\s+", " ", normalized)

# Build one normalized text per record from every column except the id.
# "joined" is dropped as well (errors="ignore" makes that a no-op on the first
# run) so that re-running this cell does not fold the previously computed
# joined text back into the new one. A missing "id" column still raises KeyError.
dblp["joined"] = (
    dblp.drop(columns="id")
    .drop(columns="joined", errors="ignore")
    .apply(preprocess_record, axis=1)
)
acm["joined"] = (
    acm.drop(columns="id")
    .drop(columns="joined", errors="ignore")
    .apply(preprocess_record, axis=1)
)

# Single document list: all DBLP records first, then all ACM records, so any
# position >= len(dblp) refers to an ACM row.
strings = dblp["joined"].tolist()
strings.extend(acm["joined"].tolist())


In [158]:
# Shingling: turn each document into a shingle set (shingle() is called with k),
# build the vocabulary over all sets, then encode every set against that vocabulary.
shingle_sets = [shingle(document, k) for document in strings]
vocab = build_vocab(shingle_sets)
vectors = [one_hot(doc_shingles, vocab) for doc_shingles in shingle_sets]

In [159]:
# MinHash: build the hash array once from num_hashes and the vocabulary, then
# derive one signature per encoded document (same order as `vectors`).
minhash_arr = get_minhash_arr(num_hashes, vocab)
signatures = [get_signature(minhash_arr, vector) for vector in vectors]

In [160]:
# Sanity check: signature similarity of the first two entries of `signatures`.
doc0_signature, doc1_signature = signatures[0], signatures[1]
sim = compute_signature_similarity(doc0_signature, doc1_signature)
print("Signature similarity between doc0 and doc1:", sim)

Signature similarity between doc0 and doc1: 0.04


In [161]:
# Feed every signature into the LSH object and ask it for candidate pairs.
# NOTE(review): the constructor receives a literal 2 while the config constant
# `b = 10` is never used -- confirm which value is intended.
lsh = LSH(2)
# Signatures are added in list order; the following cell treats the returned
# pair members as positions in this DBLP-then-ACM ordering.
for signature in signatures:
    lsh.add_hash(signature)
candidate_pairs = lsh.check_candidates()

# Fail fast: every later cell assumes there is at least one candidate pair.
if not candidate_pairs:
    raise ValueError('No candidate pairs')

In [162]:
# Candidate pairs are treated as positions in the combined DBLP-then-ACM list,
# so a pair may link two records of the same source.
# Keep only the pairs that link one DBLP record to one ACM record.

n_dblp = len(dblp)
n_acm = len(acm)

cross_pairs = []
for i, j in candidate_pairs:
    lower, upper = min(i, j), max(i, j)
    # A cross-source pair straddles the boundary: the smaller position lies in
    # the DBLP range (< n_dblp) and the larger one in the ACM range (>= n_dblp).
    if lower < n_dblp <= upper:
        # Stored as (DBLP position, ACM position relative to the ACM frame).
        cross_pairs.append((lower, upper - n_dblp))

print(f'How Many cross pairs do we have between DBLP and ACM? {len(cross_pairs)}')

How Many cross pairs do we have between DBLP and ACM? 11


In [None]:
# Positional lookup tables: row position -> record id for each source.
dblp_ids = dblp["id"].tolist()
acm_ids = acm["id"].tolist()

# Translate each (DBLP position, ACM position) pair into an (DBLP id, ACM id) pair.
mapped_candidates = [
    (dblp_ids[db_idx], acm_ids[ac_idx])
    for db_idx, ac_idx in cross_pairs
]


In [164]:
# Score every DBLP-ACM candidate by signature similarity and order best-first.
# Each entry is (DBLP position, ACM position, similarity).
scored = sorted(
    (
        (
            db_idx,
            ac_idx,
            # ACM signatures sit after the DBLP ones, hence the n_dblp offset.
            compute_signature_similarity(
                signatures[db_idx],
                signatures[n_dblp + ac_idx],
            ),
        )
        for db_idx, ac_idx in cross_pairs
    ),
    key=lambda entry: entry[2],
    reverse=True,
)

# Keep at most the 2224 highest-scoring pairs.
topk = scored[:2224]


In [166]:
# Ground truth as a set of (DBLP id, ACM id) tuples for O(1) membership checks.
truth = {tuple(pair) for pair in perfect_mapping[["idDBLP", "idACM"]].values}

retrieved = len(topk)
# Count the retrieved pairs whose ids appear in the ground-truth mapping.
correct = sum(
    1
    for db_idx, ac_idx, _ in topk
    if (dblp_ids[db_idx], acm_ids[ac_idx]) in truth
)

# Guard against division by zero when nothing was retrieved.
precision = correct / retrieved if retrieved else 0
print(f"Precision: {precision:.4f} ({correct}/{retrieved})")


Precision: 1.0000 (11/11)
