In [35]:
import pandas as pd
import re
import Levenshtein
import jellyfish
from Bio import pairwise2
import time

# Load datasets
acm = pd.read_csv("ACM.csv")
dblp = pd.read_csv("DBLP2.csv", encoding="latin1") # breaks without this
perfect_mapping = pd.read_csv("DBLP-ACM_perfectMapping.csv")

In [41]:
def preprocess(text):
    # (b) lowercase
    text = text.lower()
    # (c) convert multiple spaces to one
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

for df in [acm, dblp]:
    for col in ['title', 'authors', 'venue', 'year']:
        df[col] = df[col].astype(str).apply(preprocess)

# (d): Levenshtein similarity
def title_similarity(s1, s2):
    if not s1 or not s2:
        return 0
    med = Levenshtein.distance(s1, s2)
    return 1 - med / max(len(s1), len(s2))

# (e): Jaro similarity 
def authors_similarity(a1, a2):
    if not a1 or not a2:
        return 0
    return jellyfish.jaro_similarity(a1, a2)

# Part (f): Modified affine similiratity, from the lab answers

def aff_sim(s1, s2, open_gap=1, gap_ext=0.1):
    # Handle empty strings
    if not s1 or not s2:
        return 0.0

    # Global alignment with affine gap penalties
    score = pairwise2.align.globalms(
        s1, s2,
        1,   # match score
        0,   # mismatch score
        -open_gap,   # gap open penalty
        -gap_ext,    # gap extension penalty
        score_only=True
    )

    # Maximum possible score = min(len(s1), len(s2)) * match_score
    max_score = min(len(s1), len(s2)) * 1.0

    return score / max_score if max_score > 0 else 0.0

# (g): Year match
def year_match(y1, y2):
    return 1 if y1 == y2 else 0

# (h): Weighted similarity
# Weights
w_t, w_a, w_c, w_y = 0.4, 0.3, 0.2, 0.1

def record_similarity(rec1, rec2):
    s_t = title_similarity(rec1["title"], rec2["title"])
    s_a = authors_similarity(rec1["authors"], rec2["authors"])
    s_c = aff_sim(rec1["venue"], rec2["venue"])
    s_y = year_match(rec1["year"], rec2["year"])
    return w_t * s_t + w_a * s_a + w_c * s_c + w_y * s_y


In [47]:
start = time.time()
# (i): Duplicates
duplicate_pairs = []

for i, rec1 in acm.iterrows():
    for j, rec2 in dblp.iterrows():
        sim = record_similarity(rec1, rec2)
        if sim > 0.7:
            duplicate_pairs.append((rec1["id"], rec2["id"], sim))


end = time.time()
print(end-start)
duplicates_df = pd.DataFrame(duplicate_pairs, columns=["acm_id", "dblp_id", "score"])

KeyboardInterrupt: 

In [57]:
perfect_set = set(zip(perfect_mapping["idACM"], perfect_mapping["idDBLP"]))
reported_set = set(zip(duplicates_df["acm_id"], duplicates_df["dblp_id"]))

correct_matches = reported_set & perfect_set
precision = len(correct_matches) / len(perfect_mapping)

print(f"Precision: {precision:.4f}")
print(f"Total reported duplicates: {len(reported_set)}")
print(f"Correct duplicates found: {len(correct_matches)}")
print(f"Total correct pairs that we tried to find: {len(perfect_mapping)}")

Precision: 0.7612
Total reported duplicates: 1992
Correct duplicates found: 1693
Total correct pairs that we tried to find 2224


In [53]:
perfect_mapping

Unnamed: 0,idDBLP,idACM
0,conf/sigmod/SlivinskasJS01,375678
1,conf/sigmod/ChaudhuriDN01,375694
2,conf/sigmod/RinfretOO01,375669
3,conf/sigmod/BreunigKKS01,375672
4,conf/sigmod/JagadishJOT01,375687
...,...,...
2219,journals/sigmod/Scholl01,604275
2220,journals/sigmod/Rosneblatt94,190649
2221,journals/sigmod/Winslett02b,601871
2222,journals/sigmod/Labrinidis01,604283
