In [11]:
import numpy as np
import pandas as pd
import Levenshtein as lev
from py_stringmatching import similarity_measure as sm

In [12]:
# Part 1
# a. Ignore the pub_id.
acm = pd.read_csv("ACM.csv")
dblp = pd.read_csv("DBLP2.csv", encoding="latin1")
acm1 = pd.read_csv("ACM.csv")
dblp1 = pd.read_csv("DBLP2.csv", encoding="latin1")
# acm1 = acm.drop(columns=['id'], errors='ignore')
# dblp1 = dblp.drop(columns=['id'], errors = 'ignore')

In [13]:
# b. Change all alphabetical characters into lowercase.
acm1['title'] = acm1['title'].str.lower()
dblp1['title'] = dblp1['title'].str.lower()
print(acm1.head(10)) #to double-check correctness of action

       id                                              title  \
0  304586  the wasa2 object-oriented workflow management ...   
1  304587  a user-centered interface for querying distrib...   
2  304589  world wide database-integrating the web, corba...   
3  304590           xml-based information mediation with mix   
4  304582  the ccube constraint object-oriented database ...   
5  304583  the cornell jaguar project: adding mobility to...   
6  304584  the active multisync controller of the cubetre...   
7  304585                  the jungle database search engine   
8  306112  adept: an agent-based approach to business pro...   
9  306115  a componentized architecture for dynamic elect...   

                                             authors  \
0                    Gottfried Vossen, Mathias Weske   
1                  Isabel F. Cruz, Kimberly M. James   
2  Athman Bouguettaya, Boualem Benatallah, Lily H...   
3  Chaitan Baru, Amarnath Gupta, Bertram Lud&#228...   
4  Alexander Br

In [14]:
# c. Convert multiple spaces to one.
acm1['title'] = acm1['title'].str.replace(r'\s+', ' ', regex=True)
dblp1['title'] = dblp1['title'].str.replace(r'\s+', ' ', regex=True)

In [15]:
# d. Use Levenshtein similarity
def levenshtein_similarity(s1, s2):
    return 1 - lev.distance(s1, s2) / max(len(s1), len(s2))
st = levenshtein_similarity(acm1['title'], dblp1['title'])
st

0.013379204892966401

In [16]:
# e. Use Jaro similarity
def jaro_similarity(s1, s2):
    return lev.jaro_winkler(s1, s2)
sa = jaro_similarity(acm1['authors'], dblp1['authors'])
sa

0.3177864135767755

In [17]:
#. f NOT SURE ABOUT IT: Use a modified version of the affine similarity that is scaled to the interval [0, 1]
def scaled_affine_similarity(s1, s2, open_gap = 1, gap_ext = 0.1):
    affine_similarity = sm.affine.Affine(gap_start = 1, gap_continuation = 0.1,
                       sim_func = lambda s1, s2: (int(1 if s1 == s2 else 0)))
    return affine_similarity.get_raw_score(s1, s2) / min(len(s1), len(s2))
sv = scaled_affine_similarity(acm1.iloc[0]['venue'], dblp1.iloc[0]['venue'])
sv

-0.23076910238999587

In [18]:
#. g match/mismatch
def year_match(year1, year2):
    if year1 == year2:
        return 1
    else:
        return 0
sy = year_match(acm1.iloc[0]['year'], dblp1.iloc[0]['year'])
sy

1

In [19]:
# h. formula to combine scores to get final score
# h. formula to combine scores to get final score
def compare_records(rec1, rec2):
  w1 = 0.5
  w2 = 0.2
  w3 = 0.2
  w4 = 0.1

  st = levenshtein_similarity(rec1.title, rec2.title)
  sa = jaro_similarity(rec1.authors, rec2.authors)
  sc = scaled_affine_similarity(rec1.venue, rec2.venue)
  sy = year_match(rec1.year, rec2.year)

  rec_sim = w1 * sa + w2 * st + w3 * sc + w4 * sy
  return rec_sim

In [20]:
# i. Report the records with rec_sim > 0.7 as duplicate records by storing the ids of both records in a list.
def record_sim(rec1, rec2,sa, st, sv, sy, threshold=0.7):
    w1, w2, w3, w4 = 0.5, 0.2, 0.2, 0.1

    rec_sim = w1 * sa + w2 * st + w3 * sv + w4 * sy
    print("The similarity between the two records =", rec_sim)

    # Check if similarity exceeds threshold
    if rec_sim > threshold:
        return True
    else:
        return False

# Example call (compare first record in each dataset)
record_sim(acm1.iloc[0], dblp1.iloc[0],sa, st, sv, sy, 0.7)


The similarity between the two records = 0.21541522728898185


False

In [21]:
# i. Report the records with rec_sim > 0.7 as duplicate records by storing the ids of both records in a list.
def find_duplicate_ids(df1, df2, threshold = 0.7):
  duplicate_ids = pd.DataFrame(columns = ['idDBLP', 'idACM'])
  for i, rec1 in df1.iterrows():
    if(i % 100 == 0):
      print(i)
    for j, rec2 in df2.iterrows():
      if compare_records(rec1, rec2) > threshold:
        duplicate_ids.loc[len(duplicate_ids)] = [rec1.id, rec2.id]
  return duplicate_ids


In [None]:
import time

start_time = time.time()

duplicate_ids = find_duplicate_ids(dblp, acm)

matches = pd.read_csv("DBLP-ACM_perfectMapping.csv")
matches.idDBLP = matches.idDBLP.str.lower()

intersection = pd.merge(duplicate_ids, matches, on=['idDBLP', 'idACM'])
precision = len(intersection) / len(duplicate_ids)
end_time = time.time()

elapsed_time = end_time - start_time
print('elapsed time', round(elapsed_time), "s")

duplicate_ids
precision

0
100
200
300
400
