In [7]:
import numpy as np
import pandas as pd

In [8]:
df_ACM = pd.read_csv("ACM.csv")
df_DB = pd.read_csv("DBLP2.csv", encoding="latin1") 

In [9]:
# 1
# Concatenate all columns per row into one string
df_DB["merged"] = df_DB.astype(str).agg(" ".join, axis=1)
df_ACM["merged"] = df_ACM.astype(str).agg(" ".join, axis =1)

In [10]:
# 2
df_DB["merged"] = df_DB["merged"].str.lower()
df_ACM["merged"] = df_ACM["merged"].str.lower()

In [11]:
#3
df_DB["merged"] = df_DB["merged"].str.replace(r"\s+", " ", regex=True).str.strip()
df_ACM["merged"] = df_ACM["merged"].str.replace(r"\s+", " ", regex=True).str.strip()


In [None]:
#4
list_ACM = df_ACM["merged"]
list_DB  = df_DB["merged"]

big_list = list_ACM + list_DB

2616 records combined.
0    304586 the wasa2 object-oriented workflow mana...
1    304587 a user-centered interface for querying ...
2    304589 world wide database-integrating the web...
3    304590 xml-based information mediation with mi...
4    304582 the ccube constraint object-oriented da...
Name: merged, dtype: object


In [None]:
#5 Shingle used from the lap
def shingle(text, k: int) -> set:
    """Return set of k-shingles, robust to None/NaN/non-strings/short strings."""
    # Treat None/NaN as empty
    if text is None:
        return set()
    if isinstance(text, float):
        if text != text:  # NaN
            return set()
        text = str(text)
    elif not isinstance(text, str):
        text = str(text)

    if len(text) < k:
        return set()
    return { text[i:i+k] for i in range(len(text) - k + 1) }

def build_vocab(shingle_sets: list) -> dict:
    full_set = {sh for s in shingle_sets for sh in s}
    return {sh: i for i, sh in enumerate(full_set)}

def one_hot(shingles: set, vocab: dict):
    vec = np.zeros(len(vocab), dtype=int)
    for sh in shingles:
        vec[vocab[sh]] = 1
    return vec

def get_minhash_arr(num_hashes:int, vocab:dict):
    length = len(vocab)
    arr = np.zeros((num_hashes, length), dtype=int)
    for i in range(num_hashes):
        arr[i, :] = np.random.permutation(length) + 1
    return arr

def get_signature(minhash: np.ndarray, vector: np.ndarray):
    idx = np.nonzero(vector)[0]
    if idx.size == 0:
        # No shingles; return a signature that won't match others
        return np.full(minhash.shape[0], np.iinfo(np.int32).max, dtype=int)
    return np.min(minhash[:, idx], axis=1)

def jaccard_similarity(set1: set, set2: set) -> float:
    inter = len(set1 & set2)
    union = len(set1 | set2)
    return inter / union if union else 0.0

def compute_signature_similarity(sig1: np.ndarray, sig2: np.ndarray) -> float:
    if sig1.shape != sig2.shape:
        raise ValueError("Signature shapes must match.")
    return float(np.mean(sig1 == sig2))

# Shingling
k = 3
shingle_sets = [shingle(doc, k) for doc in big_list]

# Vocab & one-hot
vocab = build_vocab(shingle_sets)
if len(vocab) == 0:
    raise ValueError("Vocabulary is empty. Check that big_list has strings of length >= k.")

onehot = np.stack([one_hot(sset, vocab) for sset in shingle_sets])

# MinHash signatures
num_hashes = 100
minhash_arr = get_minhash_arr(num_hashes, vocab)
signatures = np.stack([get_signature(minhash_arr, vec) for vec in onehot])

#  similarities


Doc 0 vs Doc 1
  Jaccard (3-shingles): 0.1763
  MinHash estimate    : 0.2100

Pairwise similarities (first N docs):
    doc_a  doc_b   jaccard  minhash
0       0      1  0.176334     0.15
1       0      2  0.181818     0.18
2       0      3  0.169388     0.16
3       0      4  0.220588     0.22
4       0      5  0.171657     0.23
5       0      6  0.237164     0.28
6       0      7  0.197436     0.22
7       0      8  0.143868     0.15
8       0      9  0.170792     0.20
9       1      2  0.218029     0.17
10      1      3  0.220126     0.19
11      1      4  0.204276     0.22
12      1      5  0.148362     0.13
13      1      6  0.181193     0.14
14      1      7  0.192982     0.16
15      1      8  0.100223     0.11
16      1      9  0.105505     0.09
17      2      3  0.191529     0.16
18      2      4  0.209302     0.21
19      2      5  0.159649     0.14
