In [1]:
import numpy as np
import pandas as pd

In [2]:
df_ACM = pd.read_csv("ACM.csv")
df_DB = pd.read_csv("DBLP2.csv", encoding="latin1") 

In [3]:
# 1
# Concatenate all columns per row into one string
df_DB["merged"] = df_DB.astype(str).agg(" ".join, axis=1)
df_ACM["merged"] = df_ACM.astype(str).agg(" ".join, axis =1)

In [4]:
# 2
df_DB["merged"] = df_DB["merged"].str.lower()
df_ACM["merged"] = df_ACM["merged"].str.lower()

In [5]:
#3
df_DB["merged"] = df_DB["merged"].str.replace(r"\s+", " ", regex=True).str.strip()
df_ACM["merged"] = df_ACM["merged"].str.replace(r"\s+", " ", regex=True).str.strip()


In [6]:
#4
list_ACM = df_ACM["merged"]
list_DB  = df_DB["merged"]

big_list = list_ACM + list_DB

In [7]:
#5 Shingle used from the lap
def shingle(text, k: int) -> set:
    """Return set of k-shingles, robust to None/NaN/non-strings/short strings."""
    # Treat None/NaN as empty
    if text is None:
        return set()
    if isinstance(text, float):
        if text != text:  # NaN
            return set()
        text = str(text)
    elif not isinstance(text, str):
        text = str(text)

    if len(text) < k:
        return set()
    return { text[i:i+k] for i in range(len(text) - k + 1) }

def build_vocab(shingle_sets: list) -> dict:
    full_set = {sh for s in shingle_sets for sh in s}
    return {sh: i for i, sh in enumerate(full_set)}

def one_hot(shingles: set, vocab: dict):
    vec = np.zeros(len(vocab), dtype=int)
    for sh in shingles:
        vec[vocab[sh]] = 1
    return vec

def get_minhash_arr(num_hashes:int, vocab:dict):
    length = len(vocab)
    arr = np.zeros((num_hashes, length), dtype=int)
    for i in range(num_hashes):
        arr[i, :] = np.random.permutation(length) + 1
    return arr

def get_signature(minhash: np.ndarray, vector: np.ndarray):
    idx = np.nonzero(vector)[0]
    if idx.size == 0:
        # No shingles; return a signature that won't match others
        return np.full(minhash.shape[0], np.iinfo(np.int32).max, dtype=int)
    return np.min(minhash[:, idx], axis=1)

def jaccard_similarity(set1: set, set2: set) -> float:
    inter = len(set1 & set2)
    union = len(set1 | set2)
    return inter / union if union else 0.0

def compute_signature_similarity(sig1: np.ndarray, sig2: np.ndarray) -> float:
    if sig1.shape != sig2.shape:
        raise ValueError("Signature shapes must match.")
    return float(np.mean(sig1 == sig2))

# Shingling
k = 3
shingle_sets = [shingle(doc, k) for doc in big_list]

# Vocab & one-hot
vocab = build_vocab(shingle_sets)
if len(vocab) == 0:
    raise ValueError("Vocabulary is empty. Check that big_list has strings of length >= k.")

onehot = np.stack([one_hot(sset, vocab) for sset in shingle_sets])

# MinHash signatures
num_hashes = 100
minhash_arr = get_minhash_arr(num_hashes, vocab)
signatures = np.stack([get_signature(minhash_arr, vec) for vec in onehot])

#  similarities

N = len(big_list)
jac_mat = np.eye(N, dtype=float)  # exact Jaccard similarity matrix
mh_mat  = np.eye(N, dtype=float)  # MinHash similarity matrix

for i in range(N):
    for j in range(i + 1, N):
        # Exact Jaccard on shingles
        s_jac = jaccard_similarity(shingle_sets[i], shingle_sets[j])
        jac_mat[i, j] = jac_mat[j, i] = s_jac

        # MinHash-based similarity (fraction of equal signature components)
        s_mh = compute_signature_similarity(signatures[i], signatures[j])
        mh_mat[i, j] = mh_mat[j, i] = s_mh


In [8]:
# print shingles
print("\n=== Example shingles (first 3 documents) ===")
for i, s in enumerate(shingle_sets[:3]):
    print(f"Doc {i}: {sorted(list(s))[:10]} ... ({len(s)} total shingles)")

# Print MinHash signatures
print("\n=== Example MinHash signatures (first 3 documents) ===")
for i, sig in enumerate(signatures[:3]):
    print(f"Doc {i} signature: {sig[:10]} ... ({len(sig)} total hash values)")

# Print Jaccard & MinHash similarities
print("\n=== Jaccard similarity matrix (rounded) ===")
print(np.round(jac_mat, 3))

print("\n=== MinHash similarity matrix (rounded) ===")
print(np.round(mh_mat, 3))



=== Example shingles (first 3 documents) ===
Doc 0: [' 19', ' an', ' ap', ' co', ' d.', ' da', ' de', ' en', ' fo', ' gl'] ... (249 total shingles)
Doc 1: [' 19', ' a ', ' an', ' ap', ' ba', ' co', ' cr', ' da', ' di', ' e.'] ... (258 total shingles)
Doc 2: [' 19', ' 20', ' ag', ' an', ' at', ' be', ' bo', ' co', ' da', ' fo'] ... (323 total shingles)

=== Example MinHash signatures (first 3 documents) ===
Doc 0 signature: [ 13  70  37  14 125  45  36  12 174   2] ... (100 total hash values)
Doc 1 signature: [132  32   7  52  25  14  36  12  16  14] ... (100 total hash values)
Doc 2 signature: [54 32  2 63 25 14 36 12 16 56] ... (100 total hash values)

=== Jaccard similarity matrix (rounded) ===
[[1.    0.176 0.182 ... 0.    0.    0.   ]
 [0.176 1.    0.218 ... 0.    0.    0.   ]
 [0.182 0.218 1.    ... 0.    0.    0.   ]
 ...
 [0.    0.    0.    ... 1.    0.    0.   ]
 [0.    0.    0.    ... 0.    1.    0.   ]
 [0.    0.    0.    ... 0.    0.    1.   ]]

=== MinHash similarity matri