In [1]:
import math
import numpy as np
from collections import Counter

# --- Sample Documents ---
doc1 = "sistem informasi manajemen"
doc2 = "sistem basis data"
doc3 = "manajemen sistem informasi"  # Similar to doc1 (same words, different order)
doc4 = "pengantar basis data dan informasi"  # Shares terms with doc2 and others

documents = {
    "Doc1": doc1,
    "Doc2": doc2,
    "Doc3": doc3,
    "Doc4": doc4,
}

# --- Preprocessing ---

# 1. Tokenize each document exactly once (lowercase + whitespace split).
#    The token lists keep duplicates for term counting; the derived sets are
#    what the Jaccard/Dice functions consume.
token_lists = {name: doc.lower().split() for name, doc in documents.items()}
tokens = {name: set(words) for name, words in token_lists.items()}
# Example: tokens['Doc1'] = {'sistem', 'informasi', 'manajemen'}

# 2. Build Vocabulary and TF Vectors for Cosine/Manhattan
all_tokens = set().union(*tokens.values())
vocabulary = sorted(all_tokens)
vocab_dict = {word: i for i, word in enumerate(vocabulary)}
# Example: vocabulary = ['basis', 'dan', 'data', 'informasi', 'manajemen', 'pengantar', 'sistem']

tf_vectors = {}
for name, words in token_lists.items():
    vec = np.zeros(len(vocabulary))
    # Every word is guaranteed to be in vocab_dict because the vocabulary was
    # built from these same token lists.
    for word, count in Counter(words).items():
        vec[vocab_dict[word]] = count
    tf_vectors[name] = vec
# Example: tf_vectors['Doc1'] -> array([0., 0., 0., 1., 1., 0., 1.]) based on example vocab

# 3. Normalize TF vectors for Bhattacharyya (Probability Distributions)
prob_vectors = {}
for name, vec in tf_vectors.items():
    total_sum = vec.sum()
    if total_sum == 0:
        # Empty document: avoid division by zero and keep an all-zero vector.
        # Copy so the probability vector never shares memory with the TF vector.
        prob_vectors[name] = vec.copy()
    else:
        # No epsilon smoothing is applied; zero-count terms keep probability 0.
        prob_vectors[name] = vec / total_sum
# Example: prob_vectors['Doc1'] -> array([0.        , 0.        , 0.        , 0.33333333, 0.33333333, 0.        , 0.33333333])

# --- Similarity / Distance Functions ---

def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The ratio of shared elements to all distinct elements, or 1.0 when
        both sets are empty.
    """
    shared_count = len(set1.intersection(set2))
    combined_count = len(set1.union(set2))
    # An empty union means both inputs are empty; treat them as identical.
    return shared_count / combined_count if combined_count else 1.0

def dice_coefficient(set1, set2):
    """Return the Dice coefficient of two sets: 2 * |intersection| / (|A| + |B|).

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        Twice the number of shared elements divided by the sum of both set
        sizes, or 1.0 when both sets are empty.
    """
    shared = set1.intersection(set2)
    size_total = len(set1) + len(set2)
    if not size_total:
        # Both sets are empty; treat them as identical.
        return 1.0
    return (2 * len(shared)) / size_total

def manhattan_distance(vec1, vec2):
    """Return the Manhattan (L1) distance between two numerical vectors.

    Args:
        vec1: First vector (expected to support element-wise subtraction,
            e.g. a numpy array).
        vec2: Second vector of the same length.

    Returns:
        The sum of absolute element-wise differences.

    Raises:
        ValueError: If the vectors differ in length.
    """
    same_length = len(vec1) == len(vec2)
    if not same_length:
        raise ValueError("Vectors must have the same dimension")
    componentwise_gap = np.abs(vec1 - vec2)
    return componentwise_gap.sum()

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two numerical vectors.

    Args:
        vec1: First vector (e.g. a numpy array of term frequencies).
        vec2: Second vector of the same length.

    Returns:
        The cosine of the angle between the vectors, clamped to [-1, 1].
        Returns 0.0 if either vector has zero norm.

    Raises:
        ValueError: If the vectors differ in length.
    """
    if len(vec1) != len(vec2):
        raise ValueError("Vectors must have the same dimension")
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0  # Similarity with a zero vector is defined as 0
    # Rounding in sqrt/multiply can push the ratio just outside [-1, 1]
    # (e.g. 3 / (sqrt(3) * sqrt(3)) == 1.0000000000000002), which would break
    # callers that feed the result to acos or compare against 1.0.
    return np.clip(dot_product / (norm_vec1 * norm_vec2), -1.0, 1.0)

def bhattacharyya_coefficient(pvec1, pvec2):
    """Return the Bhattacharyya coefficient: the sum of sqrt(p_i * q_i).

    Args:
        pvec1: First probability vector (assumed non-negative; this function
            does not validate or normalize it).
        pvec2: Second probability vector of the same length.

    Returns:
        The sum over all positions of the square root of the element-wise
        product of the two vectors.

    Raises:
        ValueError: If the vectors differ in length.
    """
    same_length = len(pvec1) == len(pvec2)
    if not same_length:
        raise ValueError("Probability vectors must have the same dimension")
    # No epsilon smoothing is applied: positions where either vector is zero
    # contribute exactly zero to the sum.
    overlap_terms = np.sqrt(pvec1 * pvec2)
    return overlap_terms.sum()

def bhattacharyya_distance(pvec1, pvec2):
    """Return the Bhattacharyya distance, -ln(BC), between two distributions.

    Args:
        pvec1: First probability vector.
        pvec2: Second probability vector of the same length.

    Returns:
        0.0 when the coefficient is 1 (or slightly above due to rounding),
        ``inf`` when the coefficient is 0 (no overlap), ``nan`` when the
        coefficient itself is NaN, and ``-log(coefficient)`` otherwise.

    Raises:
        ValueError: Propagated from bhattacharyya_coefficient when the
            vectors differ in length.
    """
    bc = bhattacharyya_coefficient(pvec1, pvec2)
    if math.isnan(bc):
        # max/min would silently turn NaN into 1.0 and report "identical";
        # propagate the undefined result instead.
        return float('nan')
    # Clamp to [0, 1] to absorb values slightly > 1 caused by float rounding.
    bc_clamped = max(0.0, min(1.0, bc))
    if bc_clamped == 0.0:
        return float('inf')  # Distributions have no overlap
    if bc_clamped == 1.0:
        # -math.log(1.0) evaluates to -0.0, which prints as "-0.0000".
        return 0.0
    return -math.log(bc_clamped)


# --- Demonstration ---

def _report_pair(first, second, title):
    """Print every similarity/distance measure for one pair of documents.

    Args:
        first: Key of the first document in tokens/tf_vectors/prob_vectors.
        second: Key of the second document.
        title: Heading line printed before the measurements.
    """
    print(title)
    # Set-based measures
    print(f"Tokens {first}: {tokens[first]}")
    print(f"Tokens {second}: {tokens[second]}")
    print(f"Jaccard Similarity: {jaccard_similarity(tokens[first], tokens[second]):.4f}")
    print(f"Dice Coefficient:   {dice_coefficient(tokens[first], tokens[second]):.4f}")
    print("-" * 10)
    # Term-frequency vector measures (word order does not affect TF vectors)
    print(f"TF Vector {first}: {tf_vectors[first]}")
    print(f"TF Vector {second}: {tf_vectors[second]}")
    print(f"Manhattan Distance: {manhattan_distance(tf_vectors[first], tf_vectors[second]):.4f}")
    print(f"Cosine Similarity:  {cosine_similarity(tf_vectors[first], tf_vectors[second]):.4f}")
    print("-" * 10)
    # Probability-distribution measures
    print(f"Prob Vector {first}: {prob_vectors[first]}")
    print(f"Prob Vector {second}: {prob_vectors[second]}")
    print(f"Bhattacharyya Coeff: {bhattacharyya_coefficient(prob_vectors[first], prob_vectors[second]):.4f}")
    print(f"Bhattacharyya Dist:  {bhattacharyya_distance(prob_vectors[first], prob_vectors[second]):.4f}")


for pair_title, left_name, right_name in (
    ("--- Comparing Doc1 vs Doc2 ---", "Doc1", "Doc2"),
    ("\n--- Comparing Doc1 vs Doc3 (Similar) ---", "Doc1", "Doc3"),
    ("\n--- Comparing Doc2 vs Doc4 ---", "Doc2", "Doc4"),
):
    _report_pair(left_name, right_name, pair_title)

--- Comparing Doc1 vs Doc2 ---
Tokens Doc1: {'informasi', 'manajemen', 'sistem'}
Tokens Doc2: {'basis', 'data', 'sistem'}
Jaccard Similarity: 0.2000
Dice Coefficient:   0.3333
----------
TF Vector Doc1: [0. 0. 0. 1. 1. 0. 1.]
TF Vector Doc2: [1. 0. 1. 0. 0. 0. 1.]
Manhattan Distance: 4.0000
Cosine Similarity:  0.3333
----------
Prob Vector Doc1: [0.         0.         0.         0.33333333 0.33333333 0.
 0.33333333]
Prob Vector Doc2: [0.33333333 0.         0.33333333 0.         0.         0.
 0.33333333]
Bhattacharyya Coeff: 0.3333
Bhattacharyya Dist:  1.0986

--- Comparing Doc1 vs Doc3 (Similar) ---
Tokens Doc1: {'informasi', 'manajemen', 'sistem'}
Tokens Doc3: {'informasi', 'manajemen', 'sistem'}
Jaccard Similarity: 1.0000
Dice Coefficient:   1.0000
----------
TF Vector Doc1: [0. 0. 0. 1. 1. 0. 1.]
TF Vector Doc3: [0. 0. 0. 1. 1. 0. 1.]
Manhattan Distance: 0.0000
Cosine Similarity:  1.0000
----------
Prob Vector Doc1: [0.         0.         0.         0.33333333 0.33333333 0.
 0.3333