In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial.distance import jaccard

In [2]:
# Define documents
documents = ["ant ant bee", "dog bee dog hog dog ant dog", "cat gnu dog eel fox"]

In [3]:
# Create a Count Vectorizer to transform the documents into vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents).toarray()

In [4]:
# Calculate cosine similarity
cos_sim = cosine_similarity(X)

In [5]:
# Calculate Jaccard distance
jaccard_distances = np.zeros((len(documents), len(documents)))
for i in range(len(documents)):
    for j in range(len(documents)):
        jaccard_distances[i, j] = jaccard(X[i], X[j])

In [6]:
# Calculate Euclidean distance
euclidean_dist = euclidean_distances(X)

In [7]:
# Function to find most similar pair based on similarity or distance matrix
def find_most_similar(matrix, metric_name):
    indices = np.unravel_index(np.argmin(matrix + np.eye(len(matrix)) * 1e10), matrix.shape)
    print(f"Most similar documents based on {metric_name}: d{indices[0] + 1} and d{indices[1] + 1}")

In [8]:
# Display cosine similarity
print("Cosine Similarity Matrix:")
print(cos_sim)
find_most_similar(-cos_sim, "cosine similarity")

Cosine Similarity Matrix:
[[1.         0.30779351 0.        ]
 [0.30779351 1.         0.41039134]
 [0.         0.41039134 1.        ]]
Most similar documents based on cosine similarity: d2 and d3


In [9]:
# Display Jaccard distances
print("\nJaccard Distance Matrix:")
print(jaccard_distances)
find_most_similar(jaccard_distances, "Jaccard distance")


Jaccard Distance Matrix:
[[0.   0.75 1.  ]
 [0.75 0.   1.  ]
 [1.   1.   0.  ]]
Most similar documents based on Jaccard distance: d1 and d2


In [10]:
# Display Euclidean distances
print("\nEuclidean Distance Matrix:")
print(euclidean_dist)
find_most_similar(euclidean_dist, "Euclidean distance")


Euclidean Distance Matrix:
[[0.         4.24264069 3.16227766]
 [4.24264069 0.         4.        ]
 [3.16227766 4.         0.        ]]
Most similar documents based on Euclidean distance: d1 and d3
