In [None]:
import re
import math

# Function to tokenize and preprocess text
def preprocess_text(text):
    # Tokenize and convert to lowercase
    words = re.findall(r'\w+', text.lower())
    return words

# Function to calculate the term frequency (TF) for each word in a document
def calculate_tf(text):
    word_count = {}
    words = preprocess_text(text)
    for word in words:
        word_count[word] = word_count.get(word, 0) + 1
    return word_count

# Function to calculate the inverse document frequency (IDF) for a set of documents
def calculate_idf(documents):
    idf = {}
    total_documents = len(documents)
    for document in documents:
        words = set(preprocess_text(document))
        for word in words:
            idf[word] = idf.get(word, 0) + 1

    for word, count in idf.items():
        idf[word] = math.log(total_documents / (count + 1))

    return idf

# Function to calculate the cosine similarity between two documents
def calculate_cosine_similarity(doc1, doc2, idf):
    tf1 = calculate_tf(doc1)
    tf2 = calculate_tf(doc2)

    # Calculate the dot product
    dot_product = 0
    for word in set(tf1.keys()) & set(tf2.keys()):
        dot_product += tf1[word] * tf2[word] * idf[word] ** 2

    # Calculate the magnitude of vectors
    magnitude1 = math.sqrt(sum((tf1[word] * idf[word]) ** 2 for word in tf1))
    magnitude2 = math.sqrt(sum((tf2[word] * idf[word]) ** 2 for word in tf2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0  # Avoid division by zero

    return dot_product / (magnitude1 * magnitude2)

# Example documents
document1 = "This is the first document."
document2 = "Here is another document."
document3 = "A third example document."
document4 = "One more document for good measure."

documents = [document1, document2, document3, document4]

# Calculate IDF values
idf = calculate_idf(documents)

# Calculate and print the cosine similarity between all pairs of documents
similarities = []
for i in range(len(documents)):
    for j in range(i + 1, len(documents)):
        similarity = calculate_cosine_similarity(documents[i], documents[j], idf)
        similarities.append((i, j, similarity))

similarities.sort(key=lambda x: x[2], reverse=True)

for i, j, similarity in similarities:
    print(f"Similarity between Document {i+1} and Document {j+1}: {similarity:.4f}")


Similarity between Document 1 and Document 2: 0.1010
Similarity between Document 2 and Document 3: 0.0390
Similarity between Document 1 and Document 3: 0.0325
Similarity between Document 2 and Document 4: 0.0304
Similarity between Document 3 and Document 4: 0.0260
Similarity between Document 1 and Document 4: 0.0253


In [None]:
import scipy.spatial.distance as dist
dist.chebyshev([1,0,2,3,2,4], [2,1,0,2,1,0])

4