In [2]:
import math

def calculate_idf(N, df):
    return math.log10(N / df)

def calculate_idfa(tf):
    return 1 + math.log10(tf) if tf > 0 else 1

def calculate_tf_idf(tf, idf):
    return tf * idf

def calculate_norm(weights):
    return math.sqrt(sum([w**2 for w in weights]))

print("\033[1mOutput :\033[0m")

# Function to compute cosine similarity
def cosine_similarity(query, document, N, df_query):
    # Step 1: Calculate IDF for each term
    idf_query = {term: calculate_idf(N, df_query.get(term, 1)) for term in query}
    idf_document = {term: calculate_idfa(document.get(term, 0)) for term in document}

    # Step 2: Calculate TF-IDF for the query and document
    tf_idf_query = {term: calculate_tf_idf(query[term], idf_query[term]) for term in query}

    # Step 3: Normalize the TF-IDF values (Euclidean normalization)
    norm_query = calculate_norm(list(tf_idf_query.values()))
    norm_document = calculate_norm(list(idf_document.values()))

    normalized_query = {term: tf_idf_query[term] / norm_query for term in tf_idf_query}
    normalized_document = {term: idf_document[term] / norm_document for term in idf_document}

    similarity = sum([normalized_query.get(term, 0) * normalized_document.get(term, 0) for term in set(query).union(document)])

    return similarity

# New example query and documents
query = {'machine': 1, 'learning': 2, 'models': 1}

df_query = {'machine': 5000, 'learning': 8000, 'models': 12000}

N = 5000000

documents = [
    {'machine': 2, 'learning': 3, 'models': 1},
    {'machine': 1, 'learning': 1, 'models': 2},
    {'machine': 1, 'learning': 0, 'models': 3},
    {'machine': 0, 'learning': 2, 'models': 2},
    {'machine': 3, 'learning': 3, 'models': 0},
]

# Compute cosine similarity for each document with the query
for i, document in enumerate(documents, start=1):
    print(f"\033[1mDocument {i}:\033[0m")
    similarity_score = cosine_similarity(query, document, N, df_query)
    print(f"Cosine Similarity between the query and Document {i}: {similarity_score}")
    print("-" * 50)


[1mOutput :[0m
[1mDocument 1:[0m
Cosine Similarity between the query and Document 1: 0.975278769182648
--------------------------------------------------
[1mDocument 2:[0m
Cosine Similarity between the query and Document 2: 0.9096272263489482
--------------------------------------------------
[1mDocument 3:[0m
Cosine Similarity between the query and Document 3: 0.8876273950817448
--------------------------------------------------
[1mDocument 4:[0m
Cosine Similarity between the query and Document 4: 0.9517877007555453
--------------------------------------------------
[1mDocument 5:[0m
Cosine Similarity between the query and Document 5: 0.9629653749714072
--------------------------------------------------
