In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Rank a handful of sample documents against a free-text query using TF-IDF
# vectors and cosine similarity, then print them from most to least similar.
# NOTE(review): the saved output of this cell lists "Cats are lovely" as
# Document 1, which does not match the list below -- re-run the cell to
# refresh the recorded output.

# Sample documents
documents = [
    "I am happy today",
    "I have two cats",
    "I hate cats",
    "cats are clever than dogs",
]

# Query
query = "How many cats do I have?"

# The query is vectorized together with the documents so that they share one
# vocabulary (its terms therefore also take part in the IDF weighting).
# A separate list is built instead of appending to `documents`: appending in
# place would leave the query sitting inside `documents` once the cell has run.
corpus = documents + [query]

# TF-IDF vectorization: one row per corpus entry, the query being the last row
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Cosine similarity between the query row and every document row
cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

# Document indices ordered from highest to lowest similarity
result_indices = cosine_similarities.argsort()[::-1]

# Display the ranked documents based on similarity scores
print("Query:", query)
print("\nSearch Results:")
for rank, index in enumerate(result_indices, start=1):
    print(f"{rank}. Document {index + 1}: {documents[index]}")
    print(f"   Cosine Similarity: {cosine_similarities[index]:.4f}\n")


Query: How many cats do I have?

Search Results:
1. Document 2: I have two cats
   Cosine Similarity: 0.3253

2. Document 3: I hate cats
   Cosine Similarity: 0.1041

3. Document 1: Cats are lovely
   Cosine Similarity: 0.0841

4. Document 4: cats are clever than dogs
   Cosine Similarity: 0.0586



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Score a query against a small corpus: the TF-IDF model is fitted on the
# corpus only, and the query is then projected with the fitted vectorizer.

# Corpus to search
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Text to look up
query = "This is the second document."

# Step 1: fit the vectorizer on the corpus (one matrix row per document)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Step 2: vectorize the query with the fitted vectorizer and compare it
# against every document row
query_vector = tfidf_vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Step 3: pair each score with its document position and order the pairs by
# score, highest first. The sort is stable, so documents with equal scores
# keep their original relative order.
sorted_similarities = list(enumerate(cosine_similarities))
sorted_similarities.sort(key=lambda pair: pair[1], reverse=True)

# Report each document (1-based number, score, text) followed by a blank line
for index, similarity in sorted_similarities:
    print(f"Document {index + 1}: Similarity = {similarity:.4f}")
    print(f"   {documents[index]}")
    print()


Document 2: Similarity = 0.9505
   This document is the second document.

Document 1: Similarity = 0.6042
   This is the first document.

Document 4: Similarity = 0.6042
   Is this the first document?

Document 3: Similarity = 0.2804
   And this is the third one.

