In [1]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the Reuters corpus is available locally before reading from it
nltk.download('reuters')

# File ids of every article in the corpus; rows of the TF-IDF matrix follow this order
documents = reuters.fileids()

# Vectorizer with default settings; kept at module level so queries can reuse its vocabulary
tfidf_vectorizer = TfidfVectorizer()

# Read the raw text of each article, then learn the vocabulary and weight the corpus in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(list(map(reuters.raw, documents)))

# Function to retrieve top n similar documents based on cosine similarity
def retrieve_similar_documents(query, n=5):
    """Return the ``n`` corpus documents most similar to ``query``.

    The query is projected into the TF-IDF space of the already fitted
    module-level ``tfidf_vectorizer`` and compared with every row of
    ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text query string.
        n: Maximum number of results to return. ``None`` returns the full
            ranking. A negative value returns an empty list.

    Returns:
        A list of ``(score, document_id)`` tuples ordered by descending
        score. Ties on score fall back to comparing the document id, also
        in descending order.
    """
    # A negative n would otherwise reach the slice below and silently drop
    # the *last* |n| entries of the ranking instead of limiting the result.
    if n is not None and n < 0:
        return []

    query_vector = tfidf_vectorizer.transform([query])
    # Only one query row is passed in, so row 0 holds the score of every document.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Pair each score with its document id (same order as the matrix rows)
    # and rank best-first.
    ranked_documents = sorted(zip(similarity_scores, documents), reverse=True)
    return ranked_documents[:n]

# Example usage: rank the corpus against a sample headline
query = "Oil prices surge due to geopolitical tensions"
similar_documents = retrieve_similar_documents(query)

# Report each hit with its rank, id and score, followed by a short preview
for i, hit in enumerate(similar_documents, start=1):
    score, doc_id = hit
    print(f"{i}. Document ID: {doc_id}, Similarity Score: {score:.4f}")
    # First 500 characters of the article, then a blank line between results
    preview = reuters.raw(doc_id)[:500]
    print(preview, end="\n\n")


[nltk_data] Downloading package reuters to /root/nltk_data...


1. Document ID: test/19285, Similarity Score: 0.1692
U.S. OIL PRICES STRONG AHEAD OF OPEC MEETING
  U.S. crude oil prices are at their
  highest level in more than a year ahead of next week's OPEC
  meeting, even though most industry analysts do not expect any
  policy changes from the session.
      They said prices, which have steadily climbed since the
  organization's accord in December, have risen on technical
  factors within the market and concerns about supplies because
  of the Iran-Iraq war, which could disrupt deliveries from the
  Gulf

2. Document ID: training/127, Similarity Score: 0.1676
DIAMOND SHAMROCK (DIA) CUTS CRUDE PRICES
  Diamond Shamrock Corp said that
  effective today it had cut its contract prices for crude oil by
  1.50 dlrs a barrel.
      The reduction brings its posted price for West Texas
  Intermediate to 16.00 dlrs a barrel, the copany said.
      "The price reduction today was made in the light of falling
  oil product prices and a weak crude oil mark