In [1]:
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [None]:

# Raw text of every article in the NLTK Reuters corpus, in fileid order.
documents = list(map(reuters.raw, reuters.fileids()))


In [None]:

# Load the English stop-word list once, when this cell runs, rather than
# re-reading and re-hashing it on every preprocess() call.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Lowercase and tokenize ``text``, keeping alphabetic non-stop-word tokens.

    Args:
        text: Raw string to tokenize.

    Returns:
        list[str]: Tokens of ``word_tokenize(text.lower())`` for which
        ``str.isalpha()`` is true and which are not English stop words,
        in their original order.
    """
    tokens = word_tokenize(text.lower())
    return [
        token
        for token in tokens
        if token.isalpha() and token not in _ENGLISH_STOP_WORDS
    ]

# One filtered token list per corpus document, in the same order as `documents`.
preprocessed_documents = list(map(preprocess, documents))


In [None]:

# Train Word2Vec on the token lists; each document is passed as one "sentence".
word2vec_model = Word2Vec(
    sentences=preprocessed_documents,
    vector_size=100,  # length of each word vector
    window=5,
    min_count=1,  # keep every token, including those that occur only once
    workers=4,
)


In [None]:
def compute_average_embedding(text, model):
    """Return the mean word vector of the in-vocabulary tokens of ``text``.

    Args:
        text: Raw string; it is run through ``preprocess`` before lookup.
        model: Trained model exposing ``wv`` (keyed vectors) and ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all tokens found
        in ``model.wv``. When no token is found, a zero vector of length
        ``model.vector_size`` with the same dtype as the stored vectors.
    """
    keyed_vectors = model.wv
    embeddings = [
        keyed_vectors[word]
        for word in preprocess(text)
        if word in keyed_vectors
    ]

    if not embeddings:
        # Match the dtype of the stored vectors so both branches return the
        # same dtype; a default float64 zero vector would upcast any matrix
        # built by stacking these results.
        return np.zeros(model.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(embeddings, axis=0)


In [None]:

# Free-text query, embedded with the same averaging function used for documents.
query = "wheat prices"
query_embedding = compute_average_embedding(text=query, model=word2vec_model)


In [None]:

# Embed every document with the same averaging function used for the query.
document_embeddings = [
    compute_average_embedding(" ".join(doc), word2vec_model)
    for doc in preprocessed_documents
]

# Score all documents in one vectorized call instead of one sklearn call per
# document. The result has shape (1, n_documents); row 0 is a 1-D array of
# query-vs-document cosine similarities, indexable by document position.
similarities = cosine_similarity([query_embedding], document_embeddings)[0]


In [None]:

# Number of top-ranked documents to report.
N = 5

# argsort is ascending: take the last N positions, then flip to best-first.
top_n_indices = np.flip(np.argsort(similarities)[-N:])


In [None]:

# Header line, then one line per hit in best-first order. "Document ID" is the
# position in `similarities`, not a Reuters fileid.
print(f"Top {N} most relevant documents for query ' {query} ':")

for doc_index in top_n_indices:
    score = similarities[doc_index]
    print(f"Document ID: {doc_index}, Similarity Score: {score:.4f}")
    

Top 5 most relevant documents for query ' wheat prices ':
Document ID: 4976, Similarity Score: 0.8656
Document ID: 814, Similarity Score: 0.8642
Document ID: 4549, Similarity Score: 0.8469
Document ID: 3808, Similarity Score: 0.8456
Document ID: 357, Similarity Score: 0.8444
