## Simple Implementation

In [1]:
import functools

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents
# Toy corpus of seven short English sentences used to demonstrate ranking.
# A document's position in this list is its identity: the results loop looks
# sentences up by index and prints index + 1 as the document number.
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "Bright sun shining over the hills",
    "She exercises every morning",
    "His dog barks loudly",
    "We always eat dinner together",
    "This little black dress isn’t expensive",
]

# Preprocessing and Vectorization
@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded on first use."""
    # Loading the corpus reads from disk, so do it once rather than once per
    # document. A failed load is not cached, so a later call can retry.
    return frozenset(nltk.corpus.stopwords.words('english'))


@functools.lru_cache(maxsize=1)
def _porter_stemmer():
    """Return a single shared PorterStemmer, created on first use."""
    return nltk.stem.PorterStemmer()


def preprocess(text):
    """Normalize *text* into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, drop tokens found in NLTK's English stop-word
    list, and Porter-stem what remains. No lemmatization or punctuation
    filtering is done here.

    Args:
        text: The raw document or query string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).

    Note:
        Requires the NLTK tokenizer models and the ``stopwords`` corpus to
        be installed (e.g. via ``nltk.download``).
    """
    stop_words = _english_stop_words()
    stem = _porter_stemmer().stem
    tokens = nltk.word_tokenize(text.lower())
    # Filter first, then stem: stop-word matching is done on the unstemmed
    # lowercase token, exactly as the stop-word list spells it.
    return ' '.join(stem(token) for token in tokens if token not in stop_words)

# Run every document through the same normalization the query will get.
preprocessed_docs = list(map(preprocess, documents))

# Fit the TF-IDF vocabulary and weights on the corpus and embed it.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs)

# Embed the query with the already-fitted vectorizer so that it shares the
# corpus vocabulary and IDF weights.
query = "Quick brown fox"
preprocessed_query = preprocess(query)
query_vec = vectorizer.transform([preprocessed_query])

# One query row against all documents yields a 1 x n matrix; collapse it to
# a flat vector holding one score per document.
cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).ravel()

# argsort is ascending, so reverse it to list the best match first.
ranked_doc_indices = cosine_similarities.argsort()[::-1]

# Report every document, best first, with 1-based numbering.
for index in ranked_doc_indices:
    print(f"Document {index + 1}: {documents[index]} (Score: {cosine_similarities[index]:.4f})")


Document 1: The quick brown fox jumps over the lazy dog (Score: 0.7839)
Document 7: This little black dress isn’t expensive (Score: 0.0000)
Document 6: We always eat dinner together (Score: 0.0000)
Document 5: His dog barks loudly (Score: 0.0000)
Document 4: She exercises every morning (Score: 0.0000)
Document 3: Bright sun shining over the hills (Score: 0.0000)
Document 2: Never jump over the lazy dog quickly (Score: 0.0000)
