Importing the libraries 

In [4]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

Download NLTK resources

In [5]:
# Fetch every NLTK resource this notebook relies on, not just the stop-word
# list: nltk.word_tokenize needs the Punkt tokenizer data and
# WordNetLemmatizer needs the WordNet corpus. Without them a fresh environment
# raises LookupError the first time preprocess_text runs.
# NOTE(review): 'punkt_tab' is the Punkt data name used by newer NLTK releases
# and 'omw-1.4' is loaded alongside WordNet by some releases -- confirm against
# the installed NLTK version; an unknown id only makes download() return False.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suzan.awinat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Sample documents

In [6]:
# Toy corpus of document titles; a document's position in this list is the
# index used when ranking results (displayed 1-based).
documents = [
    "Introduction to Information Retrieval",   # Document 1
    "Vector Space Models in Practice",         # Document 2
    "Building Search Engines with Python",     # Document 3
    "Applications of Information Retrieval",   # Document 4
    "Understanding Cosine Similarity",         # Document 5
]

Preprocessing function

In [7]:
# Lazily filled cache: the punctuation table, stop-word set and lemmatizer are
# identical for every call, so they are built on first use only instead of once
# per document/query.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared preprocessing resources, creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["punctuation_table"] = str.maketrans("", "", string.punctuation)
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["lemmatizer"] = nltk.WordNetLemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalize a raw string into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop English
    stop words, and lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw document or query string.

    Returns:
        The surviving tokens joined by single spaces ("" when none remain).
    """
    resources = _get_preprocess_resources()
    stop_words = resources["stop_words"]
    lemmatizer = resources["lemmatizer"]

    # Convert to lowercase so stop-word matching is case-insensitive.
    text = text.lower()
    # Punctuation is deleted (not replaced by a space) before tokenizing, so a
    # contraction such as "don't" collapses to the single token "dont".
    text = text.translate(resources["punctuation_table"])
    tokens = nltk.word_tokenize(text)
    # Filter stop words and lemmatize in one pass over the tokens.
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

Apply preprocessing to each document

In [11]:
# Run every document through the same normalization pipeline that is applied
# to the user query, keeping the original document order.
preprocessed_documents = list(map(preprocess_text, documents))
print(preprocessed_documents)

['introduction information retrieval', 'vector space model practice', 'building search engine python', 'application information retrieval', 'understanding cosine similarity']


Create a TF-IDF Vectorizer

In [12]:
# TF-IDF vectorizer with default settings; the same fitted instance must be
# reused for the query so both share one vocabulary.
vectorizer = TfidfVectorizer()
# Fit on the preprocessed corpus and get one TF-IDF row per document.
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

In [13]:
# User query, normalized with the same preprocessing as the documents
user_query = "Introduction to Python programming"
preprocessed_query = preprocess_text(user_query)

# Transform (not fit) the query with the already-fitted vectorizer so it is
# expressed over the corpus vocabulary; the single-item list yields one row.
query_vector = vectorizer.transform([preprocessed_query])

Calculate cosine similarity between the query and documents

In [15]:
# One query row compared against every document row gives a single-row
# similarity matrix; flatten it into a 1-D array of per-document scores.
similarity_matrix = cosine_similarity(query_vector, tfidf_matrix)
cosine_similarities = similarity_matrix.flatten()

Rank documents based on similarity

In [16]:
# Pair each document position with its score, then order best match first.
# The sort is stable, so equally scored documents keep their corpus order.
scored_documents = enumerate(cosine_similarities)
document_ranks = sorted(scored_documents, key=lambda pair: pair[1], reverse=True)

# Show the ranking; "Document N" is the 1-based position in `documents`,
# not the rank.
print("Ranked Documents after Preprocessing:")
for index, similarity in document_ranks:
    print(f"Document {index + 1}: Similarity = {similarity:.4f} - '{documents[index]}'")


Ranked Documents after Preprocessing:
Document 1: Similarity = 0.4661 - 'Introduction to Information Retrieval'
Document 3: Similarity = 0.3536 - 'Building Search Engines with Python'
Document 2: Similarity = 0.0000 - 'Vector Space Models in Practice'
Document 4: Similarity = 0.0000 - 'Applications of Information Retrieval'
Document 5: Similarity = 0.0000 - 'Understanding Cosine Similarity'
