In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load dataset and NLTK

In [5]:
file_path = "data/Reviews.csv"  
df = pd.read_csv(file_path)

df = df[['Text']].dropna()

df = df.head(10000).reset_index(drop=True)


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Murali krishna
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Murali krishna
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Murali krishna
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


PREPROCESSING FUNCTION

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    

    tokens = nltk.word_tokenize(text)
    
    tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(tokens)

In [7]:
df['cleaned_text'] = df['Text'].apply(preprocess_text)

TF-IDF the reviews

In [8]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])

Query processing

In [9]:
def process_query(query):
    return preprocess_text(query)

def retrieve_reviews(query, top_k=5):
    clean_query = process_query(query)
    query_vector = vectorizer.transform([clean_query])
    

    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    

    top_indices = similarities.argsort()[::-1][:top_k]
    
    results = df.iloc[top_indices][['Text', 'cleaned_text']].copy()
    results['similarity'] = similarities[top_indices]
    
    return results

SAMPLE query

In [13]:
queries = [
    "great product with fast shipping",
    "disappointed"
]

for query in queries:
    print(f"\n🔍 QUERY: {query}")
    
    results = retrieve_reviews(query, top_k=2)
    for i, row in results.iterrows():
        print(f"🟢 Similarity: {row['similarity']:.3f}")
        print(f"Original Review:\n{row['Text']}\n")
        print(f"Cleaned Review:\n{row['cleaned_text']}")



🔍 QUERY: great product with fast shipping
🟢 Similarity: 0.502
Original Review:
Enjoyed the product and they also provided very fast shipping.  I'm about out and need to order more.

Cleaned Review:
enjoyed product also provided fast shipping im need order
🟢 Similarity: 0.444
Original Review:
The tea is good and fresh. We enjoy it. The shipping is fast and cost is reasonable.<br />I recommend this product.

Cleaned Review:
tea good fresh enjoy shipping fast cost reasonablebr recommend product

🔍 QUERY: disappointed
🟢 Similarity: 0.485
Original Review:
I am a bit disappointed.  The flavor was not what I wanted or expected.

Cleaned Review:
bit disappointed flavor wanted expected
🟢 Similarity: 0.357
Original Review:
The product is very good. Way too expensive and at almost 5.00 a box to get pancakes or biscuits I was disappointed

Cleaned Review:
product good way expensive almost box get pancakes biscuits disappointed
