In [7]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` below can
# load it.
nltk.download('stopwords')

# Load the dataset and keep at most the first 10,000 non-missing entries of
# the 'review' column. The index is reset so that labels run 0..n-1 and
# coincide with row positions.
df = pd.read_csv('TestReviews.csv')
reviews = df['review'].dropna().head(10000).reset_index(drop=True)

# English stop words held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw text into a space-joined string of content tokens.

    The input is lower-cased, every character other than ``a-z``, ``0-9``
    and whitespace is deleted (not replaced by a space, so ``"15.00"``
    becomes ``"1500"``), the result is split on whitespace, and any token
    present in the module-level ``stop_words`` set is dropped.

    Args:
        text: The string to normalize.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^a-z0-9\s]', '', lowered)
    return " ".join(
        token for token in stripped.split() if token not in stop_words
    )

# Normalize every review; the result keeps the same index as `reviews`.
cleaned_reviews = reviews.apply(preprocess_text)

# Fit a TF-IDF vectorizer (default settings) on the cleaned reviews.
# `fit_transform` is given one document per review, so row i of
# `tfidf_matrix` corresponds to the i-th entry of `cleaned_reviews`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

def retrieve_reviews(
    query,
    tfidf_matrix=tfidf_matrix,
    vectorizer=vectorizer,
    original_reviews=reviews,
    cleaned_reviews=cleaned_reviews,
    top_k=5,
):
    """Return the ``top_k`` reviews most similar to ``query``.

    The query is normalized with ``preprocess_text``, projected into the
    TF-IDF space of ``vectorizer`` and compared against every row of
    ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text search string.
        tfidf_matrix: TF-IDF matrix with one row per review.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
        original_reviews: Raw review texts, positionally aligned with the
            rows of ``tfidf_matrix``.
        cleaned_reviews: Preprocessed review texts, aligned the same way.
        top_k: Maximum number of results to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of dicts ordered by descending similarity, each with the keys
        ``'original_review'``, ``'cleaned_review'`` and
        ``'similarity_score'``. Reviews with a score of 0 may be included
        when fewer than ``top_k`` reviews share a term with the query.
    """
    # Guard: with top_k == 0 the slice `[-0:]` below would select *every*
    # row, and a negative top_k would select nearly all of them.
    if top_k <= 0:
        return []

    query_clean = preprocess_text(query)
    query_vec = vectorizer.transform([query_clean])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_k entries and reverse them so
    # the best match comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]

    # The indices are row *positions*. Use `.iloc` when the container offers
    # it (pandas) so a Series whose labels are not 0..n-1 still resolves
    # correctly; plain sequences and arrays are indexed directly.
    originals = getattr(original_reviews, 'iloc', original_reviews)
    cleaned = getattr(cleaned_reviews, 'iloc', cleaned_reviews)

    return [
        {
            'original_review': originals[idx],
            'cleaned_review': cleaned[idx],
            'similarity_score': similarities[idx],
        }
        for idx in top_indices
    ]

# Sample queries used to exercise the retriever.
queries = ["great product with fast shipping", "disappointed"]

# For each query, print the ranked matches: a 1-based rank, the similarity
# score to three decimals, the raw review text and its preprocessed form.
for q in queries:
    print(f"\nQuery: '{q}'")
    top_reviews = retrieve_reviews(q)  # all other arguments left at their defaults
    for i, review in enumerate(top_reviews):
        print(f"\nTop {i+1} Review (Score: {review['similarity_score']:.3f}):")
        print("Original:", review['original_review'])
        print("Cleaned: ", review['cleaned_review'])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Query: 'great product with fast shipping'

Top 1 Review (Score: 0.338):
Original: Fast friendly service.

Cleaned:  fast friendly service

Top 2 Review (Score: 0.309):
Original: I found an LP I was looking for at a very fair price on their website. After paying $15.00 shipping, it took several weeks to receive since they only put $4.22 postage on it to cover the media mail charge. Plus they shipped it 2 weeks after I paid. So they overcharged me $10 for shipping AND dragged their feet in shipping it. I would avoid using them for any type of mail order. Says a lot about their business ethics, or lack thereof.

Cleaned:  found lp looking fair price website paying 1500 shipping took several weeks receive since put 422 postage cover media mail charge plus shipped 2 weeks paid overcharged 10 shipping dragged feet shipping would avoid using type mail order says lot business ethics lack thereof

Top 3 Review (Score: 0.305):
Original: Very extensive product knowledge & selection.  Tim & Brend