In [3]:
!pip install spacy pandas scikit-learn
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import pandas as pd
# Location of the reviews CSV and the number of reviews kept for this notebook.
DATA_PATH = "Reviews.csv"
N_REVIEWS = 1000

# Build the working frame in one chain: drop rows whose Text is missing, keep
# the first N_REVIEWS remaining rows, and select the columns used downstream.
# The trailing .copy() makes `df` an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=['Text'])
    .head(N_REVIEWS)
    .loc[:, ['Id', 'ProductId', 'UserId', 'ProfileName', 'Score', 'Time', 'Summary', 'Text']]
    .copy()
)


In [7]:
import spacy
# Load the `en_core_web_sm` spaCy pipeline once at module level; `preprocess`
# calls this `nlp` object for every text it cleans.
nlp = spacy.load("en_core_web_sm")
def preprocess(text):
    """Lowercase ``text``, run it through ``nlp`` and return the kept lemmas.

    A token is kept only when it is alphabetic (``token.is_alpha``) and is not
    flagged as a stop word (``token.is_stop``). The lemmas of the kept tokens
    are joined with single spaces into one string.
    """
    lemmas = []
    for token in nlp(text.lower()):
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)
# Store the preprocessed form of every review text in a new column.
df['Cleaned_Text'] = df['Text'].map(preprocess)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary on the cleaned reviews and encode them in one step.
# `vectorizer` is reused later to encode queries; `X` holds the encoded reviews.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Cleaned_Text"])

In [9]:
def preprocess_query(query):
    """Clean ``query`` by delegating to ``preprocess`` and return the result."""
    cleaned = preprocess(query)
    return cleaned
def vectorize_query(query):
    """Encode ``query`` with the already-fitted module-level ``vectorizer``.

    The query is cleaned with ``preprocess_query`` and passed to
    ``vectorizer.transform`` as a one-element list; the transform result is
    returned unchanged.
    """
    documents = [preprocess_query(query)]
    return vectorizer.transform(documents)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
def retrieve_similar_reviews(query, top_k=5):
    """Return the ``top_k`` reviews most similar to ``query``.

    The query is encoded with ``vectorize_query`` and compared against every
    row of the module-level matrix ``X`` using cosine similarity. The best
    matching row positions are then looked up positionally in ``df``
    (``df.iloc``).

    Args:
        query: Query text to search for.
        top_k: Maximum number of reviews to return. Values less than or equal
            to zero yield an empty list.

    Returns:
        A list of dicts ordered from most to least similar. Each dict has the
        keys ``'Score'``, ``'Summary'``, ``'Review'`` (the review's ``Text``)
        and ``'Similarity'`` (cosine similarity rounded to 3 decimals).
    """
    # Guard explicitly: with top_k == 0 the slice below becomes `[-0:]`, which
    # selects *every* row instead of none; negative values slice arbitrarily.
    if top_k <= 0:
        return []

    query_vec = vectorize_query(query)
    similarities = cosine_similarity(query_vec, X).flatten()
    # argsort is ascending: take the last top_k positions and reverse them so
    # the most similar review comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]
    top_reviews = df.iloc[top_indices]
    top_scores = similarities[top_indices]

    return [
        {
            'Score': row.Score,
            'Summary': row.Summary,
            'Review': row.Text,
            'Similarity': round(score, 3)
        }
        for row, score in zip(top_reviews.itertuples(), top_scores)
    ]


In [11]:
# Demo: fetch the reviews closest to a sample query and print them in rank order.
query = "great product with fast shipping"
matches = retrieve_similar_reviews(query)
for i, match in enumerate(matches, start=1):
    # One print call per match; sep="\n" puts each field on its own line.
    print(
        f"\nMatch #{i}",
        f"Similarity: {match['Similarity']}",
        f"Score: {match['Score']}",
        f"Summary: {match['Summary']}",
        f"Review: {match['Review']}",
        sep="\n",
    )



Match #1
Similarity: 0.4
Score: 5
Summary: Oyster Sauce
Review: Use frequently as we like to do Asian dishes at least once a week.  Love this product.  Fast shipping, as usual.  Would buy again.

Match #2
Similarity: 0.383
Score: 5
Summary: Healthy Stuff
Review: This stuff is great because it's low glycemic. Substitute this to sugar and you'll be doing your body a great favor.  This size is economical and shipping is fast, too.  I got mine very soon.

Match #3
Similarity: 0.34
Score: 5
Summary: delicious
Review: good products and fast shipping equals a happy me. a little pricey but you can hand pick a few good flavors...a few i cant find anywhere else so def worth the price

Match #4
Similarity: 0.317
Score: 5
Summary: Box-o-Chips
Review: These chips were a great deal.  Kettle was having a special deal for the case of chips.  They are hard to find in Vegas and are my favorite flavor. Shipping was very fast, sooner than expected.

Match #5
Similarity: 0.287
Score: 5
Summary: A Great Al