In [1]:
!pip install pandas numpy nltk scikit-learn



In [2]:
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt', download_dir='/usr/local/nltk_data')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/local/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
df=pd.read_csv('Reviews.csv')
reviews=df['Text']
reviews=reviews.dropna()
reviews=reviews[:10000]
print(reviews.head())

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: Text, dtype: object


In [6]:
stop_words=set(stopwords.words('english'))

In [21]:
import re
def preprocess_text(text):
  text=text.lower()
  text=re.sub(r'[^a-zA-Z0-9\s]','',text)
  #tokens=word_tokenize(text)
  tokens=text.split()
  tokens=[word for word in tokens if word not in stop_words]
  return " ".join(tokens)

In [22]:
cleaned_reviews=reviews.apply(preprocess_text)
print(cleaned_reviews.head())

0    bought several vitality canned dog food produc...
1    product arrived labeled jumbo salted peanutsth...
2    confection around centuries light pillowy citr...
3    looking secret ingredient robitussin believe f...
4    great taffy great price wide assortment yummy ...
Name: Text, dtype: object


In [24]:
vectorizer=TfidfVectorizer()
tfidf_matrix=vectorizer.fit_transform(cleaned_reviews)
print(tfidf_matrix.shape)

(10000, 24715)


In [29]:
def retrieve_similar_reviews(query,top_k=5):
  cleaned_query = preprocess_text(query)
  query_vector=vectorizer.transform([cleaned_query])
  similarity_scores = cosine_similarity(query_vector,tfidf_matrix).flatten()
  top_indices=similarity_scores.argsort()[-top_k:][::-1]
  results=pd.DataFrame({'Original Review': reviews.iloc[top_indices].values,
        'Cleaned Review': cleaned_reviews.iloc[top_indices].values,
        'Similarity Score': similarity_scores[top_indices]
  })
  return results

In [30]:
query1 = "great product with fast shipping"
print("\nTop Matches for Query 1:")
print(retrieve_similar_reviews(query1))
query2 = "disappointed"
print("\nTop Matches for Query 2:")
print(retrieve_similar_reviews(query2))



Top Matches for Query 1:
                                     Original Review  ... Similarity Score
0  Enjoyed the product and they also provided ver...  ...         0.502383
1  The tea is good and fresh. We enjoy it. The sh...  ...         0.443917
2  My daughter lives in Hawaii and sent me some g...  ...         0.405396
3  The energy drink is a great product. The shipp...  ...         0.403060
4  Fast shipping, items were packaged nicely and ...  ...         0.390348

[5 rows x 3 columns]

Top Matches for Query 2:
                                     Original Review  ... Similarity Score
0  I am a bit disappointed.  The flavor was not w...  ...         0.484993
1  Disappointed.  The big boxes had a very differ...  ...         0.320797
2  Just plain nasty!!! This item tasted like card...  ...         0.311343
3  The product is very good. Way too expensive an...  ...         0.310343
4  this stuff really works, i love it and cant ge...  ...         0.308053

[5 rows x 3 columns]
