IMPORT LIBRARIES

In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load the spaCy English Model

In [None]:
# preprocess() reads only token.lemma_, token.is_alpha and token.is_stop, so the
# dependency parser and the named-entity recogniser are switched off: they would
# otherwise run on every review without their output ever being used.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

Load the Dataset

In [None]:
# Location of the raw reviews file and how many leading rows to work with.
REVIEWS_CSV = "Reviews.csv"
N_REVIEWS = 1000

# nrows stops the parser after N_REVIEWS rows instead of reading the whole file
# and throwing away everything past the first 1000 afterwards; malformed lines
# are still skipped. Rows without review text are dropped because the "Text"
# column is what the later cells clean and vectorise.
# Built in a single expression so re-running this cell always starts from the
# file rather than from an already-filtered frame.
df = pd.read_csv(REVIEWS_CSV, on_bad_lines="skip", nrows=N_REVIEWS).dropna(
    subset=["Text"]
)
print(df.head(5))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

Preprocess

In [None]:
def preprocess(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is lower-cased and run through the module-level ``nlp`` pipeline.
    Tokens that are stop words or are not purely alphabetic are discarded; the
    lemma of every remaining token is kept, in document order.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip stop words and anything containing digits or punctuation.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)
# Clean every review once up front; the result is the corpus that gets vectorised.
df["cleaned_text"] = df["Text"].map(preprocess)

Convert Reviews to TF-IDF

In [None]:
# Learn the vocabulary and IDF weights from the cleaned reviews and encode them
# in the same pass. Row i of X corresponds to the i-th row of df (by position).
cleaned_reviews = df["cleaned_text"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_reviews)

Process User Query

In [None]:
def process_query(query, top_k=5):
    """Return the reviews most similar to a free-text query.

    The query is cleaned with ``preprocess``, encoded with the already fitted
    ``vectorizer`` and compared against every row of ``X`` using cosine
    similarity.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int, default 5
        Maximum number of reviews to return.

    Returns
    -------
    pandas.DataFrame
        The ``Score`` and ``Text`` columns of the matching rows of ``df``,
        most similar first. Fewer than ``top_k`` rows (possibly none) are
        returned when ``top_k`` is not positive or when fewer reviews share
        any term with the cleaned query.
    """
    columns = ["Score", "Text"]

    # A slice such as ``[-0:]`` selects the whole array, so without this guard
    # top_k == 0 would return every review instead of none.
    if top_k <= 0:
        return df.iloc[:0][columns]

    cleaned_query = preprocess(query)
    query_vec = vectorizer.transform([cleaned_query])

    sim_scores = cosine_similarity(query_vec, X).flatten()

    # Sort on the negated scores with a stable sort: highest similarity first,
    # and tied reviews keep their original row order (deterministic output).
    ranked = (-sim_scores).argsort(kind="stable")[:top_k]

    # A similarity of zero means the review shares no term with the query;
    # such rows are not matches and would otherwise pad the result arbitrarily.
    top_indices = ranked[sim_scores[ranked] > 0]

    # X rows are positional with respect to df, hence iloc rather than loc.
    return df.iloc[top_indices][columns]

EXAMPLE USAGE

In [None]:
# Demo: look up the five reviews closest to a sample query.
sample_query = "great battery life and fast performance"
results = process_query(sample_query, top_k=5)
print(results)

     Score                                               Text
27       4  I was so glad Amazon carried these batteries. ...
849      5  I recently started using Bob's Red Mill Baking...
239      5  Fresh,a great way to get a little chocolate in...
492      5  A friend of mine brought these into work with ...
452      5  This was the best deal ever. The delivery was ...
