## Imports

In [5]:
import pandas as pd
from openai import OpenAI
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

## Setting up the Environment

In [6]:

# Load environment variables (such as OPENAI_API_KEY, read in the next cell)
# via python-dotenv. `load_dotenv()` is the last expression of the cell, so
# its return value is displayed as the cell output.
from dotenv import load_dotenv

load_dotenv()

True

In [7]:


# Shared OpenAI client used by the LLM helper functions defined later in the
# notebook; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


## Data Loading

In [8]:
# Load the cleaned review dataset; the path is relative to the working directory.
data = pd.read_csv("data_cleaned.csv")

In [9]:
# Preview the first rows to check the available columns.
data.head()


Unnamed: 0,review_id,product,category,rating,review_text,feature_mentioned,attribute_mentioned,date,sentiment,tagged_reviews
0,REV14165,TechPro X20,Smartphones,4,The TechPro X20 is amazing! facial recognition...,facial recognition,design,2023-03-09,positive,REV14165 The TechPro X20 is amazing! facial re...
1,REV81426,TechPro X20,Smartphones,4,The TechPro X20 is amazing! battery life works...,battery life,design,2023-10-28,positive,REV81426 The TechPro X20 is amazing! battery l...
2,REV54597,SmartWatch Pro,Wearables,4,The SmartWatch Pro is amazing! app integration...,app integration,durability,2023-03-09,positive,REV54597 The SmartWatch Pro is amazing! app in...
3,REV89131,SmartSpeaker 360,Smart Home,4,Very impressed with the SmartSpeaker 360. Grea...,app interface,connectivity,2022-12-19,positive,REV89131 Very impressed with the SmartSpeaker ...
4,REV92397,HomeConnect Hub,Smart Home,4,Very impressed with the HomeConnect Hub. Great...,device compatibility,design,2022-10-28,positive,REV92397 Very impressed with the HomeConnect H...


In [10]:
# Each `tagged_reviews` entry is the review ID followed by the review text.
data["tagged_reviews"]


Unnamed: 0,tagged_reviews
0,REV14165 The TechPro X20 is amazing! facial re...
1,REV81426 The TechPro X20 is amazing! battery l...
2,REV54597 The SmartWatch Pro is amazing! app in...
3,REV89131 Very impressed with the SmartSpeaker ...
4,REV92397 Very impressed with the HomeConnect H...
...,...
995,REV62665 Would not recommend this Wearables. T...
996,REV91048 Would not recommend this Laptops. The...
997,REV15997 Best Wearables I've ever owned. The R...
998,REV72406 Absolutely love my new SecurityCam Pr...


In [11]:
# Write the tagged reviews to a plain-text file, one review per row, with no
# index and no header, so the file can be read back by TextLoader.
# NOTE(review): only one column is written, so `sep` never separates fields;
# presumably it still influences which values the CSV writer wraps in quotes
# (retrieve_semantic_reviews strips '"' from page_content) - confirm.
data["tagged_reviews"].to_csv("tagged_reviews.txt",
                                   sep = "\n",
                                   index = False,
                                   header = False)

## Splitting the data into chunks

In [12]:
# Read the exported reviews back in. The file was written by pandas `to_csv`,
# whose default encoding is UTF-8, so decode it explicitly instead of relying
# on the platform default encoding.
raw_documents = TextLoader("tagged_reviews.txt", encoding="utf-8").load()

# Split on newlines so that every review line becomes its own chunk.
# chunk_size=1 is the smallest meaningful size: every line is longer than the
# limit, so no two lines are ever merged into one chunk (the splitter logs a
# "chunk longer than specified" warning per line, which is expected here).
# NOTE(review): the previous value, chunk_size=0, relied on the same effect but
# appears to be rejected by newer langchain-text-splitters releases - confirm.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)



In [13]:
# Inspect the first chunk; it should hold exactly one tagged review.
documents[0]


Document(metadata={'source': 'tagged_reviews.txt'}, page_content='REV14165 The TechPro X20 is amazing! facial recognition works perfectly and the design is outstanding. Would definitely recommend!')

## Adding the Chunks to a Vector Database and Generating Embeddings

In [14]:
# Embed every chunk with OpenAI embeddings and index the vectors in Chroma.
# NOTE(review): no persist_directory or collection name is passed, so the
# index is presumably in-memory; re-running this cell in the same kernel may
# add the same documents a second time - confirm before re-executing.
db_reviews = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(),
)

## Testing Retrieval

In [15]:
# Smoke test: fetch the 5 chunks most similar to a free-text query and
# display them as the cell output.
query = "WHich is the best product for music"
docs = db_reviews.similarity_search(query, k =5)
docs

[Document(id='20c2ca72-c59f-4a0a-b811-30db994eae80', metadata={'source': 'tagged_reviews.txt'}, page_content="REV42208 Best Audio I've ever owned. The AudiophileMax has exceeded my expectations in terms of portability."),
 Document(id='8d8bd2d6-ef84-42ae-bfe9-788dbe250778', metadata={'source': 'tagged_reviews.txt'}, page_content="REV15530 Best Audio I've ever owned. The AudiophileMax has exceeded my expectations in terms of ease of use."),
 Document(id='fe3d6601-feba-44c9-80ab-34a34e5da31d', metadata={'source': 'tagged_reviews.txt'}, page_content="REV77181 Best Audio I've ever owned. The AudiophileMax has exceeded my expectations in terms of portability."),
 Document(id='31488081-c5ba-4ee9-a5e9-e345e7621be6', metadata={'source': 'tagged_reviews.txt'}, page_content='REV19965 The AudiophileMax is amazing! sound quality works perfectly and the portability is outstanding. Would definitely recommend!'),
 Document(id='714ff39b-43f5-4df8-8fac-5131dd1b4217', metadata={'source': 'tagged_reviews

In [16]:
# The first whitespace-separated token of a chunk is its review ID; use it to
# look up the full row of the best hit in `data`.
data[data["review_id"] == docs[0].page_content.split()[0].strip()]


Unnamed: 0,review_id,product,category,rating,review_text,feature_mentioned,attribute_mentioned,date,sentiment,tagged_reviews
257,REV42208,AudiophileMax,Audio,4,Best Audio I've ever owned. The AudiophileMax ...,connectivity,portability,2023-08-07,positive,REV42208 Best Audio I've ever owned. The Audio...


## Putting it all together

In [19]:
def retrieve_semantic_reviews(
    query: str,
    df: pd.DataFrame,
    top_k: int = 10,
) -> pd.DataFrame:
    """Return the rows of ``df`` whose reviews are most similar to ``query``.

    The module-level ``db_reviews`` vector store is searched for chunks similar
    to ``query``. The first whitespace-separated token of each chunk is taken
    as the review ID and used to select the matching rows of ``df``.

    Args:
        query: Free-text search query.
        df: Review table containing a ``review_id`` column. Rows are looked up
            in this frame (not in a global), so a filtered frame can be passed.
        top_k: Maximum number of reviews to return. Values <= 0 yield an empty
            result without querying the vector store.

    Returns:
        A DataFrame with at most ``top_k`` rows, ordered from most to least
        similar, with ``review_id`` restored as a regular column. Hits whose
        ID does not occur in ``df`` are skipped instead of raising ``KeyError``,
        and an ID retrieved more than once is only looked up once.
    """
    if top_k <= 0:
        hits = []
    else:
        # Keep the original pool of 50 candidates, but never ask for fewer
        # than top_k so that large top_k values are not silently capped at 50.
        hits = db_reviews.similarity_search(query, k=max(top_k, 50))[:top_k]

    indexed = df.set_index("review_id")

    review_ids = []
    for hit in hits:
        # Chunks may carry a leading/trailing '"' from the text export.
        tokens = hit.page_content.strip('"').split()
        if not tokens:
            continue  # blank chunk: no ID to extract
        token = tokens[0]
        # Coerce purely numeric IDs to int so they match an integer ID column.
        try:
            review_id = int(token)
        except ValueError:
            review_id = token
        if review_id in indexed.index and review_id not in review_ids:
            review_ids.append(review_id)

    # Selecting by an ordered list of labels preserves the similarity ranking.
    return indexed.loc[review_ids].reset_index()


In [20]:
# Example call using the default top_k of 10.
retrieve_semantic_reviews("The highly recommended product",df=data)


Unnamed: 0,review_id,product,category,rating,review_text,feature_mentioned,attribute_mentioned,date,sentiment,tagged_reviews
0,REV89551,AudiophileMax,Audio,5,The AudiophileMax is amazing! microphone works...,microphone,ease of use,2023-06-15,positive,REV89551 The AudiophileMax is amazing! microph...
1,REV19965,AudiophileMax,Audio,5,The AudiophileMax is amazing! sound quality wo...,sound quality,portability,2022-12-17,positive,REV19965 The AudiophileMax is amazing! sound q...
2,REV13840,AudiophileMax,Audio,5,The AudiophileMax is amazing! battery life wor...,battery life,design,2022-08-13,positive,REV13840 The AudiophileMax is amazing! battery...
3,REV41385,AudiophileMax,Audio,4,The AudiophileMax is amazing! sound quality wo...,sound quality,ease of use,2022-11-11,positive,REV41385 The AudiophileMax is amazing! sound q...
4,REV55029,AudiophileMax,Audio,5,The AudiophileMax is amazing! sound quality wo...,sound quality,build quality,2022-02-24,positive,REV55029 The AudiophileMax is amazing! sound q...
5,REV10037,AudiophileMax,Audio,5,The AudiophileMax is amazing! noise cancellati...,noise cancellation,ease of use,2023-11-02,positive,REV10037 The AudiophileMax is amazing! noise c...
6,REV21053,ThermoControl Smart,Smart Home,5,The ThermoControl Smart is amazing! device com...,device compatibility,design,2022-12-18,positive,REV21053 The ThermoControl Smart is amazing! d...
7,REV72851,AudiophileMax,Audio,4,The AudiophileMax is amazing! microphone works...,microphone,comfort,2022-01-18,positive,REV72851 The AudiophileMax is amazing! microph...
8,REV24832,SecurityCam Pro,Smart Home,5,The SecurityCam Pro is amazing! setup process ...,setup process,design,2023-01-13,positive,REV24832 The SecurityCam Pro is amazing! setup...
9,REV12329,BassBoost Speaker,Audio,5,The BassBoost Speaker is amazing! sound qualit...,sound quality,design,2023-11-01,positive,REV12329 The BassBoost Speaker is amazing! sou...


## A Function to Generate a Category Summary Based on the Ratings and Reviews

In [None]:
def generate_category_summary(
    df,
    category,
    sample_size=10,
    random_state=None,
    model="gpt-3.5-turbo",
):
    """Generate an LLM-written summary for one product category.

    A random sample of the category's reviews is formatted into a prompt and
    sent to the chat-completions API through the module-level ``client``.

    Args:
        df: Review table with ``category``, ``rating`` and ``review_text`` columns.
        category: Category value to summarise (exact match on ``df['category']``).
        sample_size: Maximum number of reviews to include in the prompt
            (default 10, the previously hard-coded value).
        random_state: Seed forwarded to ``DataFrame.sample``. Pass an int to get
            the same sample - and therefore the same prompt - on every run.
            ``None`` keeps the previous unseeded behaviour.
        model: Chat model name to use for the completion.

    Returns:
        The summary text, a "No reviews found ..." message when the category
        has no rows, or an error message string if the API call fails.

    Raises:
        ValueError: If ``sample_size`` is smaller than 1.
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be >= 1, got {sample_size}")

    # 1. Filter data for the category
    category_data = df[df['category'] == category]

    # 1a. If there are no reviews, bail out before calling the API
    if category_data.empty:
        return f"No reviews found for category '{category}'."

    # 2. Sample up to `sample_size` reviews (never more than are available)
    sample_reviews = category_data.sample(
        n=min(sample_size, len(category_data)),
        random_state=random_state,
    )

    # 3. Build the prompt
    reviews_text = "\n\n".join(
        f"Rating: {row['rating']}, Review: {row['review_text']}"
        for _, row in sample_reviews.iterrows()
    )
    prompt = (
        f"Based on the following customer reviews for {category} products:\n\n"
        f"{reviews_text}\n\n"
        "Please provide:\n"
        "1. A concise summary of overall product performance\n"
        "2. Top praised features\n"
        "3. Common issues mentioned\n"
        "4. Suggestions for improvement\n"
    )

    # 4. Call the LLM. Failures are reported as a message rather than raised,
    #    so a demo notebook keeps running when the API is unavailable.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error generating summary: {e}")
        return f"Error generating summary for {category}: {e}"


In [None]:
# Example: summarise the "Audio" category. The reviews are sampled at random,
# so the generated text can differ between runs.
generate_category_summary(data,"Audio")

'1. Overall, the product performance of the Audio lineup is generally positive, with a majority of customers expressing satisfaction with their purchases.\n\n2. Top praised features include exceptional sound quality, ease of use, connectivity, build quality, noise cancellation capabilities, and microphone functionality.\n\n3. Common issues mentioned by customers include inconsistent sound quality, subpar design, and poor ease of use in some models.\n\n4. Suggestions for improvement could include addressing the issues related to sound quality, design flaws, and ease of use, as well as potentially enhancing noise cancellation capabilities in certain models.'

## A function to answer Product questions based on the dataset

In [17]:
from typing import Callable

def product_qa(
    question: str,
    df: pd.DataFrame,
    retrieve_fn: Callable[[str], pd.DataFrame],
    top_k: int = 5,
) -> str:
    """Answer a product question using only the most relevant reviews.

    Args:
        question: The user's question; also passed to ``retrieve_fn``.
        df: Review table. Accepted as part of the signature but not read by
            this function - retrieval is delegated entirely to ``retrieve_fn``.
        retrieve_fn: Callable that maps the question to a DataFrame of
            candidate reviews with ``product``, ``rating`` and ``review_text``
            columns, ordered from most to least relevant.
        top_k: Number of leading candidate rows to include in the prompt.

    Returns:
        The model's answer, a fixed message when no reviews were retrieved,
        or an apology string containing the error if the API call fails.
    """
    candidates = retrieve_fn(question)
    if candidates.empty:
        return "I couldn't find any reviews relevant to your question."

    # Format the first top_k candidates, keeping the retriever's ordering.
    snippets = []
    for _, review in candidates.head(top_k).iterrows():
        snippets.append(
            f"Product: {review['product']}, Rating: {review['rating']}, Review: {review['review_text']}"
        )
    reviews_text = "\n\n".join(snippets)

    # The system message grounds the model in the supplied reviews; the user
    # message carries the reviews followed by the question.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a precise product review assistant. "
                "Answer the user's question strictly based on the provided reviews."
            ),
        },
        {
            "role": "user",
            "content": (
                f"Here are some customer reviews about products:\n\n{reviews_text}\n\n"
                f"Question: {question}\n\nAnswer:"
            ),
        },
    ]

    # Any API failure is reported back as text instead of being raised.
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        return completion.choices[0].message.content
    except Exception as err:
        print(f"Error in Q&A system: {err}")
        return f"Sorry, I couldn't answer your question due to an error: {err}"

In [21]:
# Example: answer a question from the two most relevant reviews.
answer = product_qa(
    question="Which product is the best for music",
    df=data,
    # Bind the retriever to the `data` DataFrame and limit it to top_k=2,
    # because product_qa calls retrieve_fn with the question only.
    retrieve_fn=lambda q: retrieve_semantic_reviews(q, data, top_k=2),
    top_k=2,
)

print( answer)


Based on the provided reviews, the AudiophileMax is the best product for music as it offers excellent audio quality and exceeds customers' expectations in terms of portability and ease of use.
