In [None]:
%pwd

In [None]:
# NOTE(review): machine-specific absolute path. Cells that use relative paths
# (e.g. "data/flipkart_product_review.csv") resolve against this directory.
%cd /Users/mukulagarwal/Desktop/Projects/customer_support_system

In [None]:
import pandas as pd

data = pd.read_csv("data/flipkart_product_review.csv")

In [None]:
data.head()

In [None]:
data['review']

In [None]:
data.columns

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# options = Options()
# options.add_argument('--headless')  # run in background
# driver = webdriver.Chrome(options=options)

def extract_product_name(url):
    """
    Open a page in headless Chrome and return the text of its product link.

    Args:
        url (str): Address of the page to load. The page is expected to contain
            an ``a[data-hook="product-link"]`` anchor.

    Returns:
        str: Text of the first anchor matching that selector.

    Raises:
        RuntimeError: If no matching anchor is present on the loaded page.
    """
    # Function-scope import keeps this cell self-contained; selenium is already
    # imported by this notebook.
    from selenium.common.exceptions import NoSuchElementException

    options = Options()
    options.add_argument('--headless')  # run in background
    driver = webdriver.Chrome(options=options)

    try:
        # Navigation happens inside the try so the browser is always closed,
        # even when loading the page itself fails.
        driver.get(url)
        # Amazon may take a moment to load the DOM; no explicit wait is done here.
        product_link = driver.find_element(By.CSS_SELECTOR, 'a[data-hook="product-link"]')
        return product_link.text
    except NoSuchElementException as exc:
        # Raise a real exception type: raising a plain string is itself a TypeError.
        raise RuntimeError('NoTextElementFound') from exc
    finally:
        driver.quit()

In [None]:
from datasets import load_dataset

data = load_dataset("XANJEEV/amazon-product-reviews",split='train')

In [None]:
%pwd

In [None]:
import pandas as pd
# Relative path: resolved against the project directory selected by the %cd
# cell, like the other relative read_csv/to_csv paths in this notebook.
data = pd.read_csv("train.csv")

In [None]:
data

In [None]:
# Try the scraper on the first row. Column position 2 is presumably the review
# URL column — TODO confirm against the column order of train.csv.
extract_product_name(data.iloc[0,2])

In [None]:
from urllib.parse import urlparse, parse_qs

def extract_asin_from_review_url(url):
    """
    Read the ASIN (product_id) from the query string of an Amazon review URL.

    Only the exact, upper-case ``ASIN`` query parameter is consulted; when it
    occurs more than once, the first value wins.

    Args:
        url (str): Amazon review URL

    Returns:
        str: ASIN, or None if the parameter is absent or the URL cannot be parsed
    """
    try:
        # Query string portion of the URL (everything after '?')
        query_string = urlparse(url).query

        # parse_qs maps each parameter name to the list of its values
        asin_values = parse_qs(query_string).get('ASIN')

        return asin_values[0] if asin_values else None

    except Exception as err:
        print(f"Error parsing URL: {err}")
        return None

def get_product_id(review_url):
    """
    Convenience alias that returns the product_id (ASIN) found in a review URL.

    Args:
        review_url (str): Amazon review URL

    Returns:
        str: Product ID (ASIN), exactly as returned by extract_asin_from_review_url
    """
    product_id = extract_asin_from_review_url(review_url)
    return product_id

# Test function
def test_extraction():
    """Print the ASIN extracted from each of three hard-coded sample review URLs."""

    separator = "-" * 50
    sample_urls = (
        "https://www.amazon.com/gp/customer-reviews/R32JEDU4MUFXFU/ref=cm_cr_arp_d_rvw_ttl?ie=UTF8&ASIN=B0CHX7R6WJ",
        "https://www.amazon.com/gp/customer-reviews/R12345ABCDEF/ref=cm_cr_arp_d_rvw_ttl?ie=UTF8&ASIN=B08N5WRWNW",
        "https://www.amazon.com/gp/customer-reviews/RABCDEF123/ref=cm_cr_arp_d_rvw_ttl?ASIN=B0CHX7R6WJ&ie=UTF8",
    )

    print("Testing ASIN extraction:")
    print(separator)

    for sample in sample_urls:
        # Extract first, then report, so any parser message precedes the summary lines
        extracted = extract_asin_from_review_url(sample)
        print(f"URL: {sample[:60]}...")
        print(f"ASIN: {extracted}")
        print(separator)

def extract_asin_from_dataframe(df, url_column='url', output_column='product_id'):
    """
    Extract ASIN (product_id) from URLs in a pandas DataFrame

    Args:
        df (pandas.DataFrame): DataFrame containing URLs
        url_column (str): Name of column containing URLs (default: 'url')
        output_column (str): Name of new column for product_id (default: 'product_id')

    Returns:
        pandas.DataFrame: Copy of df with the product_id column added; the
        input DataFrame is not modified

    Raises:
        KeyError: If url_column is not a column of df
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    # (No pandas import is needed here: only methods of df itself are used.)
    df_copy = df.copy()

    # One ASIN (or None) per row, taken from that row's URL
    df_copy[output_column] = df_copy[url_column].apply(extract_asin_from_review_url)

    return df_copy

def batch_extract_asin(urls_list):
    """
    Map every review URL in a list to its ASIN

    Args:
        urls_list (list): List of Amazon review URLs

    Returns:
        list: ASINs in the same order as the input URLs (None where no ASIN was found)
    """
    return list(map(extract_asin_from_review_url, urls_list))

In [None]:
import pandas as pd

df_with_product_id = extract_asin_from_dataframe(data, url_column='review_url', output_column='product_id')
df_with_product_id

In [None]:
df_with_product_id['product_id'].value_counts()

In [None]:
B0CQ3VXJ3J : "SAMSUNG 16 Galaxy Book4 Pro Laptop PC Computer, Intel Core 7 Ultra Processor 1TB, 3K AMOLED (2880 x 1800) Touchscreen, Advanced Security, 2024 Model, NP960XGK-KG1US, Moonstone Gray"
B0B4MWCFV4 : "Fitbit Versa 4 Fitness Smartwatch with Daily Readiness, GPS, 24/7 Heart Rate, 40+ Exercise Modes, Sleep Tracking and more, Black/Graphite, One Size (S & L Bands Included)"
B0DLHTTWVB : "Apple 2024 MacBook Air 13-inch Laptop with M3 chip: Built for Apple Intelligence, 13.6-inch Liquid Retina Display, 16GB Unified Memory, 256GB SSD Storage, Backlit Keyboard, Touch ID; Space Gray"
B0C33XXS56 : "Sony WF-1000XM5 The Best Truly Wireless Bluetooth Noise Canceling Earbuds & in-Ear Headphones with Alexa Built-in, Black"
B0CHX7R6WJ : "Apple Watch SE (2nd Gen) [GPS 40mm] Smartwatch with Starlight Aluminum Case with Starlight Sport Band S/M. Fitness & Sleep Tracker, Crash Detection, Heart Rate Monitor"
B0BN95FRW9 : "Apple iPhone 14 Pro, 128GB, Space Black - Unlocked (Renewed)"
B01D93Z89W : "Canon EOS Rebel T7 DSLR Camera Bundle w/ Canon EF-S 18-55mm f/3.5-5.6 is II Lens + 2pc SanDisk 64GB Memory Cards, Wide Angle Lens, Telephoto Lens, 3pc Filter Kit + Accessory Kit"

In [None]:
# Short display names for the ASINs found in the review URLs
asin_to_product = dict([
    ('B0CQ3VXJ3J', 'Samsung Galaxy Book4 Pro 16-inch Laptop'),
    ('B0B4MWCFV4', 'Fitbit Versa 4 Fitness Smartwatch'),
    ('B0DLHTTWVB', 'Apple MacBook Air 13-inch M3 (2024)'),
    ('B0C33XXS56', 'Sony WF-1000XM5 Wireless Noise Canceling Earbuds'),
    ('B0CHX7R6WJ', 'Apple Watch SE 2nd Gen 40mm GPS'),
    ('B0BN95FRW9', 'Apple iPhone 14 Pro 128GB (Renewed)'),
    ('B01D93Z89W', 'Canon EOS Rebel T7 DSLR Camera Bundle'),
])

# Rows whose product_id is not a key of the mapping get NaN as product_name
df_with_product_id['product_name'] = df_with_product_id['product_id'].map(asin_to_product)
df_with_product_id

In [None]:
# index=False keeps the row index out of the file, so reading it back does not
# produce a spurious "Unnamed: 0" column.
df_with_product_id.to_csv("additional_train.csv", index=False)

In [None]:
import pandas as pd
df = pd.read_csv("additional_train.csv")

In [None]:
df = df[['product_id','product_name','rating','title','review']]
df.sample(frac=1).head()

In [None]:
df = df.rename(columns={'product_name':'product_title','title':'summary'})
df.head()

In [None]:
# Relative path: resolved against the project directory selected by the %cd cell
dff = pd.read_csv("data/flipkart_product_review.csv")
# ignore_index gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of both inputs
merge_df = pd.concat([df, dff], ignore_index=True)
merge_df.sample(frac=1).head()

In [None]:
len(merge_df)

In [None]:
# Write the merged reviews, without the index column, to the current working directory.
# NOTE(review): the loader cell reads data/flipkart_product_review.csv under the
# project directory, not this file — confirm how the merged file gets into data/.
merge_df.to_csv("flipkart_product_review.csv",index=False)

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
import random

# Relative path: resolved against the project directory selected by the %cd cell
file_path = "data/flipkart_product_review.csv"

# 'review' is loaded as the document content; title, rating and summary are
# requested as metadata, and product_id is used as the source column.
loader = CSVLoader(file_path=file_path,
                   content_columns=['review'],
                   metadata_columns=['product_title','rating','summary'],
                   source_column='product_id')
docs = loader.load()

# Seeded shuffle so the documents sampled by later cells are the same on every run
random.Random(42).shuffle(docs)

for record in docs[:2]:
    print(record)

In [None]:
# Preview a few documents instead of dumping the whole list into the cell output
docs[:5]

In [None]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [None]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [None]:
from utils.model_loader import ModelLoader
from langchain.prompts import ChatPromptTemplate

model_loader = ModelLoader()
question_generation_llm = model_loader.load_llm()

bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [None]:
# Prompt that asks for one context-specific question per review document.
# The {format_instructions} slot is required: without it the value passed to
# format_messages() has nowhere to be rendered, so the model never sees the
# output format that question_output_parser expects.
qa_template = """\
You are a University Professor creating a test for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.

question: a question about the context.

Format the output as JSON with the following keys:
question

{format_instructions}

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

# Render the prompt for a single document as a trial run before the batch loop
messages = prompt_template.format_messages(
    context=docs[0],
    format_instructions=format_instructions
)

# bare_template only has a {content} slot; the rendered messages are passed through it
question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [None]:
for k, v in output_dict.items():
  print(k)
  print(v)

In [None]:
from tqdm import tqdm

# One dict per document: the generated "question" plus the source "context"
qac_triples = []
skipped_questions = 0

for text in tqdm(docs[:30]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    # Still best-effort (the document is skipped), but the failure is reported
    # instead of being dropped silently.
    skipped_questions += 1
    tqdm.write(f"Skipping document: could not parse question ({e})")
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

print(f"Generated {len(qac_triples)} questions; skipped {skipped_questions} unparseable responses")

In [None]:
qac_triples[5]

In [None]:
answer_generation_llm = model_loader.load_llm()

# Single-field schema: the parser expects one "answer" key in the model output
answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

# The {format_instructions} slot is required: without it the value passed to
# format_messages() has nowhere to be rendered, so the model never sees the
# output format that answer_output_parser expects.
qa_template = """\
You are a Electronic Devices Salesperson and Expert - answering questions asked by customers to help them take a buy decision. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

{format_instructions}

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

# Render the prompt for the first question/context pair as a trial run
messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

# bare_template only has a {content} slot; the rendered messages are passed through it
answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [None]:
for k, v in output_dict.items():
  print(k)
  print(v)

In [None]:
# Add an "answer" to every question/context pair collected so far
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    # Still best-effort, but reported: the triple stays in qac_triples without
    # an "answer" key when the response cannot be parsed.
    tqdm.write(f"No answer recorded for question {triple['question']!r}: {e}")
    continue
  triple["answer"] = output_dict["answer"]

In [None]:
import pandas as pd
from datasets import Dataset

# Tabulate the triples: keep only each context's page_content as text and
# expose the generated answer under the name "ground_truth".
ground_truth_qac_set = (
    pd.DataFrame(qac_triples)
    .assign(context=lambda frame: frame["context"].map(lambda doc: str(doc.page_content)))
    .rename(columns={"answer" : "ground_truth"})
)

eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [None]:
ground_truth_qac_set

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from retriever.retrieval import Retriever
from utils.model_loader import ModelLoader
from prompt_library.prompt import PROMPT_TEMPLATES
from dotenv import load_dotenv

load_dotenv()

retriever_obj = Retriever()

model_loader = ModelLoader()

def invoke_chain(query:str):
    """
    Run the product-bot chain for a query and also return the retrieved context.

    Args:
        query (str): Question passed both to the chain and to the retriever.

    Returns:
        tuple: (output, context), where output is the chain's string result and
        context is whatever retriever_obj.call_retriever(query) returns.
    """
    doc_retriever = retriever_obj.load_retriever()
    bot_prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATES["product_bot"])
    chat_model = model_loader.load_llm()

    # The query feeds the retriever for "context" and is passed through
    # unchanged as "question".
    chain_inputs = {"context": doc_retriever, "question": RunnablePassthrough()}
    rag_chain = chain_inputs | bot_prompt | chat_model | StrOutputParser()

    # NOTE(review): retrieval happens here and again inside the chain; presumably
    # both return the same documents — confirm against Retriever.
    retrieved_context = retriever_obj.call_retriever(query)

    answer_text = rag_chain.invoke(query)

    return answer_text, retrieved_context

In [None]:
eval_dataset['question'][1]

In [None]:
a,b = invoke_chain(eval_dataset['question'][0])
b

In [None]:
a

In [None]:
[cnt.page_content for cnt in b]

In [None]:
def create_ragas_dataset(eval_dataset):
  """
  Run invoke_chain on every question and collect the results as a Dataset.

  Args:
      eval_dataset: Iterable of rows that provide "question" and "ground_truth".

  Returns:
      Dataset with the columns question, answer, contexts and ground_truths.
  """
  def _to_record(row):
    # One chain call per row; contexts are reduced to their page_content text
    answer, context = invoke_chain(row["question"])
    return {
        "question" : row["question"],
        "answer" : answer,
        "contexts" : [cnt.page_content for cnt in context],
        "ground_truths" : [row["ground_truth"]],
    }

  records = [_to_record(row) for row in tqdm(eval_dataset)]
  return Dataset.from_pandas(pd.DataFrame(records))

In [None]:
from tqdm import tqdm
import pandas as pd

basic_qa_ragas_dataset = create_ragas_dataset(eval_dataset)

In [None]:
rag_test_data = pd.DataFrame(basic_qa_ragas_dataset)

In [None]:
rag_test_data.to_csv("rag_test_data.csv",index=False)

In [32]:
import pandas as pd
# Read back the file written by to_csv("rag_test_data.csv"). The path is relative
# so it resolves against the working directory instead of one user's home folder.
rag_test = pd.read_csv("rag_test_data.csv")
rag_test

Unnamed: 0,question,answer,contexts,ground_truths
0,What specific features of the BoAt BassHeads10...,There seems to be some confusion. The provided...,['nice product.. color is exactly shown in th...,"[""The reviewer appreciates the sound quality a..."
1,What is the approximate standby time of the On...,"According to one of the reviews, the standby t...","[""Looking nice product...I got just 1day deliv...",['The approximate standby time of the OnePlus ...
2,What specific feature of the U&I Titanic Serie...,"Unfortunately, I couldn't find any information...","['Awesome sound and bass. Love this product.',...",['The specific feature of the U&I Titanic Seri...
3,How did the reviewer initially feel about the ...,I couldn't find any information related to the...,"[""Don't hesitate to buy that green color buds!...",['The reviewer initially felt frustrated with ...
4,What method of exporting pictures from the Can...,The provided documents do not mention the Cano...,"[""I'm loving this bluetooth ☺️😍"", 'NICE WONDER...",['Using the memory card and an adapter for the...
5,What steps did the customer take to verify tha...,"Based on the provided reviews, I couldn't find...",['i purchased this headphones about 1 yr ago 1...,"[""The customer took several steps to verify th..."
6,What specific feature of the BoAt Airdopes131 ...,The specific feature that led to strong dissat...,"['Worst product ever bought. Instead, you can ...","[""The specific feature that led to the reviewe..."
7,What specific feature of the U&I Titanic Serie...,"Based on the reviews, the specific feature tha...","[""I am very happy with this product. The U&I T...",['The reviewer considers the battery backup of...
8,What specific feature of the OnePlus Bullets W...,The reviewer is particularly fond of the **cry...,"[""Looking nice product...I got just 1day deliv...",['The reviewer is particularly fond of the blu...
9,What specific health and fitness tracking feat...,The provided documents do not mention the Appl...,"['Good battery back up, sound should be improv...",['The reviewer misses the real-time heart rate...


In [33]:
rag_test = rag_test.rename(columns={
    'question' : 'user_input',
    'contexts' : 'retrieved_contexts',
    'answer' : 'response',
    'ground_truths' : 'reference'
})

rag_test.head()

Unnamed: 0,user_input,response,retrieved_contexts,reference
0,What specific features of the BoAt BassHeads10...,There seems to be some confusion. The provided...,['nice product.. color is exactly shown in th...,"[""The reviewer appreciates the sound quality a..."
1,What is the approximate standby time of the On...,"According to one of the reviews, the standby t...","[""Looking nice product...I got just 1day deliv...",['The approximate standby time of the OnePlus ...
2,What specific feature of the U&I Titanic Serie...,"Unfortunately, I couldn't find any information...","['Awesome sound and bass. Love this product.',...",['The specific feature of the U&I Titanic Seri...
3,How did the reviewer initially feel about the ...,I couldn't find any information related to the...,"[""Don't hesitate to buy that green color buds!...",['The reviewer initially felt frustrated with ...
4,What method of exporting pictures from the Can...,The provided documents do not mention the Cano...,"[""I'm loving this bluetooth ☺️😍"", 'NICE WONDER...",['Using the memory card and an adapter for the...


In [34]:
import ast 
# After the CSV round-trip the list columns are strings holding Python list
# literals; literal_eval turns them back into lists.
# NOTE(review): not re-runnable — on a second run the cells already hold parsed
# values, which literal_eval rejects.
rag_test['retrieved_contexts'] = rag_test['retrieved_contexts'].apply(ast.literal_eval)
# Keep only the first entry of each parsed list as the single reference answer
rag_test['reference'] = rag_test['reference'].apply(lambda x : ast.literal_eval(x)[0])

In [35]:
rag_test.iloc[0,3]

"The reviewer appreciates the sound quality and bass of the BoAt BassHeads100 Wired Headset, considering it to be good for the cost. However, they gave a rating of 3 out of 5 and a 'Fair' summary, implying that there might be some aspects that did not meet their expectations."

In [36]:
from ragas import EvaluationDataset
evaluation_dataset = EvaluationDataset.from_pandas(rag_test)

In [37]:
evaluation_dataset

EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=30)

In [38]:
from utils.model_loader import ModelLoader

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, LLMContextPrecisionWithReference
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

model_loader = ModelLoader()

evaluator_llm = model_loader.load_llm()
evaluator_llm = LangchainLLMWrapper(evaluator_llm)

def evaluate_ragas_dataset(ragas_dataset):
  """
  Run ragas evaluate() on a dataset with the notebook's four metrics.

  Args:
      ragas_dataset: Dataset handed to ragas.evaluate unchanged.

  Returns:
      The result object returned by ragas.evaluate.
  """
  # Fresh metric instances on every call; all of them are scored with evaluator_llm
  selected_metrics = [
    LLMContextRecall(),
    Faithfulness(),
    FactualCorrectness(),
    LLMContextPrecisionWithReference(),
  ]
  return evaluate(ragas_dataset, metrics=selected_metrics, llm=evaluator_llm)

LLM loading...


In [None]:
evaluate_ragas_dataset(evaluation_dataset)

Evaluating:  21%|██        | 25/120 [01:55<07:03,  4.46s/it]